In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# Read the training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='Doceree-HCP_Train.csv')

In [3]:
# Drop the 'IS_HCP' column (pandas column labels are case-sensitive).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of a KeyError.
df = df.drop(columns=['IS_HCP'], errors='ignore')

In [4]:
# Separate the target column (y) from the remaining feature columns (X).
# drop() returns a new frame, so `df` itself is left untouched.
y = df['TAXONOMY']
X = df.drop(columns='TAXONOMY')

In [5]:
# Integer-encode every object-typed column of X.
# NOTE: one LabelEncoder instance is refitted for each column, so after the
# loop it only remembers the classes of the last column processed.
label_encoder = LabelEncoder()
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    column_as_text = X[col].astype(str)  # NaN becomes the string 'nan' and gets its own code
    X[col] = label_encoder.fit_transform(column_as_text)

In [6]:
# Missing targets are kept as their own 'Unknown' class
y = y.fillna('Unknown')

# Missing feature values are replaced by the per-column mean; the fitted
# imputer returns a bare array, so the column labels are re-attached.
imputer = SimpleImputer(strategy='mean')
feature_names = X.columns
X = pd.DataFrame(imputer.fit_transform(X), columns=feature_names)

In [7]:
# Perform feature selection: keep the k columns with the highest ANOVA
# F-score (f_classif) against the target.
# k is capped at the number of available columns because SelectKBest
# rejects (older scikit-learn) or warns about (newer) k > n_features.
k = min(10, X.shape[1])  # Number of features to select
feature_selector = SelectKBest(score_func=f_classif, k=k)
# NOTE(review): the recorded output below (`f = msb / msw`) looks like the tail
# of a divide warning from f_classif, which usually points at constant
# feature columns — TODO confirm and consider dropping them beforehand.
X_selected = feature_selector.fit_transform(X, y)

  f = msb / msw


In [8]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows those are
split_parts = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Initialize the classifiers.
# random_state is pinned so the forest, and therefore the printed accuracy,
# is reproducible across kernel restarts (same seed as the train/test split).
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Train each classifier on the training split and report its accuracy on the
# hold-out split. After the loop, `clf` stays bound to the last fitted model,
# which the following cells reuse for saving and prediction.
for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy:", accuracy)

Random Forest Accuracy: 0.9144286466561348


In [10]:
import joblib

# `clf` was already fitted on (X_train, y_train) by the evaluation loop. It is
# deliberately NOT refitted here: refitting a randomized estimator can build a
# different forest, so the persisted model would no longer be the one whose
# accuracy was reported.

# Save the model. joblib.dump writes a joblib (pickle-based) file, not HDF5,
# despite the '.h5' extension, so it must be read back with joblib.load.
joblib.dump(clf, 'taxonomy.h5')

['taxonomy.h5']

In [11]:
# Read the unlabeled test CSV (decoded as latin1) into a DataFrame
test_data = pd.read_csv(
    'Doceree-HCP-Test.csv',
    encoding='latin1',
)

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the test set's categorical columns with the SAME category -> code
# mapping the model was trained on. Fitting a fresh encoder on the test data
# would assign codes from the test set's own sorted categories, so the same
# string could get a different integer than it had during training.
#
# `df` still holds the raw (un-encoded) training strings, so the mapping is
# rebuilt from it. Both encoders number the sorted unique string values of a
# column, so these codes match the ones produced for X during training.
train_features = df.drop(columns='TAXONOMY')
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Categories that never occurred in training have no learned code; they are
# mapped to -1 instead of raising at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(train_features[categorical_cols].astype(str))
test_data[categorical_cols] = ordinal_encoder.transform(test_data[categorical_cols].astype(str))


In [None]:
# Apply the transformers fitted on the training data (no refitting here):
# first fill missing values, then keep only the selected feature columns.
test_columns = test_data.columns
test_data = pd.DataFrame(imputer.transform(test_data), columns=test_columns)
test_data_selected = feature_selector.transform(test_data)

In [None]:
# Make predictions on the test dataset using the trained model
# Predict a TAXONOMY label for every row of the prepared test features
predictions = clf.predict(X=test_data_selected)

In [None]:
# Save the predicted target variable 'Taxonomy' to a CSV file
test_data['TAXONOMY'] = predictions
test_data.to_csv('predicted_TAXONOMY.csv', index=False)