In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [8]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report


In [None]:
# `od` must be bound before use; without this import the cell raises NameError
# on a fresh kernel.
import opendatasets as od

# Downloads and extracts the competition archive; prompts for Kaggle
# credentials interactively when none are configured.
od.download('https://www.kaggle.com/competitions/nexus-by-djs-nsdc-ultraceuticals/data?select=train.csv')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: luvvswami
Your Kaggle Key: ··········
Downloading nexus-by-djs-nsdc-ultraceuticals.zip to ./nexus-by-djs-nsdc-ultraceuticals


100%|██████████| 149M/149M [00:02<00:00, 68.6MB/s]



Extracting archive ./nexus-by-djs-nsdc-ultraceuticals/nexus-by-djs-nsdc-ultraceuticals.zip to ./nexus-by-djs-nsdc-ultraceuticals


In [9]:
# Read the training split from the directory the competition archive is
# extracted into (the same directory the test split is read from), rather than
# from a separately uploaded copy at /content/train.csv.
# NOTE(review): assumes the archive contains train.csv at its top level - confirm.
df = pd.read_csv('/content/nexus-by-djs-nsdc-ultraceuticals/train.csv')

In [10]:
# Columns fed to the encoder as model features.
input_cols = [
    'DRUGTYPE',
    'Drug_high_status',
    'DRUGNAME',
    'Disease_of_highest_status',
    'Drug_Status',
    'GENENAME',
    'BIOCLASS',
    'SEQUENCE',
    'Disease',
]

# Column(s) the models are trained to predict; kept as a list so that
# df[target_cols] yields a DataFrame.
target_cols = ['Target_Status']

In [11]:
# One-hot encode every input column. handle_unknown='ignore' makes categories
# that were not seen during fitting encode as all-zero columns at transform
# time instead of raising.
# NOTE(review): the encoder is fitted on every row of df, including the rows
# that are later held out for evaluation.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cols = encoder.fit_transform(df[input_cols])

In [12]:
# Hold out 20% of the encoded rows for evaluation; the fixed seed keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    encoded_cols,
    df[target_cols],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Base learner for the boosted ensemble: a decision tree capped at depth 30.
base_model = DecisionTreeClassifier(max_depth=30)

# Boost up to 1500 copies of the base tree with the discrete SAMME algorithm.
adaboost_model = AdaBoostClassifier(
    estimator=base_model,
    n_estimators=1500,
    algorithm='SAMME',
    random_state=42,
)

# y_train is a one-column DataFrame; flatten it to the 1-D array the
# classifier expects. fit() returns the fitted estimator, so predict() can be
# chained directly onto it.
y_pred = adaboost_model.fit(x_train, y_train.values.ravel()).predict(x_test)

In [14]:
# Hold-out metrics for the AdaBoost predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))

# For single-label multiclass targets, micro-averaged recall, precision and F1
# all coincide with accuracy; the per-class breakdown is in the report below.
for metric_label, metric_fn in (
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1_Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, average='micro'))

print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9887723994349022
Recall: 0.9887723994349022
Precision: 0.9887723994349022
F1_Score: 0.9887723994349022

Classification Report:
                                  precision    recall  f1-score   support

          Application submitted       1.00      0.92      0.96        13
                       Approved       0.99      0.99      0.99      6168
         Approved (orphan drug)       1.00      1.00      1.00        32
                  BLA submitted       1.00      1.00      1.00         1
                 Clinical trial       0.97      0.96      0.96        70
        Discontinued in Phase 1       1.00      1.00      1.00       475
      Discontinued in Phase 1/2       1.00      0.92      0.96        24
        Discontinued in Phase 2       1.00      1.00      1.00      1177
      Discontinued in Phase 2/3       1.00      0.82      0.90        11
       Discontinued in Phase 2b       1.00      1.00      1.00         3
        Discontinued in Phase 3       1.00      1.00    

In [16]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over single-tree hyperparameters
# (2 * 4 * 3 * 3 = 72 candidates), each evaluated with 5-fold cross-validation
# on the training split.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the tree: features are permuted at every split, so without a fixed
# random_state ties between equally good splits (and therefore the selected
# hyperparameters) can differ from one run to the next.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)

# y_train is a one-column DataFrame; flatten it to 1-D so the target has the
# same shape as in the other fit calls of this notebook.
grid_search.fit(x_train, y_train.values.ravel())

print("Best hyperparameters:", grid_search.best_params_)

# With the default refit=True, best_estimator_ has been retrained on all of
# x_train using the winning parameter combination.
best_model = grid_search.best_estimator_



Best hyperparameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [17]:
# Score the estimator selected by the grid search on the held-out split.
y_pred_best = best_model.predict(x_test)

best_f1_micro = f1_score(y_test, y_pred_best, average='micro')
print("F1_Score:", best_f1_micro)

F1_Score: 0.9884749795523831


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Same 72-candidate grid as the decision-tree search, applied to a random
# forest.
# NOTE: this cell rebinds param_grid, grid_search and best_model, so after it
# runs those names refer to the random-forest search.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# random_state makes the bootstrap sampling and feature sub-sampling repeatable
# so the selected hyperparameters do not change between runs. n_jobs=-1 spreads
# the 72 * 5 = 360 forest fits over all available cores; it does not affect
# which candidate wins.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# y_train is a one-column DataFrame; flatten it to the 1-D target the
# classifier expects.
grid_search.fit(x_train, y_train.values.ravel())

print("Best hyperparameters:", grid_search.best_params_)

# With the default refit=True, best_estimator_ has been retrained on all of
# x_train using the winning parameter combination.
best_model = grid_search.best_estimator_



In [None]:
# Load the competition test split from the extracted archive directory.
test_data = pd.read_csv(
    "/content/nexus-by-djs-nsdc-ultraceuticals/test.csv",
)

In [None]:
# Encode the test features with the already-fitted encoder (transform only,
# no refit) so the columns line up with the training matrix.
test_features = test_data[input_cols]
final_encoded_cols = encoder.transform(test_features)

In [None]:
# Predict test labels with the fitted AdaBoost ensemble.
final_predictions = adaboost_model.predict(X=final_encoded_cols)

In [None]:
# Pair each test row's ID with its predicted label.
predictions_df = pd.DataFrame(
    {
        'ID': test_data['ID'],
        'Prediction': final_predictions,
    }
)

In [None]:
# Preview the first five prediction rows.
predictions_df.head(n=5)

In [None]:
# Write the predictions to disk without the DataFrame index column.
predictions_df.to_csv(path_or_buf='predictions.csv', index=False)

In [None]:
# Sanity check: both encoded matrices come from the same encoder, so their
# column counts should match.
for shape_label, matrix in (
    ("Shape of training data:", encoded_cols),
    ("Shape of test data:", final_encoded_cols),
):
    print(shape_label, matrix.shape)

In [None]:
# Read the predictions back from the same relative path they were written to,
# so the cell does not depend on the working directory being /content.
sumbit_data = pd.read_csv("predictions.csv")

In [None]:
# Show rows whose ID repeats an earlier row (the first occurrence of each ID
# is not flagged).
duplicate_rows = sumbit_data[sumbit_data.duplicated(subset=['ID'])]
print(duplicate_rows)

In [None]:
# Keep only the first row for each ID so the submission has one prediction
# per ID.
sumbit_data = sumbit_data.drop_duplicates(subset=['ID'], keep='first')

In [None]:
# Display the de-duplicated submission frame as the cell output.
sumbit_data

In [None]:
# Write the final submission file without the DataFrame index column.
sumbit_data.to_csv(path_or_buf='submission.csv', index=False)