In [1]:
import pandas as pd
dataset = pd.read_csv("/content/train.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134486 entries, 0 to 134485
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   ID                         134486 non-null  int64 
 1   TargetID                   134486 non-null  object
 2   DRUGID                     134486 non-null  object
 3   DRUGTYPE                   134486 non-null  object
 4   Drug_high_status           134486 non-null  object
 5   DRUGNAME                   134486 non-null  object
 6   PUBCHCID                   134486 non-null  int64 
 7   Disease_of_highest_status  134486 non-null  object
 8   Drug_Status                134486 non-null  object
 9   UNIPROID                   134486 non-null  object
 10  TARGNAME                   134486 non-null  object
 11  GENENAME                   134486 non-null  object
 12  SYNONYMS                   134486 non-null  object
 13  FUNCTION                   134486 non-null  

In [2]:
input_cols =['DRUGTYPE','Drug_high_status', 'DRUGNAME', 'Disease_of_highest_status','Drug_Status','GENENAME','BIOCLASS','SEQUENCE', 'Disease']
target_cols=['Target_Status']

In [4]:
target_cols_copy = dataset['Target_Status'].copy()
target_cols_copy

Unnamed: 0,Target_Status
0,Terminated
1,Approved
2,Approved
3,Phase 2
4,Approved
...,...
134481,Phase 2
134482,Phase 1/2
134483,Investigative
134484,Phase 3


In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(dataset['Target_Status'])

dataset['Target_Status'] = label_encoder.transform(dataset['Target_Status'])

In [12]:
dataset['Target_Status']

Unnamed: 0,Target_Status
0,31
1,1
2,1
3,22
4,1
...,...
134481,22
134482,18
134483,14
134484,26


In [13]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(dataset[input_cols])
encoded_cols = encoder.transform(dataset[input_cols])
encoded_cols

<134486x5268 sparse matrix of type '<class 'numpy.float64'>'
	with 1210374 stored elements in Compressed Sparse Row format>

In [16]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score
xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, random_state=42)
xgb_model.fit(encoded_cols, dataset[target_cols])

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(encoded_cols, dataset[target_cols])




In [18]:
best_params = grid_search.best_params_

best_dt_model = DecisionTreeClassifier(**best_params)
best_dt_model.fit(encoded_cols, dataset[target_cols])



In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_cols, dataset[target_cols], test_size=0.2, random_state=42)

In [24]:
from sklearn.ensemble import StackingClassifier
estimators = [('xgb', xgb_model), ('dt', best_dt_model)]

stacking_model = StackingClassifier(estimators=estimators, final_estimator=xgb.XGBClassifier())

In [22]:
y_pred = stacking_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
f1

0.9929734552754852

In [25]:
stacking_model = stacking_model.fit(encoded_cols, dataset[target_cols])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [36]:
stacking_model_goat = StackingClassifier(estimators=estimators, final_estimator=xgb.XGBClassifier(n_estimators=1000))
stacking_model_goat = stacking_model.fit(encoded_cols, dataset[target_cols])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


test_data = pd.read_csv("/content/test.csv")
test_data.info()

In [45]:
test_data = pd.read_csv("/content/test.csv")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57637 entries, 0 to 57636
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ID                         57637 non-null  int64 
 1   TargetID                   57637 non-null  object
 2   DRUGID                     57637 non-null  object
 3   DRUGTYPE                   57637 non-null  object
 4   Drug_high_status           57637 non-null  object
 5   DRUGNAME                   57637 non-null  object
 6   PUBCHCID                   57637 non-null  int64 
 7   Disease_of_highest_status  57637 non-null  object
 8   Drug_Status                57637 non-null  object
 9   UNIPROID                   57637 non-null  object
 10  TARGNAME                   57637 non-null  object
 11  GENENAME                   57637 non-null  object
 12  SYNONYMS                   57637 non-null  object
 13  FUNCTION                   57637 non-null  object
 14  BIOCLA

In [46]:
input_test_cols =['DRUGTYPE','Drug_high_status', 'DRUGNAME', 'Disease_of_highest_status','Drug_Status','GENENAME','BIOCLASS','SEQUENCE', 'Disease']

encoded_cols_test = encoder.transform(test_data[input_test_cols])

In [47]:
final_preds = stacking_model_goat.predict(encoded_cols_test)

In [48]:
true_final = label_encoder.inverse_transform(final_preds)
true_final[0]

'Approved'

In [49]:
predictions_df = pd.DataFrame({'ID': test_data['ID'],'Target_Status': true_final})
predictions_df.head()

Unnamed: 0,ID,Target_Status
0,1076,Approved
1,190816,Phase 3
2,180551,Phase 3
3,51630,Approved
4,50566,Phase 1/2


In [50]:
predictions_df.to_csv('predictions.csv', index=False)

In [51]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57637 entries, 0 to 57636
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             57637 non-null  int64 
 1   Target_Status  57637 non-null  object
dtypes: int64(1), object(1)
memory usage: 900.7+ KB
