## **Load the Dataset**

In [2]:
import pandas as pd

# Read the Parquet file
df = pd.read_parquet("/content/train.parquet")



In [2]:
print(df.head())

                            Patient-Uid       Date           Incident
0  a0db1e73-1c7c-11ec-ae39-16262ee38c7f 2019-03-09  PRIMARY_DIAGNOSIS
1  a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f 2015-05-16  PRIMARY_DIAGNOSIS
3  a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f 2018-01-30     SYMPTOM_TYPE_0
4  a0dc950b-1c7c-11ec-b6ec-16262ee38c7f 2015-04-22        DRUG_TYPE_0
8  a0dc9543-1c7c-11ec-bb63-16262ee38c7f 2016-06-18        DRUG_TYPE_1


In [3]:
# shape of the dataframe
df.shape

(3220868, 3)

In [3]:
df

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1
...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6


In [4]:
# check the null values if any
df.isnull().sum()

Patient-Uid    0
Date           0
Incident       0
dtype: int64

In [5]:
# get the basic information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3220868 entries, 0 to 29080911
Data columns (total 3 columns):
 #   Column       Dtype         
---  ------       -----         
 0   Patient-Uid  object        
 1   Date         datetime64[ns]
 2   Incident     object        
dtypes: datetime64[ns](1), object(2)
memory usage: 98.3+ MB


In [6]:
df["Incident"].value_counts()

DRUG_TYPE_6          561934
DRUG_TYPE_1          484666
PRIMARY_DIAGNOSIS    431902
DRUG_TYPE_0          300005
DRUG_TYPE_7          258782
DRUG_TYPE_2          256841
DRUG_TYPE_8          160066
DRUG_TYPE_3          127676
TEST_TYPE_1           96810
TARGET DRUG           67218
DRUG_TYPE_9           66894
DRUG_TYPE_5           57510
DRUG_TYPE_11          48118
SYMPTOM_TYPE_0        46078
SYMPTOM_TYPE_6        32066
TEST_TYPE_0           27570
SYMPTOM_TYPE_7        22019
DRUG_TYPE_10          20925
DRUG_TYPE_14          17306
DRUG_TYPE_13          12372
DRUG_TYPE_12           9551
SYMPTOM_TYPE_14        8927
SYMPTOM_TYPE_1         8608
SYMPTOM_TYPE_2         8168
TEST_TYPE_3            8115
SYMPTOM_TYPE_5         7583
SYMPTOM_TYPE_8         7430
TEST_TYPE_2            7021
SYMPTOM_TYPE_15        6295
SYMPTOM_TYPE_10        6005
SYMPTOM_TYPE_29        5950
SYMPTOM_TYPE_16        4940
DRUG_TYPE_15           4906
SYMPTOM_TYPE_9         4885
DRUG_TYPE_4            4566
SYMPTOM_TYPE_4      

In [7]:
df["Incident"].nunique()

57

In [8]:
df['Patient-Uid'].value_counts()

a0ddfd2c-1c7c-11ec-876d-16262ee38c7f    1645
a0ea618f-1c7c-11ec-93fb-16262ee38c7f    1320
a0e553c4-1c7c-11ec-83f1-16262ee38c7f    1163
a0df4809-1c7c-11ec-be0b-16262ee38c7f    1099
a0ec2afe-1c7c-11ec-befd-16262ee38c7f    1075
                                        ... 
a0ecc127-1c7c-11ec-92b5-16262ee38c7f      31
a0eb794b-1c7c-11ec-92d7-16262ee38c7f      30
a0f02cd8-1c7c-11ec-96a0-16262ee38c7f      28
a0efac48-1c7c-11ec-9daa-16262ee38c7f      25
a0f0d0b5-1c7c-11ec-9901-16262ee38c7f      24
Name: Patient-Uid, Length: 27033, dtype: int64

In [9]:
df['Patient-Uid'].nunique()

27033

## Data Preprocessing

**Consider only the TARGET DRUG as eligibility criteria**

In [10]:
trg = df[df['Incident']=='TARGET DRUG']

In [11]:
trg

Unnamed: 0,Patient-Uid,Date,Incident
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG
...,...,...,...
29074998,a0ef2b6d-1c7c-11ec-9172-16262ee38c7f,2018-10-12,TARGET DRUG
29075105,a0ebe423-1c7c-11ec-a5e0-16262ee38c7f,2019-07-02,TARGET DRUG
29075494,a0ebc713-1c7c-11ec-bd53-16262ee38c7f,2019-05-21,TARGET DRUG
29080031,a0ee1bdb-1c7c-11ec-90ba-16262ee38c7f,2018-06-07,TARGET DRUG


In [12]:
trg.shape

(67218, 3)

In [13]:
# take the unique items from patient id
target_elg = trg["Patient-Uid"].unique()

In [14]:
target_elg

array(['a0eb742b-1c7c-11ec-8f61-16262ee38c7f',
       'a0edaf09-1c7c-11ec-a360-16262ee38c7f',
       'a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f', ...,
       'a0ef1c8c-1c7c-11ec-b007-16262ee38c7f',
       'a0f08a4d-1c7c-11ec-bb15-16262ee38c7f',
       'a0efc1fa-1c7c-11ec-9e1c-16262ee38c7f'], dtype=object)

In [15]:
target_elg.shape              # Totally 9k items are consider to be eligible for the drug , unique patient IDs who have taken the "TARGET DRUG" at least once.

(9374,)

In [16]:
# take 1 for target drug user
# take 0 for others drug users

df['Eligiblity_for_drug'] = df['Patient-Uid'].isin(target_elg)

**Encoded the Eligibility column with replace method**

In [18]:
df['Eligiblity_for_drug'] = df['Eligiblity_for_drug'].replace({True: 1, False: 0})

In [19]:
df

Unnamed: 0,Patient-Uid,Date,Incident,Eligiblity_for_drug
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS,0
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS,0
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0,0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0,0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1,0
...,...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6,1
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6,1
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10,1
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6,1


In [20]:
# Feature Engineering - Calculate time since first prescription for each patient
df["Date"] = pd.to_datetime(df["Date"])

**we have a lot of patient-uid in which we can group by the unique patient-uid calculate the no of days beofre the TARGET DRUG IS USED**

In [21]:
_1st_prescription_date = df.groupby("Patient-Uid")["Date"].min()

In [22]:
df["No.Of days before TARGET DRUG"] = (df["Date"] - df["Patient-Uid"].map(_1st_prescription_date)).dt.days


**Total no of incident happened for a single patient**

In [23]:
df["No of Incidents"] = df.groupby("Patient-Uid")["Date"].transform("count")

In [24]:
df

Unnamed: 0,Patient-Uid,Date,Incident,Eligiblity_for_drug,No.Of days before TARGET DRUG,No of Incidents
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS,0,1264,96
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS,0,36,109
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0,0,1028,98
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0,0,0,109
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1,0,431,244
...,...,...,...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6,1,1123,199
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6,1,996,94
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10,1,1262,126
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6,1,164,136


In [25]:
df.isnull().sum()

Patient-Uid                      0
Date                             0
Incident                         0
Eligiblity_for_drug              0
No.Of days before TARGET DRUG    0
No of Incidents                  0
dtype: int64

In [26]:
df['Eligiblity_for_drug'].value_counts()

0    1784079
1    1436789
Name: Eligiblity_for_drug, dtype: int64

**Encoding the Incident column** - We use one hot encoding because incident  categories do not have a ranking in nature.

In [27]:
df = pd.get_dummies(df, columns=["Incident"])


**Creating a positive and negative set**

In [28]:
# Create a positive set and a negative set
positive_set = df[df["Eligiblity_for_drug"] == 1]
negative_set = df[df["Eligiblity_for_drug"] == 0]

# combine both  positive and negative sets to create the final  set  for training
df_final = pd.concat([positive_set, negative_set])


In [29]:
positive_set.shape,negative_set.shape     # both are unequal now

((1436789, 62), (1784079, 62))

**Split the data for training process**

In [30]:
df_final

Unnamed: 0,Patient-Uid,Date,Eligiblity_for_drug,No.Of days before TARGET DRUG,No of Incidents,Incident_DRUG_TYPE_0,Incident_DRUG_TYPE_1,Incident_DRUG_TYPE_10,Incident_DRUG_TYPE_11,Incident_DRUG_TYPE_12,...,Incident_SYMPTOM_TYPE_7,Incident_SYMPTOM_TYPE_8,Incident_SYMPTOM_TYPE_9,Incident_TARGET DRUG,Incident_TEST_TYPE_0,Incident_TEST_TYPE_1,Incident_TEST_TYPE_2,Incident_TEST_TYPE_3,Incident_TEST_TYPE_4,Incident_TEST_TYPE_5
8,a0e9c384-1c7c-11ec-81a0-16262ee38c7f,2018-02-22,1,1045,116,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,a0e9c3b3-1c7c-11ec-ae8e-16262ee38c7f,2018-02-21,1,1042,285,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,a0e9c3e3-1c7c-11ec-a8b9-16262ee38c7f,2017-05-11,1,603,71,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,a0e9c414-1c7c-11ec-889a-16262ee38c7f,2019-11-22,1,1666,135,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,a0e9c443-1c7c-11ec-9eb0-16262ee38c7f,2020-01-28,1,1748,171,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3256795,a0e045a1-1c7c-11ec-8014-16262ee38c7f,2020-07-10,0,1743,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3256799,a0e67e2a-1c7c-11ec-b805-16262ee38c7f,2015-12-16,0,253,59,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3256800,a0dec400-1c7c-11ec-80df-16262ee38c7f,2019-08-06,0,1537,157,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3256804,a0e09919-1c7c-11ec-9e7d-16262ee38c7f,2017-02-19,0,684,175,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
X = df_final.drop(['Patient-Uid','Date','Eligiblity_for_drug'],axis=1)
y = df_final['Eligiblity_for_drug']

In [32]:
X.shape,y.shape

((3220868, 59), (3220868,))

# **MODELS**

**RANDOMFOREST CLASSIFIER**

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix


# Split data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)



In [37]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2576694, 59), (644174, 59), (2576694,), (644174,))

In [39]:
#scaling to  standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:

# Create and train the Random Forest model
model = RandomForestClassifier(n_estimators=50, random_state=42)
result = model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = result.predict(X_test)

# Evaluate the model using F1-score
f1 = f1_score(y_test, y_pred)
print("F1-score:", f1)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy ",accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


F1-score: 0.636632832417539
Accuracy  0.6793987338824603
Confusion Matrix:
[[256733 100330]
 [106193 180918]]


**LOGISTIC REGRESSION**

In [37]:
from sklearn.linear_model import LogisticRegression

# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_test)

# Evaluate the model using F1-score
f1 = f1_score(y_test, y_pred)
print("F1-score:", f1)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy ",accuracy)


# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

F1-score: 0.6142153328526796
Accuracy  0.7009658880985572
Confusion Matrix:
[[298199  58864]
 [133766 153345]]


**GradientBoostingClassifier**

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# X_train, X_test, y_train, y_test = train_test_split(X1,y1,test_size=0.2, random_state=42)

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

gb_pred = gb_model.predict(X_test)

gb_f1 = f1_score(y_test, gb_pred)
gb_accuracy = accuracy_score(y_test, gb_pred)


print("\nGradient Boosting:")
print("F1-score:", gb_f1)
print("Accuracy:", gb_accuracy)


Gradient Boosting:
F1-score: 0.6537972802259048
Accuracy: 0.71070859736655


**XGBOOST CLASSIFIER**

In [46]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# X_train, X_test, y_train, y_test = train_test_split( X1,y1,test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_test)

xgb_f1 = f1_score(y_test, xgb_pred)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print("\nXGBoost:")
print("F1-score:", xgb_f1)
print("Accuracy:", xgb_accuracy)


XGBoost:
F1-score: 0.6694611688357415
Accuracy: 0.7231043165355944


INFERENCE :
 **By comparing the different models we can choose ExtraGradientBoostclassifier for our predicting model , because it give F1score approximately 0.67 and accuracy of 72 percent with others.**

 **WE CHOOSE XGBOOST CLASSIFIER**

# **TESTING**

In [120]:
df_test = pd.read_parquet("/content/test.parquet")

In [121]:
df_test

Unnamed: 0,Patient-Uid,Date,Incident
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0
...,...,...,...
1372854,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-05-11,DRUG_TYPE_13
1372856,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2018-08-22,DRUG_TYPE_2
1372857,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-02-04,DRUG_TYPE_2
1372858,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-09-25,DRUG_TYPE_8


In [122]:
df_test['Incident'].value_counts()

DRUG_TYPE_6          192292
DRUG_TYPE_1          169127
PRIMARY_DIAGNOSIS    130511
DRUG_TYPE_7           93002
DRUG_TYPE_0           91059
DRUG_TYPE_2           90629
DRUG_TYPE_8           56213
DRUG_TYPE_3           41091
TEST_TYPE_1           34274
DRUG_TYPE_9           25294
DRUG_TYPE_5           20692
DRUG_TYPE_11          17542
SYMPTOM_TYPE_0        17292
SYMPTOM_TYPE_6        11536
TEST_TYPE_0           10921
SYMPTOM_TYPE_7         7943
SYMPTOM_TYPE_5         5403
DRUG_TYPE_14           4963
DRUG_TYPE_10           4245
DRUG_TYPE_13           4079
SYMPTOM_TYPE_1         2955
DRUG_TYPE_12           2826
SYMPTOM_TYPE_2         2772
TEST_TYPE_3            2709
SYMPTOM_TYPE_8         2683
TEST_TYPE_2            2524
SYMPTOM_TYPE_14        2482
DRUG_TYPE_4            1840
SYMPTOM_TYPE_15        1818
SYMPTOM_TYPE_10        1783
SYMPTOM_TYPE_29        1703
SYMPTOM_TYPE_9         1689
SYMPTOM_TYPE_16        1576
DRUG_TYPE_15           1373
SYMPTOM_TYPE_4         1315
SYMPTOM_TYPE_3      

In [123]:
df_test['Incident'].nunique()

55

**Preprocessing the test datatset and do some changed as per training set in same order**

In [124]:

df_test["Date"] = pd.to_datetime(df_test["Date"])
test_1st_prescription_date = df_test.groupby("Patient-Uid")["Date"].min()
df_test["No.Of days before TARGET DRUG"] = (df_test["Date"] - df_test["Patient-Uid"].map(test_1st_prescription_date)).dt.days
df_test["No of Incidents"] = df_test.groupby("Patient-Uid")["Date"].transform("count")
df_test = pd.get_dummies(df_test, columns=["Incident"])

**Insert the missing columns into desired index**

In [125]:
df_test.insert(loc=14, column='Incident_DRUG_TYPE_18', value=0)


In [127]:
df_test.insert(loc=54, column='Incident_TARGET DRUG', value=0)

In [128]:
df_test.columns

Index(['Patient-Uid', 'Date', 'No.Of days before TARGET DRUG',
       'No of Incidents', 'Incident_DRUG_TYPE_0', 'Incident_DRUG_TYPE_1',
       'Incident_DRUG_TYPE_10', 'Incident_DRUG_TYPE_11',
       'Incident_DRUG_TYPE_12', 'Incident_DRUG_TYPE_13',
       'Incident_DRUG_TYPE_14', 'Incident_DRUG_TYPE_15',
       'Incident_DRUG_TYPE_16', 'Incident_DRUG_TYPE_17',
       'Incident_DRUG_TYPE_18', 'Incident_DRUG_TYPE_2', 'Incident_DRUG_TYPE_3',
       'Incident_DRUG_TYPE_4', 'Incident_DRUG_TYPE_5', 'Incident_DRUG_TYPE_6',
       'Incident_DRUG_TYPE_7', 'Incident_DRUG_TYPE_8', 'Incident_DRUG_TYPE_9',
       'Incident_PRIMARY_DIAGNOSIS', 'Incident_SYMPTOM_TYPE_0',
       'Incident_SYMPTOM_TYPE_1', 'Incident_SYMPTOM_TYPE_10',
       'Incident_SYMPTOM_TYPE_11', 'Incident_SYMPTOM_TYPE_12',
       'Incident_SYMPTOM_TYPE_13', 'Incident_SYMPTOM_TYPE_14',
       'Incident_SYMPTOM_TYPE_15', 'Incident_SYMPTOM_TYPE_16',
       'Incident_SYMPTOM_TYPE_17', 'Incident_SYMPTOM_TYPE_18',
       'Incident_SY

In [129]:
X_predict = df_test.drop(['Patient-Uid','Date'],axis = 1)

In [130]:
X_predict

Unnamed: 0,No.Of days before TARGET DRUG,No of Incidents,Incident_DRUG_TYPE_0,Incident_DRUG_TYPE_1,Incident_DRUG_TYPE_10,Incident_DRUG_TYPE_11,Incident_DRUG_TYPE_12,Incident_DRUG_TYPE_13,Incident_DRUG_TYPE_14,Incident_DRUG_TYPE_15,...,Incident_SYMPTOM_TYPE_7,Incident_SYMPTOM_TYPE_8,Incident_SYMPTOM_TYPE_9,Incident_TARGET DRUG,Incident_TEST_TYPE_0,Incident_TEST_TYPE_1,Incident_TEST_TYPE_2,Incident_TEST_TYPE_3,Incident_TEST_TYPE_4,Incident_TEST_TYPE_5
0,168,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,846,55,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,526,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,895,55,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,499,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372854,758,179,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1372856,1226,179,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1372857,662,179,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1372858,895,179,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
X_predict.shape

(1065524, 59)

**Scale down the test data**

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


#scaling to  standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
x_predict_test = scaler.transform(X_predict)

**Predict the Output with the Extragradientboosting model**

In [133]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix



# Train XGBoost model
model = xgb.XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

df_test['PREDICTION OUTPUT'] = model.predict(x_predict_test)



In [None]:
df_test

**Download in CSV format**

In [136]:
df_test[["Patient-Uid", "PREDICTION OUTPUT"]].to_csv("final_submission.csv", index=False)

In [141]:
df_test["PREDICTION OUTPUT"].value_counts()

0    727161
1    338363
Name: PREDICTION OUTPUT, dtype: int64