# Drugs Prediction

In [496]:
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype


In [563]:
train = pd.read_csv("../HagitGadot/train.csv")
test = pd.read_csv("../HagitGadot/dev.csv")

In [564]:
#list(train)

In [565]:
train2 = train.copy()
train2['AgeStartAll'] = train2['AgeStartAll'].astype('category')

In [566]:
test2 = test.copy()
test2['AgeStartAll'] = test2['AgeStartAll'].astype('category')

In [567]:
train2 = train2.rename(columns={'X.PeoInFamily_1':'XPeoInFamily_1'})
test2 = test2.rename(columns={'X.PeoInFamily_1':'XPeoInFamily_1'})

# Partition of the data

In [570]:
y_train = train2.AgeStartAll
y_test = test2.AgeStartAll

In [571]:
train3=train2.copy()
test3=test2.copy()

In [572]:
train3=train3.drop("Unnamed: 0",axis=1)
test3=test3.drop("Unnamed: 0",axis=1)
train3=train3.drop("AgeStartAll",axis=1)
test3=test3.drop("AgeStartAll",axis=1)

In [573]:
X_train = train3
X_test = test3

# Models

### Logistic Regression

In [481]:
from sklearn.linear_model import LogisticRegression


In [482]:
from sklearn.metrics import roc_auc_score

In [575]:
mod1 = LogisticRegression(random_state=1)


In [576]:
mod1.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [577]:
yhat1_tr = mod1.predict(X_train)
yhat1_tr

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [578]:
yprob1_tr = mod1.predict_proba(X_train)
yprob1_tr

array([[0.97947926, 0.02052074],
       [0.99217964, 0.00782036],
       [0.96522514, 0.03477486],
       ...,
       [0.9555648 , 0.0444352 ],
       [0.95477904, 0.04522096],
       [0.97903246, 0.02096754]])

In [579]:
pd.crosstab(y_train,yhat1_tr)

col_0,0,1
AgeStartAll,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13565,161
1,950,392


In [580]:
### Accuracy:
mod1.score(X_train,y_train)

0.926267586939209

In [581]:
### Train
mod1.score(X_train,y_train)
yprob1_tr = mod1.predict_proba(X_train)
print('AUC=',roc_auc_score(y_train, yprob1_tr[:,1]))


AUC= 0.8693233256020045


In [582]:
yhat1_ts = mod1.predict(X_test)
print(mod1.score(X_test,y_test))
print(pd.crosstab(y_test,yhat1_ts))
yprob1_ts = mod1.predict_proba(X_test)
print(roc_auc_score(y_test, yprob1_ts[:,1]))

0.9166666666666666
col_0           0   1
AgeStartAll          
0            3356  56
1             258  98
0.845624168499809


In [92]:
res = {1:{'model':'Logistic Regression',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yprob1_tr[:,1]),
           'AUC-test':roc_auc_score(y_test, yprob1_ts[:,1])
      }}

In [93]:
res

{1: {'model': 'Logistic Regression',
  'accuracy-Train': 0.926267586939209,
  'accuracy-Test': 0.9166666666666666,
  'AUC-train': 0.8693233256020045,
  'AUC-test': 0.845624168499809}}

## Decision Tree Model

In [94]:
from sklearn.tree import DecisionTreeClassifier

In [583]:
mod2 = DecisionTreeClassifier(random_state=1)
mod2.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [584]:
yhat2_tr = mod2.predict(X_train)
print(pd.crosstab(y_train,yhat2_tr))
yprob2_tr = mod2.predict_proba(X_train)
print('---------------')
print(mod2.score(X_train,y_train))
print(roc_auc_score(y_train, yprob2_tr[:,1]))

col_0            0    1
AgeStartAll            
0            13664   62
1              659  683
---------------
0.9521502521900717
0.9329105912110406


In [585]:
yhat2_ts = mod2.predict(X_test)
print(pd.crosstab(y_test,yhat2_ts))
yprob2_ts = mod2.predict_proba(X_test)
print('---------------')
print(mod2.score(X_test,y_test))
print(roc_auc_score(y_test, yprob2_ts[:,1]))

col_0           0    1
AgeStartAll           
0            3303  109
1             257   99
---------------
0.9028662420382165
0.6741848828325671


In [586]:
res[2] = {'model':'Decision Tree',
           'accuracy-Train':mod2.score(X_train,y_train),
           'accuracy-Test':mod2.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yprob2_tr[:,1]),
           'AUC-test':roc_auc_score(y_test, yprob2_ts[:,1])
         }

In [587]:
res[2]

{'model': 'Decision Tree',
 'accuracy-Train': 0.9521502521900717,
 'accuracy-Test': 0.9028662420382165,
 'AUC-train': 0.9329105912110406,
 'AUC-test': 0.6741848828325671}

## Random Forest Model

In [100]:
from sklearn.ensemble import RandomForestClassifier

In [101]:
mod3 = RandomForestClassifier(max_depth=3,max_leaf_nodes=4,criterion='entropy',random_state=34)
mod3.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=3, max_features='auto',
                       max_leaf_nodes=4, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=34, verbose=0,
                       warm_start=False)

In [102]:
yhat3_tr = mod3.predict(X_train)
print(pd.crosstab(y_train,yhat3_tr))
yprob3_tr = mod3.predict_proba(X_train)
print('---------------')
print(mod3.score(X_train,y_train))
print(roc_auc_score(y_train, yprob3_tr[:,1]))

col_0            0
AgeStartAll       
0            13726
1             1342
---------------
0.9109370852136979
0.8617787112169558


In [103]:
yhat3_ts = mod3.predict(X_test)
print(pd.crosstab(y_test,yhat3_ts))
yprob3_ts = mod3.predict_proba(X_test)
print('---------------')
print(mod3.score(X_test,y_test))
print(roc_auc_score(y_test, yprob3_ts[:,1]))

col_0           0
AgeStartAll      
0            3412
1             356
---------------
0.9055201698513801
0.8389194778508108


In [104]:
res[3] = {'model':'Random Forest',
           'accuracy-Train':mod3.score(X_train,y_train),
           'accuracy-Test':mod3.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yprob3_tr[:,1]),
           'AUC-test':roc_auc_score(y_test, yprob3_ts[:,1])
         }

In [105]:
res[3]

{'model': 'Random Forest',
 'accuracy-Train': 0.9109370852136979,
 'accuracy-Test': 0.9055201698513801,
 'AUC-train': 0.8617787112169558,
 'AUC-test': 0.8389194778508108}

## SVM

In [106]:
from sklearn.svm import SVC

In [107]:
mod4 = SVC(probability=True)
mod4.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [108]:
yhat4_tr = mod4.predict(X_train)
print(pd.crosstab(y_train,yhat4_tr))
yprob4_tr = mod4.predict_proba(X_train)
print('---------------')
print(mod4.score(X_train,y_train))
print(roc_auc_score(y_train, yprob4_tr[:,1]))

col_0            0    1
AgeStartAll            
0            13660   66
1              956  386
---------------
0.9321741438810724
0.8087602248650563


In [109]:
yhat4_ts = mod4.predict(X_test)
print(pd.crosstab(y_test,yhat4_ts))
yprob4_ts = mod4.predict_proba(X_test)
print('---------------')
print(mod4.score(X_test,y_test))
print(roc_auc_score(y_test, yprob4_ts[:,1]))

col_0           0   1
AgeStartAll          
0            3375  37
1             268  88
---------------
0.9190552016985138
0.7186450333917305


In [110]:
res[4] = {'model':'SVM',
           'accuracy-Train':mod4.score(X_train,y_train),
           'accuracy-Test':mod4.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, 1-yprob4_tr[:,1]),
           'AUC-test':roc_auc_score(y_test, 1-yprob4_ts[:,1])
         }

In [111]:
res[4]

{'model': 'SVM',
 'accuracy-Train': 0.9321741438810724,
 'accuracy-Test': 0.9190552016985138,
 'AUC-train': 0.19123977513494356,
 'AUC-test': 0.28135496660826953}

## kNN

In [112]:
from sklearn.neighbors import KNeighborsClassifier

In [113]:
mod5 = KNeighborsClassifier()
mod5.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [114]:
yhat5_tr = mod5.predict(X_train)
print(pd.crosstab(y_train,yhat5_tr))
yprob5_tr = mod5.predict_proba(X_train)
print('---------------')
print(mod5.score(X_train,y_train))
print(roc_auc_score(y_train, yprob5_tr[:,1]))

col_0            0    1
AgeStartAll            
0            13469  257
1              843  499
---------------
0.9269976108308999
0.8547518682114268


In [115]:
yhat5_ts = mod5.predict(X_test)
print(pd.crosstab(y_test,yhat5_ts))
yprob5_ts = mod5.predict_proba(X_test)
print('---------------')
print(mod5.score(X_test,y_test))
print(roc_auc_score(y_test, yprob5_ts[:,1]))

col_0           0    1
AgeStartAll           
0            3307  105
1             267   89
---------------
0.9012738853503185
0.7643693112214656


In [116]:
res[5] = {'model':'kNN',
           'accuracy-Train':mod5.score(X_train,y_train),
           'accuracy-Test':mod5.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train,yprob5_tr[:,1]),
           'AUC-test':roc_auc_score(y_test,1-yprob5_ts[:,1])
         }

In [117]:
res[5]

{'model': 'kNN',
 'accuracy-Train': 0.9269976108308999,
 'accuracy-Test': 0.9012738853503185,
 'AUC-train': 0.8547518682114268,
 'AUC-test': 0.23563068877853438}

## AdaBoost

In [122]:
from sklearn.ensemble import AdaBoostClassifier

In [123]:
mod6 = AdaBoostClassifier()
mod6.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [124]:
yhat6_tr = mod6.predict(X_train)
print(pd.crosstab(y_train,yhat6_tr))
yprob6_tr = mod6.predict_proba(X_train)
print('---------------')
print(mod6.score(X_train,y_train))
print(roc_auc_score(y_train, yprob6_tr[:,1]))

col_0            0    1
AgeStartAll            
0            13565  161
1              946  396
---------------
0.926533050172551
0.8682964417719327


In [125]:
yhat6_ts = mod6.predict(X_test)
print(pd.crosstab(y_test,yhat6_ts))
yprob6_ts = mod6.predict_proba(X_test)
print('---------------')
print(mod6.score(X_test,y_test))
print(roc_auc_score(y_test, yprob6_ts[:,1]))

col_0           0    1
AgeStartAll           
0            3356   56
1             256  100
---------------
0.9171974522292994
0.8428736317293886


In [126]:
res[6] = {'model':'AdaBoost',
           'accuracy-Train':mod6.score(X_train,y_train),
           'accuracy-Test':mod6.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yprob6_tr[:,1]),
           'AUC-test':roc_auc_score(y_test, yprob6_ts[:,1])
         }

In [127]:
res[6]

{'model': 'AdaBoost',
 'accuracy-Train': 0.926533050172551,
 'accuracy-Test': 0.9171974522292994,
 'AUC-train': 0.8682964417719327,
 'AUC-test': 0.8428736317293886}

## Gradient Boosting Machine

In [128]:
from sklearn.ensemble import GradientBoostingClassifier

In [129]:
mod7 = GradientBoostingClassifier()
mod7.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [130]:
yhat7_tr = mod7.predict(X_train)
print(pd.crosstab(y_train,yhat7_tr))
yprob7_tr = mod7.predict_proba(X_train)
print('---------------')
print(mod7.score(X_train,y_train))
print(roc_auc_score(y_train, yprob7_tr[:,1]))

col_0            0    1
AgeStartAll            
0            13610  116
1              941  401
---------------
0.9298513405893284
0.8722863893797124


In [131]:
yhat7_ts = mod7.predict(X_test)
print(pd.crosstab(y_test,yhat7_ts))
yprob7_ts = mod7.predict_proba(X_test)
print('---------------')
print(mod7.score(X_test,y_test))
print(roc_auc_score(y_test, yprob7_ts[:,1]))

col_0           0   1
AgeStartAll          
0            3360  52
1             263  93
---------------
0.9164012738853503
0.8405865945703861


In [132]:
res[7] = {'model':'Gradient Boosting Machine',
           'accuracy-Train':mod7.score(X_train,y_train),
           'accuracy-Test':mod7.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yprob7_tr[:,1]),
           'AUC-test':roc_auc_score(y_test, yprob7_ts[:,1])
         }

In [133]:
res[7]

{'model': 'Gradient Boosting Machine',
 'accuracy-Train': 0.9298513405893284,
 'accuracy-Test': 0.9164012738853503,
 'AUC-train': 0.8722863893797124,
 'AUC-test': 0.8405865945703861}

# Model Selection

In [134]:
pd.DataFrame.from_dict(res,orient='index').sort_values(by='AUC-test',ascending=False)

Unnamed: 0,model,accuracy-Train,accuracy-Test,AUC-train,AUC-test
1,Logistic Regression,0.926268,0.916667,0.869323,0.845624
6,AdaBoost,0.926533,0.917197,0.868296,0.842874
7,Gradient Boosting Machine,0.929851,0.916401,0.872286,0.840587
3,Random Forest,0.910937,0.90552,0.861779,0.838919
2,Decision Tree,0.95215,0.902866,0.932911,0.674185
4,SVM,0.932174,0.919055,0.19124,0.281355
5,kNN,0.926998,0.901274,0.854752,0.235631


In [135]:
pd.DataFrame.from_dict(res,orient='index').sort_values(by='accuracy-Test',ascending=False)

Unnamed: 0,model,accuracy-Train,accuracy-Test,AUC-train,AUC-test
4,SVM,0.932174,0.919055,0.19124,0.281355
6,AdaBoost,0.926533,0.917197,0.868296,0.842874
1,Logistic Regression,0.926268,0.916667,0.869323,0.845624
7,Gradient Boosting Machine,0.929851,0.916401,0.872286,0.840587
3,Random Forest,0.910937,0.90552,0.861779,0.838919
2,Decision Tree,0.95215,0.902866,0.932911,0.674185
5,kNN,0.926998,0.901274,0.854752,0.235631


In [None]:
# I chose Logistic Regression because it is the best model (and also easier model to preform and explain)

## Finetunnig - Logistic regression

In [144]:
lr=LogisticRegression()

In [145]:
from sklearn.model_selection import GridSearchCV

In [207]:
param_grid =[{'penalty':['l1','l2','elasticnet','none'],
            'C':[0.005, 0.5,1,2,5,8],
            'solver':['liblinear','newton-cg','sag','saga'],
            'max_iter':[100,200,500]}]

In [208]:
ftlr=GridSearchCV(lr,param_grid=param_grid,cv=7,verbose=True,n_jobs=-1)

In [209]:
best_ftlr=ftlr.fit(X_train,y_train)

Fitting 7 folds for each of 288 candidates, totalling 2016 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 380 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 742 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1288 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1930 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2016 out of 2016 | elapsed:  6.1min finished


In [190]:
best_ftlr.best_estimator_

LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [191]:
print(f'Accuracy:{best_ftlr.score(X_train,y_train):.5f}')

Accuracy:0.92640


In [192]:
pred_tr = best_ftlr.predict_proba(X_train)
print('AUC=',roc_auc_score(y_train, pred_tr[:,1]))

AUC= 0.86917142790136


In [193]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mae = 100 * np.mean(errors)
    print('Model Performance')
    print('Mean Absolute Error: {:0.4f}'.format(np.mean(errors)))
    return mae

In [194]:
#Base Logistic regression model

In [195]:
base_model = LogisticRegression(random_state=1)
base_model.fit(X_train,y_train)
y_test=y_test.astype('float')
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Mean Absolute Error: 0.0833


In [212]:
print('bace model Accuracy= {:0.5f}'.format(base_model.score(X_train,y_train)))

bace model Accuracy= 0.92627


In [213]:
yprob1_tr = base_model.predict_proba(X_train)
print('bace model AUC= {:0.5f}'.format(roc_auc_score(y_train, yprob1_tr[:,1])))

bace model AUC= 0.86932


In [198]:
#Best model - Logistic regression

In [214]:
best_model = LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

best_model.fit(X_train,y_train)
best_accuracy = evaluate(best_model, X_test, y_test)

Model Performance
Mean Absolute Error: 0.0833


In [215]:
print("BEST model Accuracy= {:0.5f}".format(best_model.score(X_train,y_train)))

BEST model Accuracy= 0.92640


In [203]:
yprob1_tr = best_model.predict_proba(X_train)
print('BEST model AUC= {:0.5f}'.format(roc_auc_score(y_train, yprob1_tr[:,1])))

BEST model AUC= 0.86917


## final check

In [225]:
print('Accuracy Improvement of {:0.5f}%'.format( 100*(best_model.score(X_train,y_train) - base_model.score(X_train,y_train))/best_model.score(X_train,y_train) ))

Accuracy Improvement of 0.01433%


In [240]:
print('AUC improvement  {:0.5f}%'.format( 100 * (roc_auc_score(y_train, yprob1_tr[:,1])-roc_auc_score(y_train, pred_tr[:,1]))/ roc_auc_score(y_train, pred_tr[:,1])))

AUC improvement  0.01748%


# The Final Model - Real data

In [606]:
TheTest = pd.read_csv("../HagitGadot/TheTest.csv", index_col=False)
TheTrain = pd.read_csv("../HagitGadot/TheTrain.csv",index_col=False)

In [607]:
TheTrain = TheTrain.rename(columns={'X.PeoInFamily_1':'XPeoInFamily_1'})

TheTest = TheTest.rename(columns={'X.PeoInFamily_1':'XPeoInFamily_1'})

In [608]:
TheTrain1=TheTrain.copy()

In [609]:
TheTest1=TheTest.copy()

In [611]:
X_train=TheTrain1.drop('AgeStartAll',axis=1)
X_test=TheTest1.drop('AgeStartAll',axis=1)

In [612]:
y_train = TheTrain.AgeStartAll
y_test = TheTest.AgeStartAll

In [613]:
mod = LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
mod.fit(X_train,y_train)

LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [614]:
yhat_tr = mod.predict(X_train)

In [615]:
yprob_tr = mod.predict_proba(X_train)

In [616]:
pd.crosstab(y_train,yhat_tr)

col_0,0,1
AgeStartAll,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16928,210
1,1208,490


In [599]:
### Accuracy:
mod.score(X_train,y_train)

0.9247186239116585

In [600]:
### Train
mod.score(X_train,y_train)
yprob_tr = mod1.predict_proba(X_train)
print(roc_auc_score(y_train, yprob_tr[:,1]))

0.8644642410166979


In [601]:
yhat_ts = mod.predict(X_test)
print(mod.score(X_test,y_test))
print(pd.crosstab(y_test,yhat_ts))
yprob_ts = mod.predict_proba(X_test)
print(roc_auc_score(y_test, yprob_ts[:,1]))

0.919091102144829
col_0           0    1
AgeStartAll           
0            4199   63
1             318  129
0.8606442974016253


In [602]:
print("The final outcome")
print("----------------------")
print('accuracy-Train= {:0.5f}'.format(mod.score(X_train,y_train)))
print('accuracy-Test=  {:0.5f}'.format(mod1.score(X_test,y_test)))
print('AUC-train=      {:0.5f}'.format(roc_auc_score(y_train, yprob_tr[:,1])))
print('AUC-test=       {:0.5f}'.format(roc_auc_score(y_test, yprob_ts[:,1])))
      

The final outcome
----------------------
accuracy-Train= 0.92472
accuracy-Test=  0.91994
AUC-train=      0.86446
AUC-test=       0.86064
