In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, cross_val_predict

from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.neural_network import MLPClassifier

from sklearn.calibration import CalibratedClassifierCV

In [7]:
# Train and test feature tables are read the same way, keyed by trip id;
# the target is the "label" column of the separate labels file.
X, test = (
    pd.read_csv(csv_path, index_col="tripid")
    for csv_path in ("X_train_f.csv", "X_test_f.csv")
)
y = pd.read_csv("y_train.csv", index_col="tripid")["label"]

In [8]:
def AvgF1(n,model,random_state=None):
    """Mean F1 of ``model`` over ``n`` repeats of shuffled, stratified 3-fold CV.

    Scores against the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    n : int
        Number of cross-validation repeats; each repeat builds a new shuffled
        ``StratifiedKFold``.
    model : estimator
        Handed unchanged to ``cross_val_score``.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous unseeded
        behaviour. With an int, a single ``RandomState`` is shared by all
        repeats, so the whole run is repeatable while every repeat still draws
        its own shuffle. Only the fold shuffling is seeded; ``model`` itself is
        not touched.

    Returns
    -------
    numpy float
        Mean, over the ``n`` repeats, of each repeat's mean fold F1.
    """
    # One shared generator: re-seeding every repeat with the same int would
    # produce n identical splits and make the repetition pointless.
    rng = None if random_state is None else np.random.RandomState(random_state)

    f1 = []
    for _ in range(n):
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
        f1.append(cross_val_score(model, X, y, n_jobs=4, cv=skf.split(X, y), scoring='f1').mean())

    return np.array(f1).mean()

def Eval(n,models):
    """Score every estimator in ``models`` with ``AvgF1``.

    Parameters
    ----------
    n : int
        Number of cross-validation repeats forwarded to ``AvgF1``.
    models : dict
        Maps a name to an estimator.

    Returns
    -------
    dict
        New dict mapping each name to its ``AvgF1`` score, in the same key
        order. ``models`` is left untouched; previously the estimators in the
        caller's dict were overwritten by their scores.
    """
    return {name: AvgF1(n, estimator) for name, estimator in models.items()}

def CVPredict(models,X,y,random_state=None):
    """Out-of-fold ``predict_proba`` column (index 1) for every model.

    Parameters
    ----------
    models : dict
        Maps a name to an estimator supporting ``predict_proba``.
    X : DataFrame
        Feature rows; its index becomes the index of the result.
    y : array-like
        Labels used for stratification and fitting.
    random_state : int or None, default None
        Seed for the fold shuffling; ``None`` leaves it unseeded as before.

    Returns
    -------
    DataFrame
        One ``"<name>_prob"`` column per model, in dict order, indexed like ``X``.
    """
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # Materialise the folds once. Calling skf.split() again for each model
    # would reshuffle, so the models' out-of-fold columns would come from
    # different partitions of the data.
    folds = list(skf.split(X, y))

    results = pd.DataFrame(index=X.index)

    for name, model in models.items():
        proba = cross_val_predict(model, X, y, n_jobs=4, cv=folds, method="predict_proba")
        # Rows come back in the order of X, so a positional assignment suffices.
        results[str(name) + "_prob"] = np.asarray(proba)[:, 1]

    return results
    
    
def AvgF1Stack(n,model,X,y,random_state=None):
    """Mean F1 of ``model`` on ``X`` / ``y`` over ``n`` repeats of shuffled 3-fold CV.

    Parameters
    ----------
    n : int
        Number of cross-validation repeats; each repeat builds a new shuffled
        ``StratifiedKFold``.
    model : estimator
        Handed unchanged to ``cross_val_score``.
    X, y
        Features and labels to score on.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous unseeded
        behaviour. With an int, a single ``RandomState`` is shared by all
        repeats, so the whole run is repeatable while every repeat still draws
        its own shuffle. Only the fold shuffling is seeded; ``model`` itself is
        not touched.

    Returns
    -------
    numpy float
        Mean, over the ``n`` repeats, of each repeat's mean fold F1.
    """
    # One shared generator: re-seeding every repeat with the same int would
    # produce n identical splits and make the repetition pointless.
    rng = None if random_state is None else np.random.RandomState(random_state)

    f1 = []
    for _ in range(n):
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
        f1.append(cross_val_score(model, X, y, n_jobs=4, cv=skf.split(X, y), scoring='f1').mean())

    return np.array(f1).mean()

def EvalStack(n,models,X,y):
    """Score every estimator in ``models`` on ``X`` / ``y`` with ``AvgF1Stack``.

    Parameters
    ----------
    n : int
        Number of cross-validation repeats forwarded to ``AvgF1Stack``.
    models : dict
        Maps a name to an estimator.
    X, y
        Features and labels forwarded to ``AvgF1Stack``.

    Returns
    -------
    dict
        New dict mapping each name to its score, in the same key order.
        ``models`` is left untouched; previously the estimators in the
        caller's dict were overwritten by their scores.
    """
    return {name: AvgF1Stack(n, estimator, X, y) for name, estimator in models.items()}

In [4]:
# Base learners; their predicted probabilities feed the stacking steps below.
# NOTE(review): outputs recorded later in this notebook (e.g. the ExtraTrees
# repr showing n_estimators=150) suggest the estimators that actually ran were
# configured differently from these defaults. Confirm the intended
# hyper-parameters.
xgb = XGBClassifier()
cat = CatBoostClassifier()
rf = RandomForestClassifier()
bag = BaggingClassifier()
ext = ExtraTreesClassifier()
lgb = LGBMClassifier()

# Column subset of the training features. `.loc` takes a row selector and a
# column selector separated by a comma; the previous `X.loc[:[...]]` passed the
# list as a slice bound instead of selecting columns.
X_xgb = X.loc[:, ['duration', 'meter_waiting', 'meter_waiting_fare', 'fare', 'distance',
       'additional_fare', 'pickup_time', 'drop_time', 'avg_speed', 'raw_fare',
       'fare_per_km', 'fare_per_min']]

In [5]:
# Out-of-fold probabilities from every base learner on the training data.
models = dict(xgb=xgb, cat=cat, lgb=lgb, bag=bag, rf=rf, ext=ext)
results = CVPredict(models, X, y)

In [221]:
# Preview the per-model probability columns.
results.head(n=60)

Unnamed: 0_level_0,xgb_prob,cat_prob,lgb_prob,bag_prob,rf_prob,ext_prob
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
189123628,0.97478,0.987731,0.984993,0.93,0.946667,0.92
189125358,0.966506,0.98571,0.985458,0.975,0.96,0.94
189125719,0.991668,0.992274,0.992759,0.99,1.0,1.0
189127273,0.923873,0.957184,0.979274,0.945,0.966667,0.966667
189128020,0.722288,0.603427,0.578654,0.9,0.373333,0.753333
189129552,0.981019,0.994908,0.988635,0.965,0.96,0.953333
189132829,0.986005,0.983385,0.984755,0.995,0.993333,0.993333
189135103,0.993298,0.982903,0.993905,0.96,0.966667,0.986667
189139296,0.988586,0.99407,0.990022,0.99,1.0,1.0
189138671,0.974492,0.984834,0.987686,0.955,0.993333,0.96


In [57]:
# New default-configured instances, kept separate from the base learners above.
xgb1 = XGBClassifier()
cat1 = CatBoostClassifier()
rf1 = RandomForestClassifier()
bag1 = BaggingClassifier()
ext1 = ExtraTreesClassifier()
lgb1 = LGBMClassifier()

models = {'xgb': xgb1, 'cat': cat1, 'lgb':lgb1 ,'bag': bag1,'rf':rf1,'ext':ext1}

# Score each candidate as a second-level model on the out-of-fold probabilities.
# - `results` is used because `results1` is never assigned in this notebook
#   (it only appears in a commented-out line).
# - A copy of the dict is passed so `models` keeps its estimators even if
#   EvalStack writes scores into the dict it receives.
# - The scores are bound to `stack_scores`; the previous name `eval` shadowed
#   the builtin for the rest of the session.
stack_scores = EvalStack(10, dict(models), results, y)
stack_scores

{'xgb': 0.9740132533009739,
 'cat': 0.9743078180880722,
 'lgb': 0.9739791213192437,
 'bag': 0.973795142694067,
 'rf': 0.9739096258175014,
 'ext': 0.9738945742496036}

In [147]:
# Second-level candidates: only the bagging-style ensembles.
models_1 = dict(bag=bag1, rf=rf1, ext=ext1)

In [150]:
# Out-of-fold predictions of the second-level models, fed only with the three
# boosted models' probability columns (xgb through lgb).
results_1 = CVPredict(
    models_1,
    results[["xgb_prob", "cat_prob", "lgb_prob"]],
    y,
)

In [151]:
# Preview the second-level probability columns.
results_1.head(n=50)

Unnamed: 0_level_0,bag_prob,rf_prob,ext_prob
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
189123628,1.0,0.98,0.99
189125358,1.0,0.99,0.97
189125719,1.0,1.0,1.0
189127273,0.9,0.99,0.95
189128020,0.6,0.52,0.52
189129552,1.0,0.99,1.0
189132829,1.0,1.0,1.0
189135103,1.0,1.0,0.98
189139296,0.9,0.99,0.93
189138671,1.0,1.0,1.0


In [163]:
# Weighted hard-voting ensemble; weights are listed in the same order as the
# estimator names (xgb, cat, lgb, bag, rf, ext).
vclf = VotingClassifier(
    voting='hard',
    weights=[1.5, 2, 1.5, 1, 1, 1],
    estimators=list(zip(
        ('xgb', 'cat', 'lgb', 'bag', 'rf', 'ext'),
        (xgb1, cat1, lgb1, bag1, rf1, ext1),
    )),
)

In [164]:
# Repeated-CV F1 of the voting ensemble on the out-of-fold probabilities.
AvgF1Stack(n=10, model=vclf, X=results, y=y)

0.9738931012602101

In [234]:
# Candidate meta-learners; only the logistic regression is scored in this cell.
lr, cat2, knn, rf_, svc = (
    LogisticRegression(),
    CatBoostClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    SVC(),
)
AvgF1Stack(n=10, model=lr, X=results, y=y)

0.974895894380774

In [236]:
# Fit the meta-learner on all out-of-fold probabilities and display its
# coefficients (one per probability column of `results`).
lr.fit(results, y).coef_

array([[0.41933864, 1.21740231, 0.79889399, 2.2478126 , 1.26773382,
        2.7574968 ]])

In [100]:
# Refit every base learner on the full training set, in the original order.
for base_model in (xgb, cat, lgb, rf, bag):
    base_model.fit(X, y)
# Kept as the final expression so the fitted ExtraTrees repr is displayed.
ext.fit(X, y)

0:	learn: 0.9607819	total: 10.7ms	remaining: 10.7s
200:	learn: 0.9763949	total: 1.57s	remaining: 6.23s
400:	learn: 0.9801640	total: 3.05s	remaining: 4.56s
600:	learn: 0.9828295	total: 4.54s	remaining: 3.01s
800:	learn: 0.9850756	total: 6.34s	remaining: 1.57s
999:	learn: 0.9866888	total: 7.96s	remaining: 0us


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [102]:
def CVPredictTest(models, X_test=None):
    """Collect each model's ``predict_proba`` column (index 1) for the test rows.

    No cross-validation happens here: every model is only asked to predict, so
    it is expected to have been fitted beforehand.

    Parameters
    ----------
    models : dict
        Maps a name to an estimator exposing ``predict_proba``.
    X_test : DataFrame or None, default None
        Rows to predict on. ``None`` falls back to the notebook-level ``test``
        frame, which is what this function always used before.

    Returns
    -------
    DataFrame
        One ``"<name>_prob"`` column per model, in dict order, indexed like
        the rows that were predicted.
    """
    frame = test if X_test is None else X_test

    results = pd.DataFrame(index=frame.index)

    for name, model in models.items():
        proba = model.predict_proba(frame)
        # Rows come back in the order of `frame`, so assign positionally.
        results[str(name) + "_prob"] = np.asarray(proba)[:, 1]

    return results

In [103]:
# The refitted base learners, keyed in the same order used for `results`.
models_test = dict(xgb=xgb, cat=cat, lgb=lgb, bag=bag, rf=rf, ext=ext)

results_test = CVPredictTest(models_test)

In [105]:
# Preview the test-set probability columns.
results_test.head(n=5)

Unnamed: 0_level_0,xgb_prob,cat_prob,lgb_prob,bag_prob,rf_prob,ext_prob
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
213284604,0.98655,0.98822,0.991872,1.0,0.966667,0.96
213286352,0.411319,0.122683,0.40444,0.275,0.386667,0.2
213293973,0.959732,0.961532,0.980256,0.96,0.953333,0.933333
213294622,0.99328,0.994443,0.993077,1.0,1.0,0.986667
213298687,0.991547,0.994231,0.990887,0.975,0.98,0.953333


In [107]:
# Final labels from the fitted meta-learner on the test-set probabilities.
predictions = lr.predict(X=results_test)

In [113]:
# Single-column submission frame indexed like the test set.
submission_df = pd.Series(predictions, index=test.index, name="prediction").to_frame()

In [114]:
# Quick look at the first predicted labels.
submission_df.head(n=5)

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,1
213294622,1
213298687,1


In [None]:
# Test-set probabilities come from CVPredictTest, which takes only the model
# dict. The previous call, CVPredict(models), omitted the required X and y
# arguments and would raise TypeError when this cell runs.
models = {'xgb': xgb, 'cat': cat, 'lgb':lgb ,'bag': bag,'rf':rf,'ext':ext}
results_test = CVPredictTest(models)

In [115]:
# Persist the submission; the tripid index is written as the first column.
submission_df.to_csv(path_or_buf="stack.csv")

In [116]:
# Show the first rows of the frame that was written out.
submission_df.head(n=5)

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,1
213294622,1
213298687,1
