In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
# Load the pre-cleaned dataset from the notebook's working directory.
# NOTE(review): the preview printed further down shows an 'Unnamed: 0' column,
# which suggests the CSV was written together with its index. Reading with
# index_col=0 would remove it, but that would also shift the positional column
# indices used later in the notebook — confirm before changing.
csv_path = 'cleaned_data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Remove the two columns that are not carried forward into the feature matrix.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally triggers a FutureWarning on pandas 1.x and is rejected by
# pandas 2.x, where every argument after `labels` is keyword-only.
df = df.drop(columns=['TIMESTAMP', 'M_AI_DIFFICULTY'])
df.head()

  df = df.drop(['TIMESTAMP', 'M_AI_DIFFICULTY'], 1)


Unnamed: 0,M_AIR_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_WEATHER
0,0.666667,1.0,0.666667,1.0,0.0,0
1,0.666667,1.0,0.666667,1.0,0.0,0
2,0.666667,1.0,0.666667,1.0,0.01087,0
3,0.666667,0.5,0.583333,0.5,0.043478,0
4,0.666667,1.0,0.583333,1.0,0.043478,0


In [4]:
# Target: label-encode the last column of df into integer class ids.
y = LabelEncoder().fit_transform(df.iloc[:, -1])
# Positional indices of the columns that get cast to int64 below.
# NOTE(review): according to the df.head() preview these positions are
# M_AIR_TEMPERATURE and M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE, whose
# previewed values are fractional (e.g. 0.666667, 0.583333). An int64 cast
# truncates such values toward zero — confirm this is really intended.
col = [1,3]
for i in col:
    # Cast one column at a time and write the result back into df in place.
    df.iloc[:,[i]] = df.iloc[:,[i]].astype('int64')
# Features: every column except the last one (which was used for y above).
# NOTE(review): this still includes the 'Unnamed: 0' column shown in the
# preview — presumably a saved row index; verify it should be a model feature.
X = df.iloc[:, :-1]

In [5]:
# Baseline model: Gaussian naive Bayes inside a one-vs-rest wrapper.
# (The variable is named `rf`, but no random forest is involved in this cell.)
rf = OneVsRestClassifier(GaussianNB())

# Repeated stratified CV: 10 folds x 3 repeats, i.e. 30 accuracy scores.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=20)
n_scores = cross_val_score(
    rf,
    X,
    y,
    cv=cv,
    scoring='accuracy',
    error_score='raise',  # surface fitting errors instead of recording NaN
)

# Show the raw per-fold scores first, then their mean and spread.
baseline_mean = np.mean(n_scores)
baseline_std = np.std(n_scores)
print(n_scores)
print('Baseline scores \n mean accuracy: %.3f with a %.3f standard deviation in scores ' % (baseline_mean, baseline_std))

[0.79340792 0.79334585 0.79362518 0.79216325 0.79464618 0.79258225
 0.79492551 0.79146493 0.79124767 0.79332713 0.79331482 0.79376484
 0.79222855 0.79408752 0.79241155 0.78980447 0.79470826 0.79244258
 0.79224084 0.79236499 0.79272513 0.79395106 0.79345448 0.79196151
 0.79396338 0.79182185 0.79236499 0.79219429 0.79301676 0.79366853]
Baseline scores 
 mean accuracy: 0.793 with a 0.001 standard deviation in scores 


In [6]:
def _base_estimators(rf_ccp_alpha):
    """Build fresh, unfitted instances of the four base classifiers.

    This is the single place where the base estimator hyper-parameters live,
    so the stacked ensemble and the standalone comparison cannot drift apart
    by accident.

    Args:
        rf_ccp_alpha: cost-complexity pruning strength passed to the random
            forest as ``ccp_alpha``.

    Returns:
        dict mapping the short names 'KNNC', 'SVC', 'GNB' and 'RF' to
        estimator instances.
    """
    return {
        # k equals the number of distinct labels in the notebook-level `y`.
        'KNNC': KNeighborsClassifier(n_neighbors=len(np.unique(y)),
                                     weights='distance'),
        'SVC': SVC(kernel='linear',
                   class_weight='balanced',
                   break_ties=True),
        'GNB': GaussianNB(),
        'RF': RandomForestClassifier(n_estimators=200,
                                     oob_score=True,
                                     class_weight="balanced",
                                     random_state=20,
                                     ccp_alpha=rf_ccp_alpha),
    }


def create_stacking_models(rf_ccp_alpha=0.1):
    """Assemble the stacking ensemble.

    The base estimators are stacked in the order KNNC, SVC, GNB, RF and
    combined by a default ``LogisticRegression`` meta-model; the stacker uses
    5-fold internal cross-validation.

    Args:
        rf_ccp_alpha: ``ccp_alpha`` of the random forest inside the stack.
            Defaults to 0.1, the value this function has always used.

    Returns:
        An unfitted ``StackingClassifier``.
    """
    estimators = _base_estimators(rf_ccp_alpha)
    # Keep the stacking order stable: it fixes the meta-feature column order.
    base_models = [(name, estimators[name])
                   for name in ('KNNC', 'SVC', 'GNB', 'RF')]

    meta_model = LogisticRegression()
    final_model = StackingClassifier(estimators=base_models,
                                     final_estimator=meta_model,
                                     cv=5)
    return final_model


def models_all(rf_ccp_alpha=0.15):
    """Return every model to compare, keyed by display name.

    The dict holds the four base classifiers on their own (in the order KNNC,
    SVC, RF, GNB) followed by the stacking ensemble under 'Stacking'.

    Args:
        rf_ccp_alpha: ``ccp_alpha`` of the standalone random forest. Defaults
            to 0.15, the value this function has always used.

    Returns:
        dict of name -> unfitted estimator, in insertion order.
    """
    # NOTE(review): the standalone forest defaults to ccp_alpha=0.15 while the
    # forest inside the stack defaults to 0.1, so the 'RF' entry is not the
    # same model that the stack uses. Both defaults are preserved here; pass
    # the same value to both if a like-for-like comparison is wanted.
    estimators = _base_estimators(rf_ccp_alpha)
    all_models = {name: estimators[name]
                  for name in ('KNNC', 'SVC', 'RF', 'GNB')}
    all_models['Stacking'] = create_stacking_models()
    return all_models

def evaluate_model(model, features=None, labels=None,
                   n_splits=10, n_repeats=3, random_state=42):
    """Score a classifier with repeated stratified k-fold cross-validation.

    Args:
        model: scikit-learn compatible classifier to evaluate.
        features: feature matrix. When omitted, the notebook-level ``X`` is
            used, which is what the one-argument call has always done.
        labels: target vector. When omitted, the notebook-level ``y`` is used.
        n_splits: number of folds per repeat.
        n_repeats: number of times the k-fold split is repeated.
        random_state: seed for the fold shuffling, so runs are reproducible.

    Returns:
        Array of accuracy scores, one per fold per repeat.

    Raises:
        Any exception raised while fitting or scoring is propagated, because
        ``error_score='raise'`` is passed to ``cross_val_score``.
    """
    # Fall back to the notebook globals only when the caller gave no data, so
    # existing `evaluate_model(model)` calls keep working unchanged.
    if features is None:
        features = X
    if labels is None:
        labels = y

    cv = RepeatedStratifiedKFold(n_splits=n_splits,
                                 n_repeats=n_repeats,
                                 random_state=random_state)
    scores = cross_val_score(model, features, labels,
                             scoring='accuracy', cv=cv, error_score='raise')
    return scores

In [None]:
# Cross-validate every candidate model and collect the per-fold scores.
# `names` and `model_results` are parallel lists: entry i of each belongs to
# the same model.
models = models_all()
names, model_results = [], []

for name, model in models.items():
    scores = evaluate_model(model)
    names.append(name)
    model_results.append(scores)

    # Report mean accuracy and its standard deviation as soon as a model is done.
    mean_acc, std_acc = np.mean(scores), np.std(scores)
    print('>%s %.3f (%.3f) \n' % (name, mean_acc, std_acc))

In [None]:
# Box plot of the cross-validation accuracy of every evaluated model.
# seaborn needs either long-form x/y vectors of equal length or a wide-form
# table; `names` (one label per model) and `model_results` (one score array
# per model) are neither, so build a wide-form frame with one column per
# model and one row per CV score.
scores_by_model = pd.DataFrame(dict(zip(names, model_results)))

fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=scores_by_model, showmeans=True, ax=ax)
ax.set_xlabel("Model")
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparisions")
# plt.show() takes no positional figure/axes argument.
plt.show()