In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,TIMESTAMP,M_SESSION_UID,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_AIR_TEMPERATURE,M_FORECAST_ACCURACY,M_TRACK_TEMPERATURE,M_WEATHER
0,2022-01-21 00:16:50,0.162116,0.0,0.8125,1.0,0.666667,1.0,0.0,0.666667,0.0,0.727273,0
1,2022-01-21 00:16:50,0.162116,0.0,0.8125,1.0,0.666667,1.0,0.0,0.666667,0.0,0.727273,0
2,2022-01-21 00:16:50,0.162116,0.0,0.8125,1.0,0.666667,1.0,0.01087,0.666667,0.0,0.727273,0
3,2022-01-21 00:16:50,0.162116,0.166667,0.75,0.5,0.583333,0.5,0.043478,0.666667,0.0,0.727273,0
4,2022-01-21 00:16:50,0.162116,0.166667,0.75,1.0,0.583333,1.0,0.043478,0.666667,0.0,0.727273,0


In [4]:
df = df.drop(['TIMESTAMP', 'M_SESSION_UID'], 1)
df.head()

  df = df.drop(['TIMESTAMP', 'M_SESSION_UID'], 1)


Unnamed: 0,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_AIR_TEMPERATURE,M_FORECAST_ACCURACY,M_TRACK_TEMPERATURE,M_WEATHER
0,0.0,0.8125,1.0,0.666667,1.0,0.0,0.666667,0.0,0.727273,0
1,0.0,0.8125,1.0,0.666667,1.0,0.0,0.666667,0.0,0.727273,0
2,0.0,0.8125,1.0,0.666667,1.0,0.01087,0.666667,0.0,0.727273,0
3,0.166667,0.75,0.5,0.583333,0.5,0.043478,0.666667,0.0,0.727273,0
4,0.166667,0.75,1.0,0.583333,1.0,0.043478,0.666667,0.0,0.727273,0


In [5]:
y = LabelEncoder().fit_transform(df.iloc[:, -1])
col = [3,5]
for i in col:
    df.iloc[:,[i]] = df.iloc[:,[i]].astype('int64')
X = df.iloc[:, :-1]

In [6]:
rf = OneVsRestClassifier(estimator = GaussianNB())

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=20)

n_scores = cross_val_score(rf, X, y, scoring='accuracy', cv=cv, error_score='raise')

print(n_scores)
print('Baseline scores \n mean accuracy: %.3f with a %.3f standard deviation in scores ' % (np.mean(n_scores), np.std(n_scores)))

[0.72075232 0.72165236 0.72129545 0.72079454 0.7209342  0.72087213
 0.72113594 0.72096524 0.72107387 0.72104283 0.72126441 0.72165236
 0.72109371 0.72099628 0.72081006 0.7211825  0.72079454 0.72099628
 0.72084109 0.72088765 0.72092301 0.721342   0.72078335 0.72146182
 0.72116698 0.72090317 0.72046865 0.72085661 0.72113594 0.72147734]
Baseline scores 
 mean accuracy: 0.721 with a 0.000 standard deviation in scores 


In [7]:
def create_stacking_models():
    base_models = list()
    base_models.append(('KNNC', KNeighborsClassifier(n_neighbors = len(np.unique(y))
                                                     , weights = 'distance')
                       ))
    base_models.append(('SVC', SVC(kernel = 'linear'
                                   , class_weight = 'balanced'
                                  , break_ties = True)
                       ))    
    base_models.append(('GNB', GaussianNB()))
    base_models.append(('RF', RandomForestClassifier(n_estimators= 50, 
                                                   oob_score = True, 
                                                   class_weight = "balanced", 
                                                   random_state = 20, 
                                                   ccp_alpha = 0.1)
                       ))

    
    meta_model = LogisticRegression()
    final_model = StackingClassifier(estimators = base_models, ##Base estimators which will be stacked together
                                     final_estimator = meta_model,
                                     cv = 3
                                    )
    return final_model

def models_all():
    all_models = dict()
    all_models['KNNC']= KNeighborsClassifier(n_neighbors = len(np.unique(y))
                                             , weights = 'distance')
    all_models['SVC']= SVC(kernel = 'linear'
                           , class_weight = 'balanced'
                          , break_ties = True
                          )
    all_models['RF']= RandomForestClassifier(n_estimators= 50, 
                                                   oob_score = True, 
                                                   class_weight = "balanced", 
                                                   random_state = 20, 
                                                   ccp_alpha = 0.15)
    all_models['GNB'] = GaussianNB()
    all_models['Stacking'] = create_stacking_models()
    return all_models

def evaluate_model(model):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, error_score='raise')
    return scores

In [None]:
model_results = list()
models = models_all()
names = list()
 
for name, model in models.items():
    scores = evaluate_model(model)
    model_results.append(scores)
    names.append(name)

    print('>%s %.3f (%.3f) \n' % (name, np.mean(scores), np.std(scores)))

In [None]:
plt.figure(figsize=(15,5))
fig = sns.boxplot(x=names, y=model_results, showmeans=True)
# plt.xlabel(list(names))
# plt.ylabel(list(model_results))
plt.title("Model Comparisions")
plt.show(fig)