In [167]:
import math
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score, matthews_corrcoef, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
import MachineLearning as ML
import PrepareData
import importlib

importlib.reload(ML)
importlib.reload(PrepareData)

<module 'PrepareData' from 'C:\\Users\\Adam\\source\\repos\\WeatherPrediction\\WeatherPredictionProject\\PrepareData.py'>

In [168]:
# method 'teach_models' takes the data and tech all models with it

def teach_models(X_train, y_train):
    models=[]

    X_train, y_train = SMOTETomek().fit_resample(X_train, y_train)

    models.append(ML.NeuralNetworks(X_train, y_train))
    models.append(ML.KNN(X_train, y_train))
    models.append(ML.DecisionTree(X_train, y_train))
    models.append(ML.LinearRegressionModel(X_train, y_train))
    models.append(ML.LogisticRegressionModel(X_train, y_train))
    models.append(ML.RandomForest(X_train, y_train))
    models.append(ML.SupportVectorMachines(X_train, y_train))
    models.append(ML.SupportVectorRegression(X_train, y_train))
    models.append(ML.TreeGradientBoosting(X_train, y_train))
    models.append(ML.DummyModel(X_train, y_train))

    return models

In [169]:
# method 'score_models' calculate scores to many statistics for all models separately

def score_models(models, X_test, y_test):
    results=[]
    for model in models:
        scores={}
        y_pred=model.predict(X_test)>0.5
        scores['Model']=model.__str__()
        scores['Accuracy']=accuracy_score(y_test, y_pred)
        scores['F1_score']=f1_score(y_test, y_pred)
        scores['Matthwes_correlation_coefficient_(MCC)']=matthews_corrcoef(y_test, y_pred)
        scores['Mean_squared_error_(MSE)']=mean_squared_error(y_test, y_pred)
        scores['Square_root_of_mean_squared_error_(RMSE)']=math.sqrt(mean_squared_error(y_test, y_pred))
        scores['Mean_absolute_error_(MAE)']=mean_absolute_error(y_test, y_pred)
        scores['Confusion_matrix']=confusion_matrix(y_test, y_pred)

        results.append(scores)
    return results

In [170]:
# method 'score_merged_models' calculate scores to many statistics for selected models as merged object

def score_merged_models(models, X_test, y_test):
    merged_scores={}
    results=pd.DataFrame()
    num=1
    for model in models:
        column="Model_"+str(num)
        results[column]=list(model.predict(X_test)>0.5)
        num+=1

    results=results.astype(np.int32)

    results["Sum"]=results.sum(axis=1)/len(models)>0.5

    y_pred_merged=results["Sum"].values>0.5

    merged_scores['Model']='Merged model'
    merged_scores['Accuracy']=accuracy_score(y_test, y_pred_merged)
    merged_scores['F1_score']=f1_score(y_test, y_pred_merged)
    merged_scores['Matthwes_correlation_coefficient_(MCC)']=matthews_corrcoef(y_test, y_pred_merged)
    merged_scores['Mean_squared_error_(MSE)']=mean_squared_error(y_test, y_pred_merged)
    merged_scores['Square_root_of_mean_squared_error_(RMSE)']=math.sqrt(mean_squared_error(y_test, y_pred_merged))
    merged_scores['Mean_absolute_error_(MAE)']=mean_absolute_error(y_test, y_pred_merged)
    merged_scores['Confusion_matrix']=confusion_matrix(y_test, y_pred_merged)

    return merged_scores

In [171]:
# cell with data read, split it into train and test parts and teach the models

df=PrepareData.get_data()

df=df.dropna()

y=df['nextday_rainfall']
X=df.drop('nextday_rainfall', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, stratify=y)

models=teach_models(X_train, y_train)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [172]:
# simple usage of 'score_models' method

scores=score_models(models,X_test,y_test)
scores[1]



{'Model': 'KNeighborsClassifier()',
 'Accuracy': 0.6703241895261846,
 'F1_score': 0.7129830655666523,
 'Matthwes_correlation_coefficient_(MCC)': 0.336041995188636,
 'Mean_squared_error_(MSE)': 0.32967581047381544,
 'Square_root_of_mean_squared_error_(RMSE)': 0.5741740245551129,
 'Mean_absolute_error_(MAE)': 0.32967581047381544,
 'Confusion_matrix': array([[523, 244],
        [417, 821]], dtype=int64)}

In [173]:
# simple usage of 'score_merged_models' method

scores=score_merged_models(models,X_test,y_test)
scores



{'Model': 'Merged model',
 'Accuracy': 0.712219451371571,
 'F1_score': 0.7509710832973672,
 'Matthwes_correlation_coefficient_(MCC)': 0.4194390574770282,
 'Mean_squared_error_(MSE)': 0.28778054862842894,
 'Square_root_of_mean_squared_error_(RMSE)': 0.5364518138923839,
 'Mean_absolute_error_(MAE)': 0.28778054862842894,
 'Confusion_matrix': array([[558, 209],
        [368, 870]], dtype=int64)}

In [175]:
# The below algorithm allow determining which set of models is best (needs ~10 min)

best_score=0
best_models=[]
num=0

import itertools
for L in range(2,len(models) + 1):
    for subset in itertools.combinations(models[:-1], L):
        merged_scores=score_merged_models(list(subset),X_test,y_test)
        if merged_scores['F1_score']>best_score:
            best_score=merged_scores['F1_score']
            best_models=list(subset)
        if num%100==0:
            print(num)
        num+=1

print(best_score)
print(best_models)

0
100
200
300
400
500
0.7790839075800567
[DecisionTreeClassifier(), RandomForestClassifier(max_depth=3), LGBMClassifier()]
