In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score, matthews_corrcoef, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
import MachineLearning as ML
import PrepareData
import importlib

importlib.reload(ML)
importlib.reload(PrepareData)

<module 'PrepareData' from 'C:\\Users\\Adam\\source\\repos\\WeatherPrediction\\WeatherPredictionProject\\PrepareData.py'>

In [2]:
# method 'teach_models' takes the data and tech all models with it

def teach_models(X_train, y_train):
    models=[]

    models.append(ML.NeuralNetworks(X_train, y_train))
    models.append(ML.LinearRegressionModel(X_train, y_train))
    models.append(ML.SupportVectorRegression(X_train, y_train))

    return models

In [3]:
# method 'score_models' calculate scores to many statistics for all models separately

def score_models(models, X_test, y_test):
    results=[]
    for model in models:
        scores={}
        y_pred=model.predict(X_test)
        scores['Model']=model.__str__()
        scores['Mean_squared_error_(MSE)']=mean_squared_error(y_test, y_pred)
        scores['Square_root_of_mean_squared_error_(RMSE)']=math.sqrt(mean_squared_error(y_test, y_pred))
        scores['Mean_absolute_error_(MAE)']=mean_absolute_error(y_test, y_pred)

        results.append(scores)
    return results

In [4]:
# method 'score_merged_models' calculate scores to many statistics for selected models as merged object

def score_merged_models(models, X_test, y_test):
    merged_scores={}
    results=pd.DataFrame()
    num=1
    for model in models:
        column="Model_"+str(num)
        results[column]=list(model.predict(X_test))
        num+=1

    results=results.astype(np.int32)

    results["Sum"]=results.sum(axis=1)/len(models)

    y_pred_merged=results["Sum"].values

    merged_scores['Model']='Merged model'
    merged_scores['Mean_squared_error_(MSE)']=mean_squared_error(y_test, y_pred_merged)
    merged_scores['Square_root_of_mean_squared_error_(RMSE)']=math.sqrt(mean_squared_error(y_test, y_pred_merged))
    merged_scores['Mean_absolute_error_(MAE)']=mean_absolute_error(y_test, y_pred_merged)

    return merged_scores

In [5]:
# cell with data read, split it into train and test parts and teach the models

df = PrepareData.get_temp_data()
df = df.dropna()

train = df.sample(frac=0.8)
test = df.drop(train.index)

y_train = train['temp']
X_train = train.drop('temp', axis=1)

y_test = train['temp']
X_test = train.drop('temp', axis=1)

models = teach_models(X_train, y_train)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [9]:
# simple usage of 'score_models' method

scores=score_models(models,X_test,y_test)
scores[2]



{'Model': 'SVR()',
 'Mean_squared_error_(MSE)': 4.823904313654315,
 'Square_root_of_mean_squared_error_(RMSE)': 2.196338843087358,
 'Mean_absolute_error_(MAE)': 1.624521643355879}

In [7]:
# simple usage of 'score_merged_models' method

scores=score_merged_models(models,X_test,y_test)
scores



{'Model': 'Merged model',
 'Mean_squared_error_(MSE)': 6.176179140086094,
 'Square_root_of_mean_squared_error_(RMSE)': 2.4851919724814207,
 'Mean_absolute_error_(MAE)': 1.864037135003371}

In [12]:
# The below algorithm allow determining which set of models is best (needs ~10 min)

best_score=1000
best_models=[]
num=0

import itertools
for L in range(1,len(models) + 1):
    for subset in itertools.combinations(models, L):
        merged_scores=score_merged_models(list(subset),X_test,y_test)
        if merged_scores['Mean_squared_error_(MSE)']<best_score:
            best_score=merged_scores['Mean_squared_error_(MSE)']
            best_models=list(subset)
        if num%100==0:
            print(num)
        num+=1

print(best_score)
print(best_models)

0
5.162022716664074
[LinearRegression(), SVR()]
