In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from scipy.ndimage.interpolation import shift
from math import sin, cos, sqrt, atan2, radians 
from sklearn import tree, svm, linear_model, ensemble, neighbors, naive_bayes 
import dateutil
import os
from joblib import dump, load
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.metrics import make_scorer


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/bigdatalab_cpu_202101/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
# Load the station registry and keep the unique ids of the rows whose
# 'city' column is 'San Francisco'.
stations_path = '../../Data/station.csv'
stations_df = pd.read_csv(stations_path)

is_san_francisco = stations_df['city'] == 'San Francisco'
San_Fancisco_stations = stations_df.loc[is_san_francisco, 'id'].unique()

In [3]:
# Dataset configuration. These values are interpolated into the dataset and
# result file paths used by the cells below.
interval = 15
window_width = 3
time_zone = '17-18'

# Filename suffix for the selected time zone; empty when none is selected.
if time_zone is None:
    zone_str = ""
else:
    zone_str = f"_{time_zone}"

In [4]:
# Candidate estimators to tune, keyed by the name that is later used to look
# up their parameter grid and to build result paths.
# All of them are created with the same fixed random_state (42).
classifiers = dict(
    DecisionTree=tree.DecisionTreeClassifier(random_state=42),
    RandomForest=ensemble.RandomForestClassifier(max_features='sqrt', random_state=42),
    XGBoost=xgb.XGBClassifier(random_state=42),
)

In [5]:
# Hyper-parameter grids explored by the grid search, one grid per classifier.

decision_tree_parameters = {
    'max_depth': list(range(2, 6)),
    'min_samples_split': list(range(2, 7)),
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced']
}

random_forest_parameters = {
    'n_estimators': [10, 100, 1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 6)),
    # scikit-learn only accepts an integer min_samples_split >= 2. The grid
    # previously started at 0, so every candidate using 0 or 1 failed to fit;
    # only the valid values (2, 3, 4) are kept.
    'min_samples_split': list(range(2, 5)),
    'class_weight': [None, 'balanced']
}

xgboost_parameters = {
    'n_estimators': [10, 100, 1000],
    'max_depth': list(range(2, 6)),
}

# Lookup from classifier name to its grid; the keys are the classifier names
# 'DecisionTree', 'RandomForest' and 'XGBoost'.
classifiers_parameters = {
    'DecisionTree': decision_tree_parameters,
    'RandomForest': random_forest_parameters,
    'XGBoost': xgboost_parameters
}

In [6]:
# Recall, precision and F1 need to know which label is the positive class
# ("QP"), so each metric is wrapped in an explicit scorer. The scorers do not
# depend on the station or the classifier, so they are built once up front.
# The keys double as directory names in the result paths.
scorers = {
    'recall': make_scorer(recall_score, pos_label="QP"),
    'precision': make_scorer(precision_score, pos_label="QP"),
    'f1_score': make_scorer(f1_score, pos_label="QP"),
}

# For every station, tune every classifier once per scoring metric with a
# 3-fold grid search and persist the best estimator to disk.
for reference_station in San_Fancisco_stations:
    train_df = pd.read_csv(f"../../Datasets/{interval}_{window_width}/station{reference_station}{zone_str}_train.csv")
    print(f"Station: {reference_station}")
    y = train_df['status']
    X = train_df.drop(columns=['status'])

    for scoring, scorer in scorers.items():
        for classifier, clf in classifiers.items():
            parameters = classifiers_parameters[classifier]

            grid_search = GridSearchCV(clf, parameters, scoring=scorer, cv=3, n_jobs=-1)
            search_result = grid_search.fit(X, y)

            # Best hyper-parameter combination and the estimator fitted with it.
            params = search_result.best_params_
            tuned_clf = search_result.best_estimator_

            # Save the model after fine tuning. exist_ok=True creates the
            # directory only when it is missing, without a separate exists() check.
            model_dir = f'../../Results/Other_classifiers/Tuned_models/{scoring}/{classifier}/{interval}_{window_width}'
            os.makedirs(model_dir, exist_ok=True)
            dump(tuned_clf, f'{model_dir}/station{reference_station}{zone_str}_tuned_model.joblib')

Station: 41




Station: 42
Station: 45
Station: 46
Station: 47
Station: 48
Station: 49
Station: 50
Station: 51
Station: 39
Station: 54
Station: 55
Station: 56




Station: 57
Station: 58
Station: 59
Station: 60
Station: 61
Station: 62




Station: 63




Station: 64
Station: 65
Station: 66
Station: 67
Station: 68
Station: 69
Station: 70
Station: 71
Station: 72
Station: 73
Station: 74
Station: 75




Station: 76
Station: 77
Station: 82


In [7]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    This follows the zero_division=0 convention already used for the
    per-station recall, precision and F1 scores in this cell.
    """
    return numerator / denominator if denominator else 0.0


# Maps each scoring name to a DataFrame with one row per classifier holding
# the metrics computed from the confusion-matrix counts summed over all stations.
results_table = {}

for scoring in ["recall", "precision", "f1_score"]:
    result_matrix = []
    for classifier in classifiers.keys():
        # Confusion-matrix counts accumulated over every station.
        tot_fp = 0
        tot_tp = 0
        tot_fn = 0
        tot_tn = 0

        report_path = f'../../Results/Other_classifiers/Tuned_models/{scoring}/{classifier}_{interval}_{window_width}{zone_str}_results.txt'
        # The context manager closes the report even if loading a model or a
        # dataset raises part-way through the station loop.
        with open(report_path, "w") as file:
            file.write(f'TESING RESULTS FOR {classifier} CLASSIFIER:\n\n')

            for station_id in San_Fancisco_stations:
                model = load(f'../../Results/Other_classifiers/Tuned_models/{scoring}/{classifier}/{interval}_{window_width}/station{station_id}{zone_str}_tuned_model.joblib')

                test_df = pd.read_csv(f'../../Datasets/{interval}_{window_width}/station{station_id}{zone_str}_test.csv')
                # The first 3 rows of each test set are skipped.
                # NOTE(review): the reason is not visible here; presumably tied to
                # window_width (= 3) -- confirm before changing either value.
                eval_df = test_df[3:]
                y_test = eval_df['status']
                X_test = eval_df.drop(columns=['status'])

                prediction = model.predict(X_test)

                # Fixed label order makes ravel() yield tn, fp, fn, tp with 'QP' as positive.
                cm = confusion_matrix(y_test, prediction, labels=['N', 'QP'])

                str_ = f'{classifier} FOR STATION {station_id}' + '\n'
                str_ += f'Confusion matrix:' + '\n'
                str_ += str(cm) + '\n'

                tn, fp, fn, tp = cm.ravel()
                str_ += f'tp={tp}, fn={fn}, fp={fp}, tn={tn}' + '\n'

                test_accuracy = _safe_ratio(tn + tp, tn + fp + fn + tp)
                test_recall = recall_score(y_test, prediction, pos_label='QP', zero_division=0)
                test_precision = precision_score(y_test, prediction, pos_label='QP', zero_division=0)
                test_f1_score = f1_score(y_test, prediction, pos_label='QP', zero_division=0)

                str_ += f'accuracy={test_accuracy}; recall={test_recall}; precision={test_precision}; f1_score= {test_f1_score}' + '\n\n'
                str_ += "-"*10 + '\n\n'

                tot_fp += fp
                tot_tp += tp
                tot_fn += fn
                tot_tn += tn

                file.write(str_)

            # The "avg" metrics are computed from the summed counts, not by
            # averaging the per-station scores. Empty denominators yield 0.0.
            avg_accuracy = _safe_ratio(tot_tn + tot_tp, tot_tn + tot_fp + tot_fn + tot_tp)
            avg_recall = _safe_ratio(tot_tp, tot_tp + tot_fn)
            avg_precision = _safe_ratio(tot_tp, tot_tp + tot_fp)
            avg_f1_score = _safe_ratio(2*avg_precision*avg_recall, avg_precision + avg_recall)

            result_matrix.append([avg_accuracy, avg_recall, avg_precision, avg_f1_score])
            avg_str = f"AVERAGE VALUES FOR {classifier}: accuracy={avg_accuracy}; recall={avg_recall}; precision={avg_precision}; f1_score={avg_f1_score}"

            file.write(f"Overall results:\nTP:{tot_tp},TN:{tot_tn},FP:{tot_fp},FN:{tot_fn}\n")
            file.write(avg_str)

    result_table = pd.DataFrame(result_matrix, columns=['avg_accuracy', 'avg_recall', 'avg_precision', 'avg_f1_score'], index=pd.Index(classifiers.keys()))
    results_table[scoring] = result_table
    result_table.to_csv(f"../../Results/Other_classifiers/Tuned_models/{scoring}/Overall_results_{interval}_{window_width}{zone_str}.csv")

# Total number of evaluated test samples for the last classifier/scoring pair processed.
print(tot_fp+tot_tp+tot_fn+tot_tn)

11795
