In [253]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 

In [254]:
#Sourced from https://www.postgresqltutorial.com/postgresql-python/connect/
def config(filename='psql_sample.ini', section='postgresql'):
    parser = ConfigParser()
    # read config file
    parser.read(filename) 
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

In [255]:
#Get the configuration file as a python dictionary
cfg = config()

#Establish the connection and create a cursor to the database
try:
    print("Here's an attempt to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Look's like it was a success")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Here's an attempt to connect to the database
Look's like it was a success


In [256]:
#SELECT QUERY
try:
    #Lets get our data 
    cursor.execute("SELECT age_group,gender,measures.status,holiday,city,daily_high_temp,daily_low_temp,rain_amount,snow_amount,parks_percentage,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,workplaces_percentage,residential_percentage,transit_stations_percentage,is_fatal,is_resolved,is_unresolved from data_mart.fact_table as fact inner join data_mart.mobility_dimension as mobility on fact.mobility_key=mobility.mobility_key inner join data_mart.weather_dimension as weather on fact.weather_key=weather.weather_key inner join data_mart.patient_dimension as patient on fact.patient_key=patient.patient_key inner join data_mart.special_measures_dimension as measures on fact.special_measures_key=measures.special_measures_key inner join data_mart.phu_location_dimension as phu on fact.phu_location_key=phu.phu_location_key inner join data_mart.reported_date_dimension as date on fact.reported_date_key=date.reported_date_key") 

    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
        
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

In [257]:
#Ensure to run this cell at the end of all your experiments to close all connections
cursor.close()
conn.close()

In [258]:
#Now,  1
result_df = pd.DataFrame(result_list, columns=["age_group","gender","status","holiday","city","daily_high","daily_low","rain_amount","snow_amount", "parks_percentage", "retail_and_recreation_percentage","grocery_and_pharmacy_percentage","workplaces_percentage","residential_percentage","transit_stations_percentage",
                                               "is_fatal", "is_resolved","is_unresolved"])

result_df.drop(columns=['daily_high'], inplace=True)
result_df.drop(columns=['daily_low'], inplace=True)
result_df.drop(columns=['rain_amount'], inplace=True)
result_df.drop(columns=['snow_amount'], inplace=True)
result_df.drop(columns=['workplaces_percentage'], inplace=True)
result_df.drop(columns=['residential_percentage'], inplace=True)
result_df.drop(columns=['transit_stations_percentage'], inplace=True)
result_df.drop(columns=['is_resolved'], inplace=True)
result_df.drop(columns=['is_unresolved'], inplace=True)

result_df.head()

Unnamed: 0,age_group,gender,status,holiday,city,parks_percentage,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,is_fatal
0,40s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
1,50s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
2,<20,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
3,30s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
4,20s,MALE,Private gathering restrictions,False,Newmarket,,-11.0,4.0,False


In [259]:
#Removing null values : 2
result_df["parks_percentage"].fillna(result_df["parks_percentage"].mean(), inplace=True)
result_df["grocery_and_pharmacy_percentage"].fillna(result_df["grocery_and_pharmacy_percentage"].mean(), inplace=True)
result_df["retail_and_recreation_percentage"].fillna(result_df["retail_and_recreation_percentage"].mean(), inplace=True)

result_df.head()

Unnamed: 0,age_group,gender,status,holiday,city,parks_percentage,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,is_fatal
0,40s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
1,50s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
2,<20,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
3,30s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
4,20s,MALE,Private gathering restrictions,False,Newmarket,16.937927,-11.0,4.0,False


In [260]:
#get counter 6
Counter(result_df['is_fatal'])

Counter({False: 83387, True: 1425})

In [261]:
#Get the labels 3
y = result_df['is_fatal']#label


In [262]:
#normalize
#Normalizing data:
transform_data = result_df[["retail_and_recreation_percentage","grocery_and_pharmacy_percentage","parks_percentage"]]
X_normalized=preprocessing.normalize(transform_data,norm='l2')
normalize_part=pd.DataFrame(X_normalized,columns=transform_data.columns)
non_numerical=result_df[["status","age_group","is_fatal","city","gender","holiday"]]
result_data=pd.concat([non_numerical,normalize_part],axis=1)
result_data.head()

Unnamed: 0,status,age_group,is_fatal,city,gender,holiday,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,parks_percentage
0,Private gathering restrictions,40s,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
1,Private gathering restrictions,50s,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
2,Private gathering restrictions,<20,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
3,Private gathering restrictions,30s,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
4,Private gathering restrictions,20s,False,Newmarket,MALE,False,-0.534274,0.194282,0.822682


In [263]:
#drop the label
result_data.drop(columns=['is_fatal'], inplace=True)

In [264]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
new_result_df = pd.get_dummies(result_data)
new_result_df.head()

Unnamed: 0,holiday,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,parks_percentage,status_Acitivies closures,status_Control,status_Lockdown,status_Other,status_Private gathering restrictions,status_Protect,...,city_Mississauga,city_Newmarket,city_Oakville,city_Ottawa,city_Toronto,city_Whitby,gender_FEMALE,gender_GENDER DIVERSE,gender_MALE,gender_UNSPECIFIED
0,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
2,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
3,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,False,-0.534274,0.194282,0.822682,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [265]:
#Undersampling of majority classes:  4
from imblearn.under_sampling import NearMiss
X = new_result_df.values#features
label = result_df["is_fatal"]#label
undersample = NearMiss(version=1,n_neighbors=3)
x_under, y_under = undersample.fit_resample(X,y)

In [266]:
X_train, X_test, y_train, y_test = train_test_split(x_under, label, test_size=0.2, shuffle=True, stratify=label)

In [267]:
#Let's see the no. of records per class in training and test set
print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({False: 1140, True: 1140}) 
Test set Counter({False: 285, True: 285}) 


In [268]:
#PART B
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score, accuracy_score
import datetime
#Gradient Boosting


#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, stratify=y_under)
a = datetime.datetime.now()
classifier = GradientBoostingClassifier(n_estimators=20, learning_rate=0.75, max_features=2, max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
b = datetime.datetime.now()
c = b - a
print('time in milliseconds')
print(c.total_seconds() * 1000)
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

accuracy = accuracy_score(predictions, y_test) * 100
recall = recall_score(predictions, y_test) * 100
precision = precision_score(predictions, y_test) * 100

print("Accuracy of Gradient Boosting: {:.2f} %".format(accuracy))
print("Precision of Gradient Boosting: {:.2f} %".format(precision))
print("Recall of Gradient Boosting: {:.2f} %".format(recall))


time in milliseconds
85.39099999999999
Confusion Matrix:
[[199  86]
 [ 99 186]]
Classification Report
              precision    recall  f1-score   support

       False       0.67      0.70      0.68       285
        True       0.68      0.65      0.67       285

    accuracy                           0.68       570
   macro avg       0.68      0.68      0.68       570
weighted avg       0.68      0.68      0.68       570

Accuracy of Gradient Boosting: 67.54 %
Precision of Gradient Boosting: 65.26 %
Recall of Gradient Boosting: 68.38 %


In [269]:
#Random Forest
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify = y_under)

#create and fit random forest
a = datetime.datetime.now()#start time
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
b = datetime.datetime.now()#end time
c = b - a
print('time in milliseconds')
print(c.total_seconds() * 1000)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_pred, y_test) * 100
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100

print("Accuracy of Random Forest: {:.2f} %".format(accuracy))
print("Precision of Random Forest: {:.2f} %".format(precision))
print("Recall of Random Forest: {:.2f} %".format(recall))


time in milliseconds
367.026
Confusion Matrix:
[[198  87]
 [ 91 194]]
Classification Report
              precision    recall  f1-score   support

       False       0.69      0.69      0.69       285
        True       0.69      0.68      0.69       285

    accuracy                           0.69       570
   macro avg       0.69      0.69      0.69       570
weighted avg       0.69      0.69      0.69       570

Accuracy of Random Forest: 68.77 %
Precision of Random Forest: 68.07 %
Recall of Random Forest: 69.04 %


In [270]:
#Decision Tree Algorithm:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)
from sklearn import tree
from matplotlib import pyplot as plt
import graphviz

a = datetime.datetime.now()
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
b = datetime.datetime.now()
c = b - a
print('time in milliseconds')
print(c.total_seconds() * 1000)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_pred,y_test) * 100  
recall = recall_score(y_pred, y_test) * 100 
precision = precision_score(y_pred, y_test) * 100

print("Accuracy of Decision Tree: {:.2f} %".format(accuracy))
print("Recall of Decision Tree: {:.2f} %".format(recall))
print("precision of Decision Tree: {:.2f} %".format(precision))



time in milliseconds
38.619
Confusion Matrix:
[[241  44]
 [104 181]]
Classification Report
              precision    recall  f1-score   support

       False       0.70      0.85      0.77       285
        True       0.80      0.64      0.71       285

    accuracy                           0.74       570
   macro avg       0.75      0.74      0.74       570
weighted avg       0.75      0.74      0.74       570

Accuracy of Decision Tree: 74.04 %
Recall of Decision Tree: 80.44 %
precision of Decision Tree: 63.51 %


In [271]:
dot_data = tree.export_graphviz(dt, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("DecisionTree") 

'DecisionTree.pdf'

In [272]:
#PART C
from sklearn.svm import OneClassSVM

clf = OneClassSVM(gamma='auto').fit(X_train)
y_pred = clf.predict(X_test)
#array([-1,  1,  1,  1, -1])
Score = clf.score_samples(X_test)
print(Score)
#array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...])


[961.63902965 960.81609955 925.04866741 928.9750466  936.30846295
 945.78853579 957.91194736 931.67945984 943.89093147 938.0663991
 947.30646973 945.78853579 943.91481574 921.55293688 937.21757125
 954.49418155 921.82693813 963.47928844 949.75483449 949.3433719
 948.62804666 950.80945741 910.4487685  944.85476816 957.68467038
 958.09848273 927.69573636 952.02520253 935.67649959 958.39645791
 925.80305259 936.66774331 940.55855826 954.68113988 920.18103276
 931.27459681 936.37575562 917.36863614 956.43471959 923.71913497
 949.15300214 947.91963584 950.84244171 956.59865438 896.32940112
 950.48762168 955.01588981 925.04866741 951.15037883 917.20281959
 950.29503734 946.43774934 937.0276063  956.3801457  922.0282053
 946.59534168 929.02863822 942.28126791 927.54445795 953.05149282
 951.46660631 950.84244171 911.23759472 962.83239237 964.9193382
 931.27459681 933.23532831 948.37842828 958.3212183  919.49553923
 945.02350391 893.857007   943.28405188 894.94871281 930.57739867
 944.44272401 