In [351]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 

In [352]:
#Sourced from https://www.postgresqltutorial.com/postgresql-python/connect/
def config(filename='psql_sample.ini', section='postgresql'):
    parser = ConfigParser()
    # read config file
    parser.read(filename) 
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

In [353]:
#Get the configuration file as a python dictionary
cfg = config()

#Establish the connection and create a cursor to the database
try:
    print("Here's an attempt to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Look's like it was a success")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Here's an attempt to connect to the database
Look's like it was a success


In [256]:
#SELECT QUERY
try:
    #Lets get our data 
    cursor.execute("SELECT age_group,gender,measures.status,holiday,city,daily_high_temp,daily_low_temp,rain_amount,snow_amount,parks_percentage,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,workplaces_percentage,residential_percentage,transit_stations_percentage,is_fatal,is_resolved,is_unresolved from data_mart.fact_table as fact inner join data_mart.mobility_dimension as mobility on fact.mobility_key=mobility.mobility_key inner join data_mart.weather_dimension as weather on fact.weather_key=weather.weather_key inner join data_mart.patient_dimension as patient on fact.patient_key=patient.patient_key inner join data_mart.special_measures_dimension as measures on fact.special_measures_key=measures.special_measures_key inner join data_mart.phu_location_dimension as phu on fact.phu_location_key=phu.phu_location_key inner join data_mart.reported_date_dimension as date on fact.reported_date_key=date.reported_date_key") 

    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
        
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

In [257]:
#Ensure to run this cell at the end of all your experiments to close all connections
cursor.close()
conn.close()

In [315]:
#Now,  1
result_df = pd.DataFrame(result_list, columns=["age_group","gender","status","holiday","city","daily_high","daily_low","rain_amount","snow_amount", "parks_percentage", "retail_and_recreation_percentage","grocery_and_pharmacy_percentage","workplaces_percentage","residential_percentage","transit_stations_percentage",
                                               "is_fatal", "is_resolved","is_unresolved"])

result_df.drop(columns=['daily_high'], inplace=True)
result_df.drop(columns=['daily_low'], inplace=True)
result_df.drop(columns=['rain_amount'], inplace=True)
result_df.drop(columns=['snow_amount'], inplace=True)
result_df.drop(columns=['workplaces_percentage'], inplace=True)
result_df.drop(columns=['residential_percentage'], inplace=True)
result_df.drop(columns=['transit_stations_percentage'], inplace=True)
result_df.drop(columns=['is_resolved'], inplace=True)
result_df.drop(columns=['is_unresolved'], inplace=True)

result_df.head()

Unnamed: 0,age_group,gender,status,holiday,city,parks_percentage,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,is_fatal
0,40s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
1,50s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
2,<20,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
3,30s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
4,20s,MALE,Private gathering restrictions,False,Newmarket,,-11.0,4.0,False


In [323]:
#Removing null values : 2
result_df["parks_percentage"].fillna(result_df["parks_percentage"].mean(), inplace=True)
result_df["grocery_and_pharmacy_percentage"].fillna(result_df["grocery_and_pharmacy_percentage"].mean(), inplace=True)
result_df["retail_and_recreation_percentage"].fillna(result_df["retail_and_recreation_percentage"].mean(), inplace=True)

result_df.head()

Unnamed: 0,age_group,gender,status,holiday,city,parks_percentage,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,is_fatal
0,40s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
1,50s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
2,<20,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
3,30s,MALE,Private gathering restrictions,False,Mississauga,71.0,-15.0,-1.0,False
4,20s,MALE,Private gathering restrictions,False,Newmarket,16.937927,-11.0,4.0,False


In [325]:
#get counter 6
Counter(result_df['is_fatal'])

Counter({False: 83387, True: 1425})

In [327]:
#Get the labels 3
y_under = result_df["is_fatal"]#label


In [329]:
#normalize
#Normalizing data:
transform_data = result_df[["retail_and_recreation_percentage","grocery_and_pharmacy_percentage","parks_percentage"]]
X_normalized=preprocessing.normalize(transform_data,norm='l2')
normalize_part=pd.DataFrame(X_normalized,columns=transform_data.columns)
non_numerical=result_df[["status","age_group","is_fatal","city","gender","holiday"]]
result_data=pd.concat([non_numerical,normalize_part],axis=1)
result_data.head()

Unnamed: 0,status,age_group,is_fatal,city,gender,holiday,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,parks_percentage
0,Private gathering restrictions,40s,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
1,Private gathering restrictions,50s,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
2,Private gathering restrictions,<20,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
3,Private gathering restrictions,30s,False,Mississauga,MALE,False,-0.206685,-0.013779,0.97831
4,Private gathering restrictions,20s,False,Newmarket,MALE,False,-0.534274,0.194282,0.822682


In [331]:
#drop the label
result_data.drop(columns=['is_fatal'], inplace=True)

In [333]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
new_result_df = pd.get_dummies(result_data)
new_result_df.head()

Unnamed: 0,holiday,retail_and_recreation_percentage,grocery_and_pharmacy_percentage,parks_percentage,status_Acitivies closures,status_Control,status_Lockdown,status_Other,status_Private gathering restrictions,status_Protect,...,city_Mississauga,city_Newmarket,city_Oakville,city_Ottawa,city_Toronto,city_Whitby,gender_FEMALE,gender_GENDER DIVERSE,gender_MALE,gender_UNSPECIFIED
0,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
2,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
3,False,-0.206685,-0.013779,0.97831,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,False,-0.534274,0.194282,0.822682,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [335]:
#Undersampling of majority classes:  4
from imblearn.under_sampling import NearMiss
X = new_result_df.values#features
undersample = NearMiss(version=1,n_neighbors=3)
x_under, y_under = undersample.fit_resample(X,label)

In [349]:
X_train, X_test, y_train, y_test = train_test_split(x_under, y_under, test_size=0.2, shuffle=True, stratify=y_under)

In [350]:
#Let's see the no. of records per class in training and test set
print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({True: 1140, False: 1140}) 
Test set Counter({True: 285, False: 285}) 


In [339]:
#PART B
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score, accuracy_score
import datetime
#Gradient Boosting


#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, stratify=y_under)
a = datetime.datetime.now()
classifier = GradientBoostingClassifier(n_estimators=20, learning_rate=0.75, max_features=2, max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
b = datetime.datetime.now()
c = b - a
print('time in milliseconds')
print(c.total_seconds() * 1000)
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

accuracy = accuracy_score(predictions, y_test) * 100
recall = recall_score(predictions, y_test) * 100
precision = precision_score(predictions, y_test) * 100

print("Accuracy of Gradient Boosting: {:.2f} %".format(accuracy))
print("Precision of Gradient Boosting: {:.2f} %".format(precision))
print("Recall of Gradient Boosting: {:.2f} %".format(recall))


time in milliseconds
57.802
Confusion Matrix:
[[227  58]
 [112 173]]
Classification Report
              precision    recall  f1-score   support

       False       0.67      0.80      0.73       285
        True       0.75      0.61      0.67       285

    accuracy                           0.70       570
   macro avg       0.71      0.70      0.70       570
weighted avg       0.71      0.70      0.70       570

Accuracy of Gradient Boosting: 70.18 %
Precision of Gradient Boosting: 60.70 %
Recall of Gradient Boosting: 74.89 %


In [340]:
#Random Forest
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify = y_under)

#create and fit random forest
a = datetime.datetime.now()#start time
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
b = datetime.datetime.now()#end time
c = b - a
print('time in milliseconds')
print(c.total_seconds() * 1000)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_pred, y_test) * 100
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100

print("Accuracy of Random Forest: {:.2f} %".format(accuracy))
print("Precision of Random Forest: {:.2f} %".format(precision))
print("Recall of Random Forest: {:.2f} %".format(recall))


time in milliseconds
440.75399999999996
Confusion Matrix:
[[237  48]
 [118 167]]
Classification Report
              precision    recall  f1-score   support

       False       0.67      0.83      0.74       285
        True       0.78      0.59      0.67       285

    accuracy                           0.71       570
   macro avg       0.72      0.71      0.70       570
weighted avg       0.72      0.71      0.70       570

Accuracy of Random Forest: 70.88 %
Precision of Random Forest: 58.60 %
Recall of Random Forest: 77.67 %


In [341]:
#Decision Tree Algorithm:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)
from sklearn import tree
from matplotlib import pyplot as plt
import graphviz

a = datetime.datetime.now()
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
b = datetime.datetime.now()
c = b - a
print('time in milliseconds')
print(c.total_seconds() * 1000)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_pred,y_test) * 100  
recall = recall_score(y_pred, y_test) * 100 
precision = precision_score(y_pred, y_test) * 100

print("Accuracy of Decision Tree: {:.2f} %".format(accuracy))
print("Recall of Decision Tree: {:.2f} %".format(recall))
print("precision of Decision Tree: {:.2f} %".format(precision))



time in milliseconds
44.699000000000005
Confusion Matrix:
[[255  30]
 [105 180]]
Classification Report
              precision    recall  f1-score   support

       False       0.71      0.89      0.79       285
        True       0.86      0.63      0.73       285

    accuracy                           0.76       570
   macro avg       0.78      0.76      0.76       570
weighted avg       0.78      0.76      0.76       570

Accuracy of Decision Tree: 76.32 %
Recall of Decision Tree: 85.71 %
precision of Decision Tree: 63.16 %


In [342]:
dot_data = tree.export_graphviz(dt, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("DecisionTree") 

'DecisionTree.pdf'

In [343]:
#PART C
from sklearn.svm import OneClassSVM

clf = OneClassSVM(gamma='auto').fit(X_train)
y_pred = clf.predict(X_test)
#array([-1,  1,  1,  1, -1])
Score = clf.score_samples(X_test)
print(Score)
#array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...])


[910.05329477 938.89774941 937.25505834 928.95123801 903.8268
 952.89101345 953.25464584 958.6540075  952.65209946 927.1861651
 946.64632033 938.96168681 947.28585376 946.21197739 933.8720698
 927.1029745  916.04886931 952.89101345 942.48652063 943.08190113
 927.4082738  954.57561496 953.16216021 949.1648353  924.51337792
 941.2120428  905.26614931 958.77154487 951.34002367 949.71871405
 958.77154487 956.88684793 902.75236243 943.3614357  948.03954202
 949.15098196 938.30800943 938.59918596 953.96110425 936.87687533
 921.39438674 932.07586126 949.44107647 941.2120428  928.49734178
 908.3583714  920.99774221 957.21654792 953.66746061 924.09439173
 940.50465993 964.18637426 946.50286396 948.89119173 878.00222299
 910.79974995 939.50086803 953.73616935 897.4429088  928.03713738
 943.3614357  943.08190113 924.39618302 917.25178575 950.87865559
 946.50286396 926.41684472 942.48652063 932.17106741 955.03658072
 953.63010876 915.13045968 943.0370595  941.48991862 938.90076306
 951.23544774 96