In [452]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score, accuracy_score
from imblearn.under_sampling import NearMiss
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import OneClassSVM
from sklearn.tree import export_graphviz
from subprocess import call
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
import time


from collections import Counter

In [453]:
def config(filename='database.ini', section='postgresql'):
    # create a parser
    parser = ConfigParser()
    # read config file
    parser.read(filename)

    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return db

In [454]:
cfg = config()

In [455]:
try:
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("connected to database")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

connected to database


In [456]:
try:
    cursor.execute("SELECT age_group, gender, parks_percentage, status, phu_name, holiday, is_fatal\
                    from data_mart.fact_table as F inner join data_mart.patient_dimension\
                    as P on F.patient_key = P.patient_key inner join data_mart.phu_location_dimension as PHU\
                    on F.phu_location_key = PHU.phu_location_key inner join data_mart.mobility_dimension as M\
                    on F.mobility_key = M.mobility_key inner join data_mart.special_measures_dimension as SM\
                    on F.special_measures_key = SM.special_measures_key inner join data_mart.reported_date_dimension as RD\
                    on F.reported_date_key = RD.reported_date_key")
    
    result = cursor.fetchall()
    
except (Exception, psycopg2.DatabaseError) as e:
    print(e)

In [457]:
cursor.close()
conn.close()

In [458]:
result_df = pd.DataFrame(result, columns = ["Age", "gender", "parks_percentage", "status", "PHU", "holiday", "is_fatal"])

In [459]:
result_df.head()

Unnamed: 0,Age,gender,parks_percentage,status,PHU,holiday,is_fatal
0,40s,MALE,71.0,Private gathering restrictions,Peel Public Health,False,False
1,50s,MALE,71.0,Private gathering restrictions,Peel Public Health,False,False
2,<20,MALE,71.0,Private gathering restrictions,Peel Public Health,False,False
3,30s,MALE,71.0,Private gathering restrictions,Peel Public Health,False,False
4,20s,MALE,,Private gathering restrictions,York Region Public Health Services,False,False


In [460]:
#remove null values from mobility dimension
result_df['parks_percentage'].fillna(result_df['parks_percentage'].mean(), inplace = True)

In [461]:
#normalize
transform_data = result_df[["parks_percentage"]]
X_normalized = preprocessing.normalize(transform_data, norm='l2')
normalized = pd.DataFrame(X_normalized, columns = transform_data.columns)
non_numerical = result_df[["Age", "gender", "status", "PHU", "holiday", "is_fatal"]]
result_data = pd.concat([non_numerical,normalized],axis=1)

In [473]:
#one hot conversion
new_result = pd.get_dummies(result_data)
unsupervised_onehot = new_result.copy()
new_result.drop(columns=["is_fatal"], inplace = True)
new_result.head()

Unnamed: 0,holiday,parks_percentage,Age_20s,Age_30s,Age_40s,Age_50s,Age_60s,Age_70s,Age_80s,Age_90+,...,status_Private gathering restrictions,status_Protect,status_Restrict,status_Stay-at-home,PHU_Durham Region Health Department,PHU_Halton Region Health Department,PHU_Ottawa Public Health,PHU_Peel Public Health,PHU_Toronto Public Health,PHU_York Region Public Health Services
0,False,1.0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,False,1.0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,False,1.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,False,1.0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,False,1.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [463]:
result_data.head()

Unnamed: 0,Age,gender,status,PHU,holiday,is_fatal,parks_percentage
0,40s,MALE,Private gathering restrictions,Peel Public Health,False,False,1.0
1,50s,MALE,Private gathering restrictions,Peel Public Health,False,False,1.0
2,<20,MALE,Private gathering restrictions,Peel Public Health,False,False,1.0
3,30s,MALE,Private gathering restrictions,Peel Public Health,False,False,1.0
4,20s,MALE,Private gathering restrictions,York Region Public Health Services,False,False,1.0


In [464]:
X = new_result.values
y = result_data["is_fatal"].values

undersample = NearMiss(version=1, n_neighbors=3)
# fit and apply the transform
X_under, y_under = undersample.fit_resample(X, y)

In [465]:
X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.33, shuffle=True, stratify = y_under)

In [466]:
print("Undersampled training set {} ".format(Counter(y_train)))
print("Undersampled test set {} ".format(Counter(y_test)))

Undersampled training set Counter({False: 955, True: 954}) 
Undersampled test set Counter({True: 471, False: 470}) 


In [467]:
#timer start
start = time.time()
#create and fit random forest
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

#end timer
end = time.time()

In [468]:
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_pred, y_test) * 100
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100

print("Accuracy of Random Forest: {:.2f} %".format(accuracy))
print("Precision of Random Forest: {:.2f} %".format(precision))
print("Recall of Random Forest: {:.2f} %".format(recall))
print("Random forest construction time: ", (round((end - start), 4) * 1000), "milliseconds")

Accuracy of Random Forest: 78.75 %
Precision of Random Forest: 74.10 %
Recall of Random Forest: 81.73 %
Random forest construction time:  119.0 milliseconds


In [469]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[392  78]
 [122 349]]
Classification Report
              precision    recall  f1-score   support

       False       0.76      0.83      0.80       470
        True       0.82      0.74      0.78       471

    accuracy                           0.79       941
   macro avg       0.79      0.79      0.79       941
weighted avg       0.79      0.79      0.79       941



In [470]:
#visualize
estimator = rf.estimators_[50]

# Export as dot file
#export_graphviz(estimator, out_file='tree.dot', 
 #               feature_names = new_result.columns,
  #              class_names = True,
   #             rounded = True, proportion = False, 
    #            precision = 2, filled = True)


In [476]:
#part C: Anomaly detection


KeyError: 'status_Stay-at-home'