# CMPT 459 - Data Mining - Final Milestone

Group Collaborators: Hazem Hisham, Harry Preet Singh, Jiongyu Zhu 


## Random Forest Classifier

### Reading Data

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

### Splitting Labels and Features

In [7]:
labels = train.loc[:,train.columns == 'outcome_group']
features = train.loc[:,train.columns != 'outcome_group']

### Initialsing Parameter Grid for HyperTuning

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(features)
rf_base = RandomForestClassifier(random_state = 42, verbose=True)
# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(200, 1000, 5)]
rf_n_estimators.append(1500)
rf_n_estimators.append(2000)
rf_n_estimators.append(2500)
# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]
# Add the default as a possible value
rf_max_depth.append(None)

# Number of features to consider at every split
rf_max_features = ['auto', 'sqrt', 'log2']

# Criterion to split on
rf_criterion = ['gini', 'entropy']

# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

rf_class_weight = ["balanced", "balanced_subsample"]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap,
               'class_weight' : rf_class_weight
               }
rf_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1500, 2000, 2500],
 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'criterion': ['gini', 'entropy'],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_impurity_decrease': [0.0, 0.05, 0.1],
 'bootstrap': [True, False],
 'class_weight': ['balanced', 'balanced_subsample']}

### HyperTuning

In [9]:
from sklearn.metrics import f1_score, make_scorer, accuracy_score
scoring = {"f1": make_scorer(f1_score , average='macro'),"f1dec":make_scorer(f1_score,average='macro',labels = [0.0]),"acc":make_scorer(accuracy_score)}
rf_random = RandomizedSearchCV(estimator= rf_base, param_distributions= rf_grid, scoring=scoring, refit="f1", n_iter =65,cv= 5,verbose = 3,random_state= 42, n_jobs=-1)

In [None]:
rf_random.fit(X_std, labels.values.ravel())

In [49]:
print(rf_random.best_score_)
print(rf_random.best_params_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [None]:
rfpd = pd.DataFrame(rf_random.cv_results_)
rf =  rfpd.sort_values(by='rank_test_f1')
print("Mean Macro F1 Score:",rf['mean_test_f1'][62],"| Mean Accuracy:s", rf['mean_test_acc'][62],"| Mean Macro F1 Score on deceased class:",rf['mean_test_f1dec'][62])

### Training the Model with Best Parameters

In [10]:
rf_best = RandomForestClassifier(n_estimators=400, min_samples_split= 3,min_impurity_decrease=0.0, max_features='auto',max_depth=10, criterion= 'entropy', bootstrap=True,class_weight = 'balanced' ,random_state=42)
X_std = StandardScaler().fit_transform(features)
rf_best.fit(X_std, labels.values.ravel())

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_split=3, n_estimators=400,
                       random_state=42)

### Predicting on Test Data

In [11]:
Y_std = StandardScaler().fit_transform(test.values)
predictions = rf_best.predict(Y_std)
preds = []
for i in predictions:
    n = int(i)
    preds.append(str(n))
import csv
def create_submission_file(y_preds, file_name):
    with open(file_name, 'w') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        for i, pred in enumerate(y_preds):
            wr.writerow([str(i), str(pred)])
create_submission_file(preds, 'submission_rf_harry.csv')

## K-Nearest Neighbours

In [12]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import KFold 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
import csv
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [13]:
train_data=pd.read_csv('data/train.csv')
test_data=pd.read_csv('data/test.csv')

In [14]:
train_data[train_data['outcome_group']==0.0]

Unnamed: 0,age,sex,country,latitude,longitude,date_confirmation,chronic_disease_binary,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio,outcome_group
6,65,0.0,21.0,19.420820,76.050130,93,0.0,2812980,54649,2400727,357604,2284.297169,1.942744,0.0
40,73,1.0,27.0,14.584244,121.176289,113,0.0,747288,13297,603746,130245,681.949809,1.779368,0.0
47,25,0.0,27.0,14.470810,121.427050,133,0.0,747288,13297,603746,130245,681.949809,1.779368,0.0
63,34,0.0,27.0,14.595800,120.977200,90,0.0,747288,13297,603746,130245,681.949809,1.779368,0.0
90,62,1.0,21.0,13.083620,80.282520,141,0.0,886673,12719,858075,15879,1139.078325,1.434463,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17107,74,0.0,27.0,14.380000,121.050000,96,0.0,747288,13297,603746,130245,681.949809,1.779368,0.0
17116,33,1.0,27.0,10.333330,123.750000,121,0.0,747288,13297,603746,130245,681.949809,1.779368,0.0
17130,62,0.0,27.0,14.595800,120.977200,93,0.0,747288,13297,603746,130245,681.949809,1.779368,0.0
17143,43,1.0,21.0,13.083620,80.282520,133,0.0,886673,12719,858075,15879,1139.078325,1.434463,0.0


In [15]:
train_data[train_data['outcome_group']==1.0]

Unnamed: 0,age,sex,country,latitude,longitude,date_confirmation,chronic_disease_binary,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio,outcome_group
0,18,0.0,21.0,25.49096,85.93903,138,0.0,265527,1576,262371,1580,212.762145,0.593537,1.0
2,46,1.0,21.0,13.08362,80.28252,122,0.0,886673,12719,858075,15879,1139.078325,1.434463,1.0
3,21,0.0,21.0,13.08362,80.28252,144,0.0,886673,12719,858075,15879,1139.078325,1.434463,1.0
4,27,1.0,21.0,26.28361,87.20347,147,0.0,265527,1576,262371,1580,212.762145,0.593537,1.0
5,24,1.0,21.0,24.45712,85.13749,146,0.0,265527,1576,262371,1580,212.762145,0.593537,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17204,55,0.0,21.0,12.30906,76.65303,150,0.0,997004,12567,956170,28267,1475.672533,1.260476,1.0
17205,53,0.0,21.0,28.36145,77.50704,86,0.0,617194,8811,598535,9848,259.453056,1.427590,1.0
17207,29,0.0,21.0,13.08362,80.28252,114,0.0,886673,12719,858075,15879,1139.078325,1.434463,1.0
17209,30,1.0,21.0,24.94452,86.26404,146,0.0,265527,1576,262371,1580,212.762145,0.593537,1.0


In [16]:
train_data[train_data['outcome_group']==2.0]

Unnamed: 0,age,sex,country,latitude,longitude,date_confirmation,chronic_disease_binary,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio,outcome_group
1,27,0.0,27.0,7.070000,125.600000,105,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
12,38,0.0,27.0,14.630000,121.030000,112,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
19,53,0.0,27.0,13.790060,121.013160,98,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
24,59,1.0,27.0,14.595800,120.977200,93,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
27,25,1.0,27.0,14.630000,121.030000,106,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17190,67,1.0,27.0,14.580000,121.030000,94,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
17198,29,1.0,27.0,15.975780,120.567090,90,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
17202,35,1.0,27.0,10.333330,123.750000,113,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0
17206,53,1.0,27.0,10.287693,123.976852,113,0.0,747288,13297,603746,130245,681.949809,1.779368,2.0


### Under Sampling the Data

In [17]:
sampler=RandomUnderSampler(random_state=42,sampling_strategy={1:5000})
X=train_data.iloc[:,:-1]
y=train_data.iloc[:,-1]
X_resampled,y_resampled=sampler.fit_resample(X,y)

X_resampled_train, X_resampled_validation, y_resampled_train, y_resampled_validation = train_test_split(X_resampled, y_resampled,test_size=0.2, shuffle=True)
print("Before sampling: " ,train_data['outcome_group'].value_counts())
print("After sampling: ", y_resampled.value_counts())

Before sampling:  1.0    13241
2.0     2974
0.0      997
Name: outcome_group, dtype: int64
After sampling:  1.0    5000
2.0    2974
0.0     997
Name: outcome_group, dtype: int64


### Tuning N neighbours

In [18]:
#### code skeleton to view individual fold score adapted from https://www.askpython.com/python/examples/k-fold-cross-validation
knn=KNeighborsClassifier(n_neighbors=8,weights='uniform',p=2)
kf = KFold(n_splits=10)
kf_splits=kf.split(X_resampled_train)
trans = MinMaxScaler()
pipeline = Pipeline(steps=[('t', trans), ('m', knn)])
model=pipeline
f1_scores = []
f1_scores_deceased=[]
f1_scores_hospitalised=[]
f1_scores_nonhospitalised=[]
accuracy=[]
for train_index , test_index in kf_splits:
    X_train_fold = X_resampled_train.iloc[train_index,:]
    X_validation_fold = X_resampled_train.iloc[test_index,:]
    y_train_fold = y_resampled_train.iloc[train_index]
    y_validation_fold = y_resampled_train.iloc[test_index]
     
    model.fit(X_train_fold,y_train_fold)
    pred_values = model.predict(X_validation_fold)
     
    f1 = f1_score(y_validation_fold, pred_values ,average='macro')
    f1_deceased=f1_score(y_validation_fold,pred_values ,average=None)[0]
    f1_hospitalised=f1_score(y_validation_fold,pred_values,average=None)[1]
    f1_nonhospitalised=f1_score(y_validation_fold,pred_values, average=None)[2]
    acc=accuracy_score(y_validation_fold,pred_values)
    f1_scores.append(f1)
    f1_scores_deceased.append(f1_deceased)
    f1_scores_hospitalised.append(f1_hospitalised)
    f1_scores_nonhospitalised.append(f1_nonhospitalised)
    accuracy.append(acc)
     
avg_f1_score = sum(f1_scores)/10
avg_f1_deceased = sum(f1_scores_deceased)/10
avg_f1_hospitalised = sum(f1_scores_hospitalised)/10
avg_f1_nonhospitalised = sum(f1_scores_nonhospitalised)/10
avg_accuracy=sum(accuracy)/10


print("Scores of folds: ", f1_scores)
print('avg_f1_macro:', avg_f1_score)
print('avg_f1_deceased', avg_f1_deceased)
print('avg_f1_hospitalised', avg_f1_hospitalised)
print('avg_f1_nonhospitalised', avg_f1_nonhospitalised)
print('avg_accuracy', avg_accuracy)

Scores of folds:  [0.7755720688841707, 0.7631423516285789, 0.8011096727676262, 0.792885829521872, 0.7634791914198722, 0.7494327122153209, 0.7713085424111443, 0.7866723945162502, 0.7366313065407922, 0.7894179894179895]
avg_f1_macro: 0.7729652059323617
avg_f1_deceased 0.4665504682344059
avg_f1_hospitalised 0.971237175623792
avg_f1_nonhospitalised 0.8811079739388873
avg_accuracy 0.8953465965820133


In [19]:
model.fit(X_resampled_train, y_resampled_train)

Pipeline(steps=[('t', MinMaxScaler()),
                ('m', KNeighborsClassifier(n_neighbors=8))])

In [20]:
y_predict=model.predict(X_resampled_validation)

In [21]:
f1_score(y_resampled_validation,y_predict,average='macro')

0.7714983779793257

In [23]:
model.fit(X_resampled,y_resampled)
preds_final=model.predict(test_data)

def create_submission_file(y_preds, file_name): 
    with open(file_name, 'w') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL) 
        wr.writerow(["Id", "Prediction"])
        for i, pred in enumerate(y_preds):
            wr.writerow([str(i), str(pred)]) 

df=pd.DataFrame(preds_final)
df[0]=df[0].astype('int').astype('string')

create_submission_file(df[0], 'submission_knn_hazem.csv')

## Naive Bayes

In [39]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import f1_score
from collections import Counter
import pandas as pd
import numpy as np
import csv

### Data Preprocessing

In [40]:
train= pd.read_csv('data/train.csv')
test= pd.read_csv('data/test.csv')

In [41]:
train['date_confirmation'].describe(percentiles=[0.2, 0.4, 0.6, 0.8])

count    17212.000000
mean       126.737160
std         21.321449
min          5.000000
20%        106.000000
40%        128.000000
50%        132.000000
60%        138.000000
80%        145.000000
max        154.000000
Name: date_confirmation, dtype: float64

In [43]:
train['date_confirmation'] = round(train['date_confirmation'].rank(pct=True) * 10).astype('int64')
test['date_confirmation'] = round(test['date_confirmation'].rank(pct=True) * 10).astype('int64')

### Training of Model

In [44]:
train = train.drop(['latitude', 'longitude', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Case_Fatality_Ratio', 'Incident_Rate'], axis=1)
train

Unnamed: 0,age,sex,country,date_confirmation,chronic_disease_binary,outcome_group
0,18,0.0,21.0,6,0.0,1.0
1,27,0.0,27.0,2,0.0,2.0
2,46,1.0,21.0,3,0.0,1.0
3,21,0.0,21.0,8,0.0,1.0
4,27,1.0,21.0,8,0.0,1.0
...,...,...,...,...,...,...
17207,29,0.0,21.0,3,0.0,1.0
17208,47,0.0,27.0,1,0.0,2.0
17209,30,1.0,21.0,8,0.0,1.0
17210,59,1.0,21.0,10,0.0,1.0


In [45]:
X = train.loc[:, train.columns!='outcome_group']
y = pd.DataFrame(train, columns=['outcome_group'])
X

Unnamed: 0,age,sex,country,date_confirmation,chronic_disease_binary
0,18,0.0,21.0,6,0.0
1,27,0.0,27.0,2,0.0
2,46,1.0,21.0,3,0.0
3,21,0.0,21.0,8,0.0
4,27,1.0,21.0,8,0.0
...,...,...,...,...,...
17207,29,0.0,21.0,3,0.0
17208,47,0.0,27.0,1,0.0
17209,30,1.0,21.0,8,0.0
17210,59,1.0,21.0,10,0.0


### Model

In [46]:
from mixed_naive_bayes import MixedNB
model = MixedNB(categorical_features=[1,3,4])

### KFold

In [48]:
k = 3
kf = KFold(n_splits = k, random_state=None)
kf.get_n_splits()
print(kf)

KFold(n_splits=3, random_state=None, shuffle=False)


In [32]:
acc_score = []
F1 = []
for train_index, test_index in kf.split(train):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    y_train = y_train.values.flatten().astype('int64')

    model.fit(X_train,y_train)

    acc = model.score(X_test , y_test.values.flatten())
    acc_score.append(acc)

    y_preds = model.predict(X_test)
    F1.append(f1_score(y_test, y_preds, average='macro'))


avg_acc_score = sum(acc_score)/k

[ 2 11  2]
[ 2 11  2]
[ 2 11  2]


### Repeated Stratified KFold

In [33]:
#print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))
#print('F1 of each fold - {}'.format(F1))
print('Mean F1 : ', np.mean(F1))

Avg accuracy : 0.9303972414312107
Mean F1 :  0.7153335712261656


In [34]:
test = test.drop(['latitude', 'longitude', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Case_Fatality_Ratio', 'Incident_Rate'], axis=1)
test


Unnamed: 0,age,sex,country,date_confirmation,chronic_disease_binary
0,59,0.0,27.0,1,0.0
1,79,1.0,21.0,8,0.0
2,44,0.0,21.0,6,0.0
3,36,1.0,21.0,3,0.0
4,52,1.0,21.0,2,0.0
...,...,...,...,...,...
4299,66,1.0,21.0,1,0.0
4300,66,1.0,27.0,1,0.0
4301,53,0.0,21.0,5,0.0
4302,25,1.0,21.0,8,0.0


### Deceased F1

In [35]:
train = train[train['outcome_group'] == 0.0]

In [36]:
for train_index, test_index in kf.split(train):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train = y_train.values.flatten().astype('int64')
    model.fit(X_train,y_train)
    acc = model.score(X_test , y_test.values.flatten())
    acc_score.append(acc)
    y_preds = model.predict(X_test)
    F1.append(f1_score(y_test, y_preds, average='macro'))


[ 2 11  2]
[ 2 11  2]
[ 2 11  2]


In [37]:
print('Mean F1 for deceased class : ', np.mean(F1))

Mean F1 for deceased class :  0.7050822865127074


### Generating Result

In [38]:
y_preds = model.predict(test)
y_preds
def create_submission_file(y_preds, file_name):
    with open(file_name, 'w') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        for i, pred in enumerate(y_preds):
            wr.writerow([str(i), str(int(pred))])       
create_submission_file(y_preds, 'submission_nb_joey.csv')