In [56]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import f1_score
from collections import Counter
import pandas as pd
import numpy as np
import csv


# Some Preprocessing

In [57]:
# Read the training and test tables from the local data directory.
train = pd.read_csv('data/train_best.csv')
test = pd.read_csv('data/test_best.csv')

In [58]:
# Summarise the raw `date_confirmation` column at the 20/40/60/80th percentiles
# (describe() also reports the median) before the column is bucketed.
train.loc[:, 'date_confirmation'].describe(percentiles=[0.2, 0.4, 0.6, 0.8])

count    17212.000000
mean       126.737160
std         21.321449
min          5.000000
20%        106.000000
40%        128.000000
50%        132.000000
60%        138.000000
80%        145.000000
max        154.000000
Name: date_confirmation, dtype: float64

In [59]:
# Replace the raw value with a coarse percentile bucket: the percentile rank
# is scaled by 10 and rounded to the nearest integer.
# NOTE(review): train and test are ranked independently, so the same raw value
# can land in different buckets in the two frames, and re-running this cell
# re-ranks the already-bucketed column. Confirm both are intended.
train['date_confirmation'] = (
    train['date_confirmation'].rank(pct=True).mul(10).round().astype('int64')
)

test['date_confirmation'] = (
    test['date_confirmation'].rank(pct=True).mul(10).round().astype('int64')
)

In [60]:
# from imblearn.under_sampling import RandomUnderSampler 
# from imblearn.over_sampling import RandomOverSampler

# labels = train.loc[:,train.columns == 'outcome_group']
# features = train.loc[:,train.columns != 'outcome_group']
# counter = Counter(labels['outcome_group'])
# print("Total Samples Before Balancing: ",len(train), "Distribution of Classes:", counter)
# over = RandomOverSampler({2.0:5680 ,0.0:5680})
# under = RandomUnderSampler({1.0:5680})
# features,labels = under.fit_resample(features,labels)
# features,labels = over.fit_resample(features,labels)
# counter = Counter(labels['outcome_group'])
# features['outcome_group'] = labels['outcome_group']
# train = features
# print("Total Samples After Balancing: ",len(train), "Distribution of Classes:", counter)

# Model Training

In [61]:
# Drop the columns that are not used as model inputs.
# NOTE: this reassigns `train`, so running the cell a second time raises a
# KeyError because the columns are already gone.
train = train.drop(
    columns=[
        'latitude', 'longitude', 'Confirmed', 'Deaths',
        'Recovered', 'Active', 'Case_Fatality_Ratio', 'Incident_Rate',
    ]
)

train

Unnamed: 0,age,sex,country,date_confirmation,chronic_disease_binary,outcome_group
0,18,0.0,21.0,6,0.0,1.0
1,27,0.0,27.0,2,0.0,2.0
2,46,1.0,21.0,3,0.0,1.0
3,21,0.0,21.0,8,0.0,1.0
4,27,1.0,21.0,8,0.0,1.0
...,...,...,...,...,...,...
17207,29,0.0,21.0,3,0.0,1.0
17208,47,0.0,27.0,1,0.0,2.0
17209,30,1.0,21.0,8,0.0,1.0
17210,59,1.0,21.0,10,0.0,1.0


In [62]:
# Separate the predictors from the target column. The target is kept as a
# one-column DataFrame; the cross-validation loop flattens it to a 1-D array.
is_target = train.columns == 'outcome_group'
X = train.loc[:, ~is_target]
y = train.loc[:, is_target]

X

Unnamed: 0,age,sex,country,date_confirmation,chronic_disease_binary
0,18,0.0,21.0,6,0.0
1,27,0.0,27.0,2,0.0
2,46,1.0,21.0,3,0.0
3,21,0.0,21.0,8,0.0
4,27,1.0,21.0,8,0.0
...,...,...,...,...,...
17207,29,0.0,21.0,3,0.0
17208,47,0.0,27.0,1,0.0
17209,30,1.0,21.0,8,0.0
17210,59,1.0,21.0,10,0.0


Model

In [63]:
from mixed_naive_bayes import MixedNB

# Positional indices (within X) of the columns MixedNB should treat as
# categorical. Presumably sex, date_confirmation and chronic_disease_binary
# if X holds the five feature columns shown above; verify against X.columns.
# A plain GaussianNB() was the earlier alternative tried here.
categorical_columns = [1, 3, 4]
model = MixedNB(categorical_features=categorical_columns)

KFold

In [64]:
# Five-fold splitter using KFold's defaults (no shuffling, no random state).
k = 5
kf = KFold(n_splits=k)

print(kf)

KFold(n_splits=5, random_state=None, shuffle=False)


In [65]:

# Per-fold accuracy and macro-F1 collected across the k folds.
acc_score, F1 = [], []

for train_index, test_index in kf.split(train):
    # Slice features and labels with the same positional indices.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The model is fitted on a flat integer label vector.
    y_train = y_train.to_numpy().ravel().astype('int64')
    model.fit(X_train, y_train)

    acc = model.score(X_test, y_test.to_numpy().ravel())
    acc_score.append(acc)

    y_preds = model.predict(X_test)
    F1.append(f1_score(y_test, y_preds, average='macro'))

# Mean accuracy over the folds.
avg_acc_score = sum(acc_score) / k

[ 2 11  2]
[ 2 11  2]
[ 2 11  2]
[ 2 11  2]
[ 2 11  2]


Repeated Stratified KFold

In [66]:
# from sklearn.model_selection import RepeatedStratifiedKFold

# rskf = RepeatedStratifiedKFold(n_splits=k, n_repeats=2)

In [67]:
# acc_score = []
# F1 = []

# for train_index, test_index in rskf.split(X, y):
#     #print("TRAIN:", train_index)
#     #print("TEST:", test_index)

#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     model.fit(X_train,y_train)

#     acc = model.score(X_test , y_test)
#     acc_score.append(acc)

#     y_preds = model.predict(X_test)
#     F1.append(f1_score(y_test, y_preds, average='macro'))


# avg_acc_score = sum(acc_score)/k

In [68]:
# Report the macro-F1 averaged over the cross-validation folds. The per-fold
# values stay available in `F1`, and the accuracies in `acc_score` and
# `avg_acc_score`, for closer inspection.
mean_f1 = np.mean(F1)
print('Mean F1 : ', mean_f1)

Mean F1 :  0.7159517327482173


In [69]:
# Apply the same column removal to the test frame as was applied to train,
# so both frames expose the same feature columns.
# NOTE: this reassigns `test`; a second run raises a KeyError.
test = test.drop(
    columns=[
        'latitude', 'longitude', 'Confirmed', 'Deaths',
        'Recovered', 'Active', 'Case_Fatality_Ratio', 'Incident_Rate',
    ]
)

test


Unnamed: 0,age,sex,country,date_confirmation,chronic_disease_binary
0,59,0.0,27.0,1,0.0
1,79,1.0,21.0,8,0.0
2,44,0.0,21.0,6,0.0
3,36,1.0,21.0,3,0.0
4,52,1.0,21.0,2,0.0
...,...,...,...,...,...
4299,66,1.0,21.0,1,0.0
4300,66,1.0,27.0,1,0.0
4301,53,0.0,21.0,5,0.0
4302,25,1.0,21.0,8,0.0


Generating Results

In [70]:
# The cross-validation loop leaves `model` fitted on the last fold's training
# split only. Refit on every labelled row so the submission is produced by a
# model that has seen all of the available training data.
model.fit(X, y.values.flatten().astype('int64'))

y_preds = model.predict(test)

y_preds

def create_submission_file(y_preds, file_name):
    """Write predictions to a two-column CSV submission file.

    The file gets a header row (``Id``, ``Prediction``) followed by one row
    per prediction, where ``Id`` is the zero-based position of the prediction
    and ``Prediction`` is the value converted with ``int()``. Every field is
    quoted.

    Args:
        y_preds: Iterable of numeric predictions, in submission order.
        file_name: Path of the CSV file to create (overwritten if present).
    """
    # newline='' is what the csv module requires for files it writes to;
    # without it the writer's line terminator is translated a second time on
    # Windows, which produces a blank row after every record.
    with open(file_name, 'w', newline='') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        wr.writerows([str(i), str(int(pred))] for i, pred in enumerate(y_preds))

        
# Write the test-set predictions to the submission CSV in the working directory.
create_submission_file(y_preds, file_name='submission_[naive_bayes].csv')