In [0]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/cvd

In [0]:
import numpy as np
import pandas as pd
import datetime as dt
import sklearn
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score as rs
from sklearn.metrics import precision_score as ps
from sklearn.metrics import f1_score as fs
from sklearn.metrics import log_loss

encoder = preprocessing.LabelEncoder()

In [0]:
data = pd.read_csv('data.csv')
data = data.drop('id',axis=1)
data = data.fillna(np.nan,axis=0)
data['location'] = encoder.fit_transform(data['location'].astype(str))
data['country'] = encoder.fit_transform(data['country'].astype(str))
data['gender'] = encoder.fit_transform(data['gender'].astype(str))
data[['symptom1']] = encoder.fit_transform(data['symptom1'].astype(str))
data[['symptom2']] = encoder.fit_transform(data['symptom2'].astype(str))
data[['symptom3']] = encoder.fit_transform(data['symptom3'].astype(str))
data[['symptom4']] = encoder.fit_transform(data['symptom4'].astype(str))
data[['symptom5']] = encoder.fit_transform(data['symptom5'].astype(str))
data[['symptom6']] = encoder.fit_transform(data['symptom6'].astype(str))

In [0]:
data['sym_on'] = pd.to_datetime(data['sym_on'])
data['hosp_vis'] = pd.to_datetime(data['hosp_vis'])
data['sym_on']= data['sym_on'].map(dt.datetime.toordinal)
data['hosp_vis']= data['hosp_vis'].map(dt.datetime.toordinal)
data['diff_sym_hos']= data['hosp_vis'] - data['sym_on']

In [0]:
data['diff_symp_hos'] = data['hosp_vis']-data['sym_on']

In [0]:
data = data.drop(['sym_on','hosp_vis'],axis=1)

In [0]:
print(data.dtypes)

###Visualization

In [0]:
import matplotlib.pyplot as plt
def counter2(colname1,colname2):
  colname1 = pd.Series(colname1)
  colname2 = pd.Series(colname2)
  count1 = 0
  for i in range(min([colname1.size,colname2.size])):
    if(colname1[i]==1 and colname2[i]==1):
      count1 = count1+1
  return count1

def counter1(colname):
  colname1 = pd.Series(colname)
  count = 0
  for i in range(colname1.size):
    if(colname1[i]==1):
      count = count+1
  return count

In [0]:
fwuh = counter1(data['from_wuhan'])
vwuh = counter1(data['vis_wuhan'])


In [0]:
print(counter1(data['death']))
print(counter2(data['from_wuhan'],data['death']))
print(counter2(data['vis_wuhan'],data['death']))

In [0]:
import matplotlib.pyplot as plt

plt.bar(['From Wuhan','Visiting Wuhan'],[counter2(data['death'],data['from_wuhan']),counter2(data['death'],data['vis_wuhan'])],color='green')
plt.title('Patient Deaths')
plt.xlabel('Patients\' Native Place')
plt.ylabel('Number of Deaths')
plt.plot([counter2(data['death'],data['from_wuhan']),counter2(data['death'],data['vis_wuhan'])],color='red')
plt.show()

In [0]:
plt.bar(['From Wuhan','Visiting Wuhan'],[counter2(data['recov'],data['from_wuhan']),counter2(data['recov'],data['vis_wuhan'])],color='purple')
plt.title('Recovered Patients')
plt.xlabel('Patients\' Native Place')
plt.ylabel('Number of Patients Recovered')
plt.plot([counter2(data['recov'],data['from_wuhan']),counter2(data['recov'],data['vis_wuhan'])],color='blue')
plt.show()

###Training

In [0]:
tdata = pd.read_csv('train.csv')
print(tdata.head())

In [0]:
tdata = pd.read_csv('train.csv')
tdata = tdata.drop('id',axis=1)
tdata = tdata.fillna(np.nan,axis=0)
tdata['age'] = tdata['age'].fillna(value=tdata['age'].mean())
tdata['location'] = encoder.fit_transform(tdata['location'].astype(str))
tdata['country'] = encoder.fit_transform(tdata['country'].astype(str))
tdata['gender'] = encoder.fit_transform(tdata['gender'].astype(str))
tdata[['symptom1']] = encoder.fit_transform(tdata['symptom1'].astype(str))
tdata[['symptom2']] = encoder.fit_transform(tdata['symptom2'].astype(str))
tdata[['symptom3']] = encoder.fit_transform(tdata['symptom3'].astype(str))
tdata[['symptom4']] = encoder.fit_transform(tdata['symptom4'].astype(str))
tdata[['symptom5']] = encoder.fit_transform(tdata['symptom5'].astype(str))
tdata[['symptom6']] = encoder.fit_transform(tdata['symptom6'].astype(str))

In [0]:
tdata['sym_on'] = pd.to_datetime(tdata['sym_on'])
tdata['hosp_vis'] = pd.to_datetime(tdata['hosp_vis'])
tdata['sym_on']= tdata['sym_on'].map(dt.datetime.toordinal)
tdata['hosp_vis']= tdata['hosp_vis'].map(dt.datetime.toordinal)
tdata['diff_sym_hos']= tdata['hosp_vis'] - tdata['sym_on']

In [0]:
tdata = tdata.drop(['sym_on','hosp_vis'],axis=1)

In [0]:
print(tdata)

In [0]:
print(tdata.isna().sum())

In [0]:
from sklearn.metrics import recall_score as rs
from sklearn.metrics import precision_score as ps
from sklearn.metrics import f1_score as fs
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.metrics import confusion_matrix as cm

In [0]:
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
classifier = AdaBoostClassifier(rf,50,0.01,'SAMME.R',10)

In [0]:
X = tdata[['location','country','gender','age','vis_wuhan','from_wuhan','symptom1','symptom2','symptom3','symptom4','symptom5','symptom6','diff_sym_hos']]
Y = tdata['death']

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
classifier.fit(X_train,np.array(Y_train).reshape(Y_train.shape[0],1))

In [0]:
pred = np.array(classifier.predict(X_test))

recall = rs(Y_test,pred)
precision = ps(Y_test,pred)
f1 = fs(Y_test,pred)
ma = classifier.score(X_test,Y_test)

In [0]:
print('*** Evaluation metrics for test dataset ***\n')
print('Recall Score: ',recall)
print('Precision Score: ',precision)
print('F1 Score: ',f1)
print('Accuracy: ',ma)
a = pd.DataFrame(Y_test)
a['pred']= classifier.predict(X_test)
print('\n\tTable 3\n')
print(a.head())

*** Evaluation metrics for test dataset ***

Recall Score:  0.75
Precision Score:  1.0
F1 Score:  0.8571428571428571
Accuracy:  0.9333333333333333

	Table 3

     death  pred
130      0     0
203      0     0
170      1     0
66       0     0
181      0     0


In [0]:
print(pd.DataFrame({'Val':Y_test,'Pred':classifier.predict(X_test)}))

In [0]:
X1 = tdata[['location','country','gender','age','vis_wuhan','from_wuhan','symptom1','symptom2','symptom3','symptom4','symptom5','symptom6','diff_sym_hos']]
Y1 = tdata['death']
classifier1 = RandomForestClassifier()

n_estimators = [100,200,300,400,500]
max_depth = [1,2,5,6]
min_samples_split = [1,2,6,7]
min_samples_leaf = [2,3,4,5]

params_grid = {'n_estimators':n_estimators,'max_depth':max_depth,'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}

gridder = GridSearchCV(estimator=classifier1,param_grid=params_grid,n_jobs=-1,cv=5,verbose=5 )
gridder.fit(X1,np.array(Y1).reshape(Y1.shape[0],1))

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 468 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 900 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1170 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1476 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed:  8.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [0]:
print(gridder.best_estimator_)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [0]:
!ls

data.csv  test.xlsx  train.csv


In [0]:
udata = pd.read_excel('test.xlsx')
udata = udata.drop('id',axis=1)

In [0]:
print(udata.columns)

Index(['location', 'country', 'gender', 'age', 'sym_on', 'hosp_vis',
       'vis_wuhan', 'from_wuhan', 'symptom1', 'symptom2', 'symptom3',
       'symptom4', 'symptom5', 'symptom6'],
      dtype='object')


In [0]:
udata = udata.fillna(np.nan,axis=0)
udata['age'] = udata['age'].fillna(value=udata['age'].mean())
udata['from_wuhan'] = udata['from_wuhan'].fillna(value=0)
udata['from_wuhan'] = udata['from_wuhan'].astype(int)
udata['location'] = encoder.fit_transform(udata['location'].astype(str))
udata['country'] = encoder.fit_transform(udata['country'].astype(str))
udata['gender'] = encoder.fit_transform(udata['gender'].astype(str))
udata[['symptom1']] = encoder.fit_transform(udata['symptom1'].astype(str))
udata[['symptom2']] = encoder.fit_transform(udata['symptom2'].astype(str))
udata[['symptom3']] = encoder.fit_transform(udata['symptom3'].astype(str))
udata[['symptom4']] = encoder.fit_transform(udata['symptom4'].astype(str))
udata[['symptom5']] = encoder.fit_transform(udata['symptom5'].astype(str))
udata[['symptom6']] = encoder.fit_transform(udata['symptom6'].astype(str))

In [0]:
print(udata['from_wuhan'].mode())

0    0
dtype: int64


In [0]:
udata['sym_on'] = pd.to_datetime(udata['sym_on'])
udata['hosp_vis'] = pd.to_datetime(udata['hosp_vis'])
udata['sym_on']= udata['sym_on'].map(dt.datetime.toordinal)
udata['hosp_vis']= udata['hosp_vis'].map(dt.datetime.toordinal)
udata['diff_sym_hos']= udata['hosp_vis'] - udata['sym_on']

In [0]:
print(udata['from_wuhan'].unique())

[0 1]


In [0]:
print(udata.dtypes)

location          int64
country           int64
gender            int64
age             float64
sym_on            int64
hosp_vis          int64
vis_wuhan         int64
from_wuhan        int64
symptom1          int64
symptom2          int64
symptom3          int64
symptom4          int64
symptom5          int64
symptom6          int64
diff_sym_hos      int64
dtype: object


In [0]:
udata = udata[['location','country','gender','age','vis_wuhan','from_wuhan','symptom1','symptom2','symptom3','symptom4','symptom5','symptom6','diff_sym_hos']]
udata['result'] = classifier.predict(udata)

In [0]:
print(udata['result'])

0      1
1      0
2      0
3      0
4      0
      ..
858    1
859    0
860    0
861    0
862    0
Name: result, Length: 863, dtype: int64


In [0]:
!cd '/gdrive/My Drive/cvd'

In [0]:
udata.to_csv('/gdrive/My Drive/cvd/final.csv')