## Feature selection

### Importing libraries

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

### Reading dataset

In [3]:
# Location of the project dataset. The default is the original author's local
# path; set the MERS_DATASET_PATH environment variable to point at the CSV on
# any other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "MERS_DATASET_PATH",
    r"C:\Users\bhavv\Downloads\Final_Project_dataset_V2.csv",
)
data = pd.read_csv(DATA_PATH)

### Dropping nulls from dataset and checking the data

In [4]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

In [5]:
# Display the frame that remains after the null rows were dropped
# (pandas truncates the middle rows of a long frame in the rendered output).
data

Unnamed: 0,Case no.,Date of notification to WHO (yyyy/mm/dd),Reporting country,City of residence,Age,Sex,Health care worker,Comorbidities,Exposure to camels,Camel milk consumption,Exposure to MERS-CoV cases,Date of symptoms onset (yyyy/mm/dd),Date of first hospitalization (yyyy/mm/dd),Date of laboratory confirmation (yyyy/mm/dd),Day_diff,Status
0,1.0,2017-08-15,Saudi Arabia,Madinah,67.0,M,No,Yes,Yes,No,No,2017-08-12,2017-08-14,2017-08-14,2,Alive
1,2.0,2017-08-16,Saudi Arabia,Jeddah,69.0,M,No,Yes,No,No,No,2017-08-13,2017-08-15,2017-08-15,2,Alive
2,3.0,2017-08-18,Saudi Arabia,Jeddah,51.0,M,No,Yes,No,No,No,2017-08-11,2017-08-17,2017-08-17,6,Alive
3,4.0,2017-08-21,Saudi Arabia,Riyadh,48.0,M,No,Yes,Yes,Yes,No,2017-08-18,2017-08-20,2017-08-20,2,Alive
4,5.0,2017-08-24,Saudi Arabia,Dawmet Aljandal,40.0,F,No,Yes,No,No,No,2017-08-17,2017-08-21,2017-08-22,5,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2433,4.0,2017-01-04,Saudi Arabia,Buridah,88.0,F,No,Yes,No,No,No,2016-12-19,2017-01-01,2017-01-03,15,Deceased
2434,5.0,2017-01-04,Saudi Arabia,Buridah,87.0,F,No,Yes,No,No,No,2016-12-29,2017-01-02,2017-01-04,6,Alive
2435,6.0,2017-01-04,Saudi Arabia,Riyadh,56.0,M,No,Yes,No,No,No,2016-12-28,2017-01-02,2017-01-04,7,Alive
2437,8.0,2017-01-02,Saudi Arabia,Madinah,70.0,M,No,Yes,Yes,Yes,No,2016-12-25,2017-01-01,2017-01-02,8,Alive


### Performing label encoding for Random Forest Feature Selection

In [6]:

# Replace every text column with integer codes so the models below can use it.
# The fitted encoder for each column is kept in `label_encoders`, so codes can
# be mapped back to their original labels with `inverse_transform`.
label_encoders = {}
for column in data.columns:
    # Explicit dtype test for text columns (object or dedicated string dtype).
    # The previous check compared against `type(object)`, i.e. the metaclass
    # `type`, and only matched object columns through NumPy's dtype coercion.
    if pd.api.types.is_string_dtype(data[column].dtype):
        le = LabelEncoder()
        # Cast to str first so mixed-type object columns encode consistently.
        data[column] = le.fit_transform(data[column].astype(str))
        label_encoders[column] = le


### Splitting data into X and y

In [7]:
# Name of the outcome column; every other column is treated as a predictor.
TARGET_COLUMN = "Status"

# Select predictors by excluding the target by name rather than by a fixed
# positional slice, so the split stays correct if columns are added or
# reordered and the target can never end up inside X.
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]


### Modelling Random Forest for Feature Selection

In [8]:
# Minimum feature importance a predictor needs in order to be selected.
IMPORTANCE_THRESHOLD = 0.05

# NOTE(review): y is the label-encoded Status column, yet a regressor is
# fitted; a classifier may be the more natural estimator here - confirm.
randomforest = RandomForestRegressor(random_state=0)

model = randomforest.fit(X,y)

# prefit=True makes the selector read the importances of the forest fitted
# above. Without it, SelectFromModel.fit clones the estimator and trains a
# second, identical forest (same random_state, same data).
sfm = SelectFromModel(model, threshold=IMPORTANCE_THRESHOLD, prefit=True)

# Print the name of every predictor whose importance reaches the threshold.
for feature_list_index in sfm.get_support(indices=True):
    print(X.columns[feature_list_index])





Case no.
Date of notification to WHO (yyyy/mm/dd)
City of residence
Age
Date of symptoms onset (yyyy/mm/dd)
Date of first hospitalization (yyyy/mm/dd)
Date of laboratory confirmation (yyyy/mm/dd)
Day_diff


### Storing feature importances in a dataframe result_RF and displaying them sorted

In [9]:
# One row per predictor, paired with the importance the fitted forest gave it.
# NOTE(review): `model` is a RandomForestRegressor, so these importances are
# presumably based on variance reduction rather than Gini impurity; the column
# label is left unchanged because the following cell sorts on it.
result_RF = pd.DataFrame({
    'predictor': X.columns,
    'Gini coefficient': model.feature_importances_,
})

In [10]:
# Show the predictors ranked from most to least important (result_RF itself
# is not modified; the sorted frame is only displayed).
(
    result_RF
    .sort_values(by='Gini coefficient', ascending=False)
)

Unnamed: 0,predictor,Gini coefficient
4,Age,0.198117
0,Case no.,0.140378
3,City of residence,0.133383
1,Date of notification to WHO (yyyy/mm/dd),0.090779
12,Date of first hospitalization (yyyy/mm/dd),0.088125
11,Date of symptoms onset (yyyy/mm/dd),0.085896
14,Day_diff,0.083865
13,Date of laboratory confirmation (yyyy/mm/dd),0.074523
5,Sex,0.033264
7,Comorbidities,0.027898


### Performing Standardization for Lasso

In [11]:
# Learn each predictor's mean and standard deviation, then rescale the
# predictors with them so the Lasso penalty treats all columns alike.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)


### Modeling Lasso for Feature Selection

In [12]:
# Fit an L1-regularised linear model on the standardized predictors; the
# fitted estimator is the cell's displayed output.
# NOTE(review): positive=True constrains every coefficient to be >= 0, so a
# predictor whose association with y is negative ends up with a 0.0
# coefficient and looks irrelevant in the ranking - confirm this is intended.
model_l = Lasso(alpha=0.01,positive=True)
model_l.fit(X_std,y)


Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=True, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

### Storing Lasso coefficients in a dataframe result_Lasso and displaying them sorted

In [13]:
# One row per predictor, paired with the coefficient the fitted Lasso gave it.
result_Lasso = pd.DataFrame({
    'predictor': X.columns,
    'coefficient': model_l.coef_,
})

In [14]:
# Show the predictors ranked from largest to smallest coefficient
# (result_Lasso itself is not modified; the sorted frame is only displayed).
(
    result_Lasso
    .sort_values(by='coefficient', ascending=False)
)

Unnamed: 0,predictor,coefficient
4,Age,0.10754
2,Reporting country,0.082143
7,Comorbidities,0.080815
5,Sex,0.061456
10,Exposure to MERS-CoV cases,0.055873
3,City of residence,0.037822
0,Case no.,0.0
1,Date of notification to WHO (yyyy/mm/dd),0.0
6,Health care worker,0.0
8,Exposure to camels,0.0
