### Import relevant libraries and load datasets

In [61]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

# read the training features (X) and the training labels (y) from their CSV files
X = pd.read_csv('original_datasets/dengue_features_train.csv')
y = pd.read_csv('original_datasets/dengue_labels_train.csv')

In [62]:
# preview the raw feature table as loaded from CSV
X

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.122600,0.103725,0.198483,0.177617,12.42,297.572857,...,32.00,73.365714,12.42,14.012857,2.628571,25.442857,6.900000,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.169900,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.032250,0.172967,0.157200,0.170843,34.54,298.781429,...,26.10,82.052857,34.54,16.848571,2.300000,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.90,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.196200,0.262200,0.251200,0.247340,7.52,299.518571,...,12.20,80.460000,7.52,17.210000,3.014286,28.942857,9.371429,35.0,23.9,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,iq,2010,21,2010-05-28,0.342750,0.318900,0.256343,0.292514,55.30,299.334286,...,45.00,88.765714,55.30,18.485714,9.800000,28.633333,11.933333,35.4,22.4,27.0
1452,iq,2010,22,2010-06-04,0.160157,0.160371,0.136043,0.225657,86.47,298.330000,...,207.10,91.600000,86.47,18.070000,7.471429,27.433333,10.500000,34.7,21.7,36.6
1453,iq,2010,23,2010-06-11,0.247057,0.146057,0.250357,0.233714,58.94,296.598571,...,50.60,94.280000,58.94,17.008571,7.500000,24.400000,6.900000,32.2,19.2,7.4
1454,iq,2010,24,2010-06-18,0.333914,0.245771,0.278886,0.325486,59.67,296.345714,...,62.33,94.660000,59.67,16.815714,7.871429,25.433333,8.733333,31.2,21.0,16.0


### Pre-Processing with scaling

In [63]:
# one-hot encode the city column (kept aside until after scaling)
X_city = pd.get_dummies(X['city'])


def set_month(week_start_date):
    """Return characters 5-6 of a date string, i.e. the 'MM' part of 'YYYY-MM-DD'."""
    return week_start_date[5: 7]


# derive a 'month' column from the week start date, then one-hot encode it
# (get_dummies with columns=['month'] yields 'month_XX' column names)
X_month = X['week_start_date'].apply(set_month).to_frame(name='month')
X_month = pd.get_dummies(X_month, columns=['month'])

# keep only the numeric measurement columns, then discard incomplete rows
X = X.drop(columns=['city', 'year', 'weekofyear', 'week_start_date'])
X = X.dropna()

# remember which original rows survived so the dummies and labels can be aligned
rest_index = X.index

# standardize the remaining columns, preserving the original row labels
scalar = StandardScaler()
X = pd.DataFrame(scalar.fit_transform(X), columns=X.columns, index=rest_index)

# attach the (unscaled) dummy variables; the index-based join keeps surviving rows only
X = X.join(X_city).join(X_month)

# keep the labels of the surviving rows
y = y.loc[rest_index]

# renumber both frames from 0
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

X.to_csv('scaled_dropna_attemp/scaled_feature_drop_na_dengue.csv')

X

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12
0,-0.122207,-0.253846,-0.095140,-0.328502,-0.814598,-0.808198,-1.190146,-1.926304,-1.171085,0.122086,...,False,True,False,False,False,False,False,False,False,False
1,0.215513,0.064732,-0.580536,-0.587307,-0.573640,-0.340090,-0.634981,-0.901462,-0.837468,0.315202,...,False,False,True,False,False,False,False,False,False,False
2,-0.767304,0.319858,-0.649827,-0.407716,-0.302099,0.077752,-0.289418,0.087186,-0.958783,0.662810,...,False,False,True,False,False,False,False,False,False,False
3,-0.079130,0.917244,0.295500,0.352911,-0.746481,0.228552,-0.011835,0.004322,-0.685825,0.546941,...,False,False,True,False,False,False,False,False,False,False
4,0.403295,1.059203,0.613169,0.486861,-0.928126,0.618119,0.333727,0.345301,-0.534181,0.740057,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,1.449658,1.528992,0.682269,1.015140,0.178890,0.483027,1.211795,1.014877,1.831463,-0.418637,...,False,False,True,False,False,False,False,False,False,False
1195,0.145949,0.215499,-0.934097,0.233295,0.901068,-0.253170,0.118459,0.766286,1.467517,-1.422839,...,False,False,False,True,False,False,False,False,False,False
1196,0.766413,0.096898,0.601843,0.327518,0.263225,-1.522404,-1.309110,0.131951,0.557655,-1.229723,...,False,False,False,True,False,False,False,False,False,False
1197,1.386571,0.923083,0.985157,1.400717,0.280138,-1.707763,-1.365760,0.013847,0.739627,-1.422839,...,False,False,False,True,False,False,False,False,False,False


In [64]:
# write the labels next to the scaled feature file
y.to_csv('scaled_dropna_attemp/label_drop_na_dengue.csv')

# preview the labels that were just written
y

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6
...,...,...,...,...
1194,iq,2010,21,5
1195,iq,2010,22,8
1196,iq,2010,23,1
1197,iq,2010,24,1


### Pre-Processing without scaling

In [65]:
# reload the raw CSVs: X and y were overwritten by the scaled pre-processing above
X = pd.read_csv('original_datasets/dengue_features_train.csv')
y = pd.read_csv('original_datasets/dengue_labels_train.csv')

# preview the freshly loaded features
X

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.122600,0.103725,0.198483,0.177617,12.42,297.572857,...,32.00,73.365714,12.42,14.012857,2.628571,25.442857,6.900000,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.169900,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.032250,0.172967,0.157200,0.170843,34.54,298.781429,...,26.10,82.052857,34.54,16.848571,2.300000,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.90,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.196200,0.262200,0.251200,0.247340,7.52,299.518571,...,12.20,80.460000,7.52,17.210000,3.014286,28.942857,9.371429,35.0,23.9,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,iq,2010,21,2010-05-28,0.342750,0.318900,0.256343,0.292514,55.30,299.334286,...,45.00,88.765714,55.30,18.485714,9.800000,28.633333,11.933333,35.4,22.4,27.0
1452,iq,2010,22,2010-06-04,0.160157,0.160371,0.136043,0.225657,86.47,298.330000,...,207.10,91.600000,86.47,18.070000,7.471429,27.433333,10.500000,34.7,21.7,36.6
1453,iq,2010,23,2010-06-11,0.247057,0.146057,0.250357,0.233714,58.94,296.598571,...,50.60,94.280000,58.94,17.008571,7.500000,24.400000,6.900000,32.2,19.2,7.4
1454,iq,2010,24,2010-06-18,0.333914,0.245771,0.278886,0.325486,59.67,296.345714,...,62.33,94.660000,59.67,16.815714,7.871429,25.433333,8.733333,31.2,21.0,16.0


In [66]:
# create dummy variables for city and drop the original column
X = X.join(pd.get_dummies(X['city'])).drop(columns='city')


def set_month(week_start_date):
    """Return characters 5-6 of a date string, i.e. the 'MM' part of 'YYYY-MM-DD'."""
    return week_start_date[5: 7]


# create a new feature 'month' from the week start date
X['month'] = X['week_start_date'].apply(set_month)

# create dummy variables for month (columns are named '01' ... '12') and drop the original
X = X.join(pd.get_dummies(X['month'])).drop(columns='month')

# drop features that are not used for modelling
X = X.drop(columns=['year', 'weekofyear', 'week_start_date'])

# drop all rows with a missing value
X = X.dropna()

# select the labels of the rows that survived in X; use this cell's own index
# rather than a variable left over from another cell, so the section runs on its own
y = y.loc[X.index]

# renumber both frames from 0
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

X.to_csv('dropna_attemp/feature_drop_na_dengue.csv')

X

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,03,04,05,06,07,08,09,10,11,12
0,0.122600,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,...,False,True,False,False,False,False,False,False,False,False
1,0.169900,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,...,False,False,True,False,False,False,False,False,False,False
2,0.032250,0.172967,0.157200,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,...,False,False,True,False,False,False,False,False,False,False
3,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.310000,301.4,297.0,...,False,False,True,False,False,False,False,False,False,False
4,0.196200,0.262200,0.251200,0.247340,7.52,299.518571,299.664286,295.821429,301.9,297.5,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,0.342750,0.318900,0.256343,0.292514,55.30,299.334286,300.771429,296.825714,309.7,294.5,...,False,False,True,False,False,False,False,False,False,False
1195,0.160157,0.160371,0.136043,0.225657,86.47,298.330000,299.392857,296.452857,308.5,291.9,...,False,False,False,True,False,False,False,False,False,False
1196,0.247057,0.146057,0.250357,0.233714,58.94,296.598571,297.592857,295.501429,305.5,292.4,...,False,False,False,True,False,False,False,False,False,False
1197,0.333914,0.245771,0.278886,0.325486,59.67,296.345714,297.521429,295.324286,306.1,291.9,...,False,False,False,True,False,False,False,False,False,False


In [67]:
# write the labels next to the unscaled feature file
y.to_csv('dropna_attemp/label_drop_na_dengue.csv')

# preview the labels that were just written
y

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6
...,...,...,...,...
1194,iq,2010,21,5
1195,iq,2010,22,8
1196,iq,2010,23,1
1197,iq,2010,24,1
