### Import libraries and load datasets

In [96]:
import pandas as pd

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.metrics import mean_absolute_error

In [97]:
x_train = pd.read_csv('scaled_feature_drop_na_dengue.csv').drop('Unnamed: 0', axis = 1)
y_train = pd.read_csv('label_drop_na_dengue.csv')['total_cases']

correlated_col = list(x_train.columns)

train_df = x_train.join(y_train)

train_df

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,total_cases
0,-0.122207,-0.253846,-0.095140,-0.328502,-0.814598,-0.808198,-1.190146,-1.926304,-1.171085,0.122086,...,True,False,False,False,False,False,False,False,False,4
1,0.215513,0.064732,-0.580536,-0.587307,-0.573640,-0.340090,-0.634981,-0.901462,-0.837468,0.315202,...,False,True,False,False,False,False,False,False,False,5
2,-0.767304,0.319858,-0.649827,-0.407716,-0.302099,0.077752,-0.289418,0.087186,-0.958783,0.662810,...,False,True,False,False,False,False,False,False,False,4
3,-0.079130,0.917244,0.295500,0.352911,-0.746481,0.228552,-0.011835,0.004322,-0.685825,0.546941,...,False,True,False,False,False,False,False,False,False,3
4,0.403295,1.059203,0.613169,0.486861,-0.928126,0.618119,0.333727,0.345301,-0.534181,0.740057,...,False,True,False,False,False,False,False,False,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,1.449658,1.528992,0.682269,1.015140,0.178890,0.483027,1.211795,1.014877,1.831463,-0.418637,...,False,True,False,False,False,False,False,False,False,5
1195,0.145949,0.215499,-0.934097,0.233295,0.901068,-0.253170,0.118459,0.766286,1.467517,-1.422839,...,False,False,True,False,False,False,False,False,False,8
1196,0.766413,0.096898,0.601843,0.327518,0.263225,-1.522404,-1.309110,0.131951,0.557655,-1.229723,...,False,False,True,False,False,False,False,False,False,1
1197,1.386571,0.923083,0.985157,1.400717,0.280138,-1.707763,-1.365760,0.013847,0.739627,-1.422839,...,False,False,True,False,False,False,False,False,False,1


### Split train set and test set

In [98]:
train = train_df.sample(frac = 0.7, random_state = 1)

test = train_df.drop(train.index)

y_train = train['total_cases']
x_train = train.drop('total_cases', axis = 1)

y_test = test['total_cases']
x_test = test.drop('total_cases', axis = 1)

### ATTEMPT 1: Regular Linear model

In [99]:
model = LinearRegression().fit(x_train, y_train)

y_pred = model.predict(x_test)

mean_absolute_error(y_test, y_pred)

16.470504090165583

### ATTEMPT 2: Regular Linear model with RFE(10)

In [100]:
model = LinearRegression()

rfe = RFE(model, n_features_to_select = 10)

selected_features = rfe.fit(x_train, y_train)

selected_features = list(x_train.columns[rfe.support_])

selected_features

['reanalysis_air_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_relative_humidity_percent',
 'reanalysis_specific_humidity_g_per_kg',
 'iq',
 'sj',
 'month_03',
 'month_04',
 'month_05',
 'month_06']

In [101]:
model = LinearRegression().fit(x_train[selected_features], y_train)

y_pred = model.predict(x_test[selected_features])

mean_absolute_error(y_test, y_pred)

16.612227101314275

### ATTEMPT 2: RidgeCV Linear model with RFE(10)

In [102]:
ridge_cv = RidgeCV(alphas=[1, 10, 20], cv = 5)

rfe = RFE(ridge_cv, n_features_to_select = 10)

selected_features = rfe.fit_transform(x_train, y_train)

selected_features = list(x_train.columns[rfe.support_])

selected_features

['reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'iq',
 'sj',
 'month_03',
 'month_04',
 'month_05',
 'month_06',
 'month_09',
 'month_10']

In [103]:
ridge_cv.fit(x_train[selected_features], y_train)

best_alpha = ridge_cv.alpha_

best_alpha

10

In [104]:
y_pred = ridge_cv.predict(x_train[selected_features])

mae = mean_absolute_error(y_train, y_pred)

mae

15.449426183306283

### ATTEMPT 3: LASSO Linear model with RFE(10)

In [105]:
lasso_cv = LassoCV(alphas=[0.006, 0.01, 0.015], cv = 5)

rfe = RFE(lasso_cv, n_features_to_select = 10)

selected_features = rfe.fit_transform(x_train, y_train)

selected_features = list(x_train.columns[rfe.support_])

selected_features

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

['reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_specific_humidity_g_per_kg',
 'iq',
 'month_03',
 'month_04',
 'month_05',
 'month_06',
 'month_10']

In [106]:
lasso_cv.fit(x_train[selected_features], y_train)

best_alpha = lasso_cv.alpha_

best_alpha

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


0.006

In [107]:
y_pred = lasso_cv.predict(x_train[selected_features])

mae = mean_absolute_error(y_train, y_pred)

mae

14.997102848427636