# Environment setup

In [191]:
# Imports
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
import csv

import warnings
warnings.filterwarnings('ignore')


# Configuration

In [192]:
# !pwd
# /Source_14012023_v4/preprocessing_data

In [193]:
# Project-relative directories. Each value ends with "/" because
# output_process and output_featureselection are joined to file names by
# plain string concatenation elsewhere in this notebook.
prj_path = '../'
data_path = f"{prj_path}data/new_data/DH/squeezed/"
prj_path_opt = f"{prj_path}optimize_hyperparam/opt_results/"
output_process = f"{prj_path}data/new_data/DH/processed_data/"
output_featureselection = f"{prj_path}data/new_data/DH/feature_selection/"

In [194]:
# Maps the integer flag passed to normalizationMinMax onto the split name:
# "train" fits and saves a scaler, any other value reuses the saved one.
data_set = {0 : "train", 1 : "test"}

In [195]:
# Location names (called "cities" throughout this notebook). Each name is
# inserted verbatim into the file names that the functions below read and
# write, so the spelling here must match the files on disk exactly.
all_cities = [
        'An Giang', 'BR Vũng Tàu', 'Bình Phước', 'Bình Thuận', 'Bình Định',
        'Bạc Liêu', 'Bắc Kạn', 'Bắc Giang', 'Cao Bằng', 'Cà Mau',
        'Cần Thơ', 'Gia Lai', 'Hà Giang', 'Hà Nội', 'Hà Tĩnh',
        'Hòa Bình','Hưng Yên', 'Hải Dương', 'Hải Phòng', 'Khánh Hòa', 'Kiên Giang',
        'Kon Tum', 'Lai Châu', 'Long An', 'Lào Cai', 'Lâm Đồng',
        'Lạng Sơn','Nam Định', 'Nghệ An', 'Ninh Bình', 'Ninh Thuận',
        'Phú Thọ', 'Phú Yên', 'Quảng Bình', 'Quảng Nam', 'Quảng Ngãi',
        'Quảng Ninh', 'Quảng Trị', 'Sóc Trăng', 'Sơn La', 'TT Huế',
        'Thanh Hóa', 'Thái Bình', 'Thái Nguyên', 'Tiền Giang', 'Trà Vinh',
        'Tuyên Quang', 'Tây Ninh', 'Vĩnh Phúc', 'Yên Bái', 'Điện Biên',
        'Đà Nẵng', 'Đắk Nông', 'Đắk Lắk', 'Đồng Tháp'
]
# Alias of the same list object (not a copy); the functions below iterate
# over `cities`.
cities = all_cities

In [196]:
# Set hyperparameters as args using the Configuration class
class Configuration:
    """Fixed experiment settings, exposed as plain instance attributes."""

    def __init__(self):
        # --- data windowing -------------------------------------------
        # The test set spans 36 months = 3 years.
        self.test_size = 36
        # Look at the previous 3 months of data when forecasting.
        self.look_back = 3
        # Forecast n steps ahead, for horizons up to 6 months.
        self.n_predicted_period_months = 6
        # Passed to featureSelection as the number of features to keep.
        self.n_features = 3

        # --- values not read by the cells visible in this part --------
        # presumably consumed by later model-training code -- TODO confirm
        self.seed = 42
        # Original author's note: each element x of the supervised set has
        # size 16 = 16 months.
        self.batch_size = 16
        self.epochs = 300


# Shared settings object used by the functions and cells below.
args = Configuration()

# Supporting functions

In [197]:
def get_dict_all_city_data():
  """Load the squeezed Excel sheet of every city and trim it to pre-2017 rows.

  Reads ``squeezed_<city>.xlsx`` from ``data_path`` for each entry of the
  module-level ``cities`` list and keeps only the rows whose ``year_month``
  is earlier than 2017-01-01 (the original note describes the result as
  "all data from all city in 1997 - 2016").

  Returns:
    dict mapping city name -> filtered DataFrame.
  """
  cities_data = {}
  for city in cities:
    # data_path is the configured "squeezed" directory; using it keeps the
    # input location defined in a single place.
    city_result = pd.read_excel(data_path + 'squeezed_' + city + '.xlsx')
    city_result = city_result.loc[city_result['year_month'] < '2017-1-1']
    cities_data[city] = city_result
  return cities_data

In [198]:
# Define data (pre-)processing functions
# modification
def get_city_data(city_name, dict_full_data):
    """Return one city's frame with the unused columns removed.

    Drops the Diarrhoea and Influenza columns, the Dengue case counts and
    the ``province`` / ``year`` / ``month`` columns; every other column is
    kept. A new DataFrame is returned, so the frame stored in
    ``dict_full_data`` is not modified.

    Args:
      city_name: key into ``dict_full_data``.
      dict_full_data: dict mapping city name -> DataFrame.

    Raises:
      KeyError: if the city, or any of the columns to drop, is missing.
    """
    unused_columns = [
        'Diarrhoea_cases', 'Diarrhoea_rates', 'province',
        'Influenza_rates', 'Influenza_cases',
        'Dengue_fever_cases', 'year', 'month',
    ]
    full_frame = dict_full_data[city_name]
    return full_frame.drop(columns=unused_columns)

def convert_to_stationary(city_data):
    """First-difference every column in place, except 'Diarrhoea_rates'.

    Each processed column is replaced by ``value - previous value``, so its
    first row becomes NaN. A column that cannot be differenced is left
    unchanged and its name is printed (best effort).

    NOTE(review): only 'Diarrhoea_rates' is skipped. In this notebook the
    frames passed in contain 'Dengue_fever_rates' instead, so that column is
    differenced as well -- confirm this is intended.

    Args:
      city_data: DataFrame to transform; it is modified in place.

    Returns:
      The same DataFrame object that was passed in.
    """
    for col_name in city_data.columns:
        if col_name == 'Diarrhoea_rates':
            continue
        try:
            city_data[col_name] = city_data[col_name] - city_data[col_name].shift()
        except Exception:
            # Narrower than a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate; the column is reported and skipped.
            print(col_name)
    return city_data

def impute_missing_value(city_data):
    """Fill NaN values column by column, in row order, modifying the frame in place.

    Fill rule, by row position ``i`` of the missing value:
      * ``i < 12``        -> 0
      * ``12 <= i <= 24`` -> the value 12 rows earlier
      * ``i > 24``        -> the smaller of the values 12 and 24 rows earlier

    Gaps are filled from top to bottom, so a value that was itself imputed
    can be reused by a later gap. Columns without NaN are left untouched
    (their dtype is preserved). Columns are expected to be numeric.

    Args:
      city_data: DataFrame to impute; it is modified in place.

    Returns:
      The same DataFrame object that was passed in.
    """
    for col in city_data.columns:
        # Work on a writable copy of the column and assign it back once.
        # The previous chained form (`city_data[col].iloc[i] = ...`) writes
        # to a temporary object under pandas Copy-on-Write and is lost.
        values = city_data[col].to_numpy(copy=True)
        missing_positions = np.flatnonzero(np.isnan(values))
        if len(missing_positions) == 0:
            continue
        for index in missing_positions:  # ascending, so earlier gaps are already filled
            if index < 12:
                values[index] = 0
            elif index <= 24:
                values[index] = values[index - 12]
            else:
                values[index] = min(values[index - 12], values[index - 24])
        city_data[col] = values
    return city_data

In [199]:
def clean_full_data(dict_full_data):
    """Impute, difference and trim every city's feature frame.

    For each entry of the module-level ``cities`` list: take the city's
    frame via ``get_city_data``, keep the climate columns plus
    ``Dengue_fever_rates``, fill missing values, first-difference the
    columns, drop the rows that still contain NaN, and re-attach
    ``year_month`` (matched on the row index).

    Args:
      dict_full_data: dict mapping city name -> raw DataFrame. Its entries
        are overwritten with the cleaned frames.

    Returns:
      The same dict object that was passed in.
    """
    feature_columns = [
        'Total_Evaporation',
        'Total_Rainfall',
        'Max_Daily_Rainfall',
        'n_raining_days',
        'Average_temperature',
        'Max_Average_Temperature',
        'Min_Average_Temperature',
        'Max_Absolute_Temperature',
        'Min_Absolute_Temperature',
        'Average_Humidity',
        'Min_Humidity',
        'n_hours_sunshine',
        'Dengue_fever_rates',
    ]
    for city in cities:
        reduced = get_city_data(city_name=city, dict_full_data=dict_full_data)
        # Both helpers modify and return the frame they are given.
        features = convert_to_stationary(impute_missing_value(reduced[feature_columns]))
        features.dropna(inplace=True)
        # Index-aligned assignment: only dates of the surviving rows are attached.
        features.loc[:, "year_month"] = reduced["year_month"]
        dict_full_data[city] = features
    return dict_full_data


In [248]:
def split_data(data, look_back, n_nextstep = args.n_predicted_period_months):
    """Cut ``data`` into a leading train part and a trailing test part.

    The train part is everything except the last ``args.test_size`` rows.
    The test part is the last ``args.test_size + look_back + n_nextstep - 1``
    rows, so it starts ``look_back + n_nextstep - 1`` rows before the end of
    the train part and the two parts overlap by that many rows.

    Note: the default of ``n_nextstep`` is read from ``args`` once, when this
    ``def`` is executed; re-run the cell after changing ``args``.

    Returns:
      (train, test) slices of ``data``.
    """
    holdout = args.test_size
    test_rows = holdout + look_back + (n_nextstep - 1)
    return data[:-holdout], data[-test_rows:]

In [201]:
def to_supervised(data,  d_out, d_in, features_list=[]):
    """
    Turn a 2-D time-series array into sliding (input window, target) pairs.

    For every start row whose window and target both fit inside ``data``,
    the input is rows ``start .. start + d_in - 1`` and the target is the
    last column's value at row ``start + d_in + d_out - 1``.

    Args:
      data: 2-D NumPy array, one row per time step, **last column is the
        disease incidence**
      d_out: how many steps after the window the target lies
      d_in: lookback window length
      features_list: column indices to keep in X; empty keeps every column

    Returns:
      (X, y) NumPy arrays; with at least one window X has shape
      (n_windows, d_in, n_columns) and y has shape (n_windows, 1)
    """
    total_rows = len(data)
    # An empty selection means "all columns".
    column_selector = features_list if len(features_list) != 0 else slice(None)

    inputs, targets = [], []
    for start in range(total_rows):
        window_end = start + d_in
        target_end = window_end + d_out
        if target_end > total_rows:
            continue
        inputs.append(data[start:window_end, column_selector])
        targets.append(data[target_end - 1:target_end, -1])
    return np.array(inputs), np.array(targets)

In [202]:
def featureSelection(train, next_predicted_month, n_selection_feature, n_selection_feature_by_day = 18): #max args.look_back * num_feature = 36 => take half
  """Pick the features whose lagged copies RFE keeps most often.

  The training array is framed with ``to_supervised`` (window length
  ``args.look_back``), each window is flattened to one row of
  ``look_back * D`` lagged columns, and RFE around a random forest keeps
  ``n_selection_feature_by_day`` of those columns. Every kept column counts
  as one vote for its underlying feature; the ``n_selection_feature``
  features with the most votes are returned.

  Args:
    train: 2-D NumPy array whose last column is the target.
    next_predicted_month: forecast horizon passed to ``to_supervised`` as ``d_out``.
    n_selection_feature: number of feature indices to return.
    n_selection_feature_by_day: number of lagged columns RFE keeps.

  Returns:
    Sorted NumPy array of column indices into ``train``.
  """
  train_X, train_y = to_supervised(train, d_out=next_predicted_month, d_in=args.look_back)

  D = train_X.shape[2]
  # Seed the forest so repeated runs select the same features.
  rfe = RFE(RandomForestRegressor(random_state=args.seed), n_features_to_select=n_selection_feature_by_day)
  # ravel(): to_supervised returns y as a column vector; pass it as 1-D.
  fit = rfe.fit(train_X.reshape(len(train_X), D * args.look_back), train_y.ravel())

  # Flattened column i holds feature i % D (at lag position i // D).
  kept_columns = np.flatnonzero(fit.support_)
  calMostFeature = np.bincount(kept_columns % D, minlength=D)
  top_idx = np.sort(np.argsort(calMostFeature)[-n_selection_feature:])
  return top_idx

In [203]:
def normalizationMinMax(df,city,data_set_index):
    """Min-max scale one split of a city's data and save it as CSV.

    For the "train" split a new MinMaxScaler is fitted on ``df`` and saved
    under ``output_process``. For any other split the saved train scaler is
    loaded and applied, and the last column is then put back to its original
    (unscaled) values.

    The result is written to ``<city>_<split>_preprocessed.csv`` with the
    index of ``df`` stored as a trailing ``year_month`` column. When loading
    that file, use ``df.iloc[:, :-1].to_numpy()`` to get the numeric array.

    Args:
      df: DataFrame to scale; its index becomes the ``year_month`` column.
      city: city name used in the scaler and CSV file names.
      data_set_index: key into the module-level ``data_set`` dict.

    Returns:
      (df_scaled, scaler): the scaled frame (default integer index) and the
      scaler that produced it.
    """
    norm_set = data_set[data_set_index]
    scaler_path = output_process+city+'_train_scalerMinMaxNorm.save'
    is_train = norm_set == "train"

    if is_train:
        scaler = MinMaxScaler()
        series = scaler.fit_transform(df)
        joblib.dump(scaler, scaler_path)
    else:
        # Requires the train split of this city to have been processed first.
        scaler = joblib.load(scaler_path)
        series = scaler.transform(df)

    df_scaled = pd.DataFrame(data = series, columns = df.columns)
    if not is_train:
        # df has its own index while df_scaled has a fresh integer index, so
        # copy the raw values by position instead of assigning a DataFrame.
        df_scaled.iloc[:, -1] = df.iloc[:, -1].to_numpy()

    df_scaled["year_month"] = df.index
    df_scaled.to_csv(output_process+city+'_'+norm_set+'_preprocessed.csv', float_format='%.7f',index=False)
    return df_scaled,scaler

# Run data processing

In [249]:
# Load every city's sheet, clean it, then write the scaled train/test CSVs.
dict_full_data = get_dict_all_city_data()
# clean_full_data overwrites the entries of the dict it is given and returns it.
full_data = clean_full_data(dict_full_data=dict_full_data)

for city in cities:
    # Index by date: normalizationMinMax writes the index back as "year_month".
    specific_data = full_data[city].set_index("year_month")
    train, test = split_data(specific_data, args.look_back)
    # Order matters: the train call saves the scaler that the test call loads.
    df_train, scaler = normalizationMinMax(train, city, 0)
    df_test, scaler = normalizationMinMax(test, city, 1)

In [247]:
# Show the date-indexed frame left in `specific_data` by the processing loop
# (after a full run this is the last city iterated).
specific_data

Unnamed: 0_level_0,Total_Evaporation,Total_Rainfall,Max_Daily_Rainfall,n_raining_days,Average_temperature,Max_Average_Temperature,Min_Average_Temperature,Max_Absolute_Temperature,Min_Absolute_Temperature,Average_Humidity,Min_Humidity,n_hours_sunshine,Dengue_fever_rates
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1997-02-28,-14.9,17.3,9.0,3.0,1.4,1.3,1.7,0.7,1.5,-2.0,4.0,-75.2,-7.960848
1997-03-31,37.9,-17.7,-9.4,-4.0,0.7,1.6,-0.1,1.8,-0.1,-3.0,-6.0,90.2,-0.717781
1997-04-30,6.5,66.4,32.1,7.0,1.5,2.0,1.5,1.7,1.8,2.0,-5.0,-33.8,-0.065253
1997-05-31,-63.2,99.2,2.4,8.0,0.0,-0.8,0.5,-0.4,0.0,2.0,5.0,-24.9,1.109299
1997-06-30,47.2,-119.9,-17.0,-4.0,-0.3,-0.4,-0.2,-0.6,0.1,3.0,2.0,20.9,-3.001631
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08-31,23.0,325.0,-45.0,-5.0,0.9,0.5,1.0,0.7,0.9,0.0,3.0,-7.0,4.089374
2016-09-30,-23.0,-204.0,69.0,6.0,-0.7,-0.7,-0.4,-0.1,-0.9,-2.0,-6.0,-18.0,5.274699
2016-10-31,-17.5,-5.0,-19.0,2.0,-0.7,-1.5,-0.5,-1.9,0.0,4.0,14.0,-72.0,-2.252119
2016-11-30,9.7,-246.0,-21.0,-15.0,0.6,0.7,0.3,-0.2,0.0,-4.0,-4.0,100.0,-1.718722


# Run feature selection

In [206]:
# Writes one CSV per forecast horizon (1 .. args.n_predicted_period_months),
# each with one row per city naming the selected features.
fields = ['City', 'Target', '1st_Feature', '2nd_Feature', '3rd_Feature'] #, '3rd_Feature'
# The header has two leading columns (City, Target); the rest are feature slots.
n_feature_columns = len(fields) - 2
for next_predicted_month in range(1,args.n_predicted_period_months+1):
    filename = output_featureselection+str(next_predicted_month)+"step_feature_selection_3_most.csv"
    # newline='' is what the csv module expects for files handed to csv.writer;
    # explicit UTF-8 keeps the Vietnamese city names intact whatever the
    # platform's default encoding is.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        for city in cities:
            df = pd.read_csv(output_process+city+'_train_preprocessed.csv', parse_dates=True, index_col= None, encoding = 'unicode_escape')
            # The last column is year_month; everything before it is numeric.
            train = df.iloc[:,:-1].to_numpy()
            important_features = featureSelection(train, next_predicted_month, args.n_features, n_selection_feature_by_day = 18)
            # Take the names from the file just read, so the indices refer to
            # exactly the columns of `train` and no in-memory state is needed.
            list_features = df.columns[:-1].tolist()
            selected_features = [list_features[idx] for idx in important_features[:n_feature_columns]]
            rows = [city, 'Dengue_fever_rates', *selected_features] #,third_feature
            csvwriter.writerow(rows)
            print("-------------Feature selection processing------------")
            print("----> ",next_predicted_month," step")
            print("--> ",city)
            print(selected_features)

-------------Feature selection processing------------
---->  1  step
-->  An Giang
['Total_Evaporation', 'Total_Rainfall', 'Dengue_fever_rates']
-------------Feature selection processing------------
---->  1  step
-->  BR Vũng Tàu
['Max_Absolute_Temperature', 'Min_Humidity', 'Dengue_fever_rates']
-------------Feature selection processing------------
---->  1  step
-->  Bình Phước
['Max_Daily_Rainfall', 'Min_Average_Temperature', 'Dengue_fever_rates']
-------------Feature selection processing------------
---->  1  step
-->  Bình Thuận
['Max_Absolute_Temperature', 'Average_Humidity', 'Dengue_fever_rates']
-------------Feature selection processing------------
---->  1  step
-->  Bình Định
['Max_Daily_Rainfall', 'Min_Humidity', 'Dengue_fever_rates']
-------------Feature selection processing------------
---->  1  step
-->  Bạc Liêu
['Max_Absolute_Temperature', 'Average_Humidity', 'Dengue_fever_rates']
-------------Feature selection processing------------
---->  1  step
-->  Bắc Kạn
['n_rain