In [None]:
import config
import functions.functions
# Data processing
import numpy as np
import pandas as pd
import datetime
from fredapi import Fred 
import yfinance as yahooFinance

# Suppress noisy library warnings
import warnings
warnings.filterwarnings('ignore')


Load Data

In [None]:
# Load the three data sets through the project helper; the call returns them
# as a 3-tuple, and drop_first_row=False is forwarded to the helper unchanged.
CHR2, CHR8, CHRt = functions.functions.load_csv(drop_first_row=False)

# Add French Hourly Wage and CAC 40 Data

In [None]:
# --- External macro series (sources: FRED and Yahoo Finance) ---
# Credentials must not be stored in the notebook. When no api_key is passed,
# fredapi reads the key from the FRED_API_KEY environment variable, so export
# that variable before starting the kernel.
fred = Fred()

startDate = datetime.datetime(2008, 10, 1)
endDate = datetime.datetime(2017, 3, 31)


def _quarterly_log_mean(series, column_name):
    """Return the per-quarter mean of log(series) as a one-column DataFrame.

    Parameters
    ----------
    series : pandas.Series
        Series indexed by dates.
    column_name : str
        Name given to the single output column.

    Returns
    -------
    pandas.DataFrame
        One row per calendar quarter (PeriodIndex with freq 'Q').
    """
    log_frame = np.log(series).to_frame(name=column_name)
    quarters = pd.PeriodIndex(log_frame.index, freq='Q')
    # Rows are grouped by default; the explicit `axis=0` argument is deprecated
    # in recent pandas releases, so it is intentionally not passed.
    return log_frame.groupby(quarters).mean()


# CAC 40 index closing prices (Yahoo Finance ticker ^FCHI).
CAC40 = _quarterly_log_mean(
    yahooFinance.Ticker("^FCHI").history(start=startDate, end=endDate).Close,
    "CAC40",
)

# French hourly wage rate: https://fred.stlouisfed.org/series/LCWRTT01FRQ661N (Source: FRED)
h_wage = _quarterly_log_mean(
    fred.get_series('LCWRTT01FRQ661N', observation_start=startDate, observation_end=endDate),
    "h_wage",
)


# Merge an external series (CAC 40, French hourly wage rate) into a data set
def add_external_data(data, external_data):
    """Append the columns of ``external_data`` to ``data``, aligned on the index.

    The index of ``external_data`` is converted to strings before the join so
    that it can be matched against the labels of ``data``. Rows that contain a
    missing value in any column after the join are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Base data set.
    external_data : pandas.DataFrame
        Series to attach. It is not modified by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of both inputs.
    """
    # Convert the index on a copy: assigning to external_data.index directly
    # would silently change the caller's object for every later use.
    aligned = external_data.copy()
    aligned.index = aligned.index.astype("str")
    return pd.concat([data, aligned], axis=1).dropna()

# Attach both external series to every data set: CAC 40 first, then the hourly
# wage rate, so the new columns keep that order.
CHR2, CHR8, CHRt = [
    add_external_data(add_external_data(frame, CAC40), h_wage)
    for frame in (CHR2, CHR8, CHRt)
]

In [None]:
# Raw strings keep the Windows path byte-for-byte while avoiding the invalid
# escape sequences (\C, \J, \D) that newer Python versions warn about.
CHR2.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR2.csv")
CHR8.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR8.csv")
CHRt.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHRt.csv")

# Add De-temporalization Features

In [None]:
def DeTemporalizationGeneration(data, window=2):
    """Flatten a look-back window into columns by appending lagged feature copies.

    The frame is first restricted and reordered to a fixed feature layout
    followed by ``CHRONIQUE``. For every lag ``window, ..., 1`` a row-shifted
    copy of the features is added under the name ``t-<lag>_<feature>``; the
    lag-0 copy keeps the original names. ``CHRONIQUE`` is re-attached last,
    without any lag.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Columns outside the fixed layout are discarded by the
        reindex. Layout columns missing from ``data`` come back as all-NaN,
        which makes every row fall out at the NaN-dropping step.
    window : int, default 2
        Largest lag (in rows) to include.

    Returns
    -------
    pandas.DataFrame
        Lagged features followed by ``CHRONIQUE``; rows with any missing value
        (in particular the first ``window`` rows) are removed.
    """
    # Fixed feature layout: eight statistics for each of the groups 1..8,
    # followed by the categorical dummies, the macro variables and 'DR'.
    stat_names = ('mean', 'median', 'p5', 'p10', 'p25', 'p75', 'p90', 'p95')
    feature_names = [
        '{}_{}'.format(stat, group)
        for group in range(1, 9)
        for stat in stat_names
    ]
    feature_names += [
        'CD_TY_CLI_RCI_1', 'CD_TY_CLI_RCI_2',
        'CD_ETA_CIV_1', 'CD_ETA_CIV_2',
        'CD_MOD_HABI_1', 'CD_MOD_HABI_2',
        'CD_PROF_1', 'CD_PROF_2', 'CD_PROF_3',
        'CD_QUAL_VEH_1', 'CD_QUAL_VEH_2',
        'PIB', 'Inflation', 'Tx_cho', 'h_wage', 'DR',
    ]

    # NOTE(review): 'CAC40' is not part of this layout, so a CAC40 column in
    # `data` is dropped here - confirm that this is intended.
    data = data.reindex(columns=feature_names + ['CHRONIQUE'])

    # Everything except the trailing 'CHRONIQUE' column gets lagged.
    features = data.iloc[:, :-1]

    lags = list(range(window, -1, -1))
    pieces = [pd.DataFrame(index=data.index)]
    pieces.extend(features.shift(lag) for lag in lags)
    df = pd.concat(pieces, axis=1, ignore_index=True)
    df.dropna(inplace=True)

    # Oldest lag first; the current (lag 0) values keep their plain names.
    df.columns = [
        name if lag == 0 else "t-{}_".format(lag) + name
        for lag in lags
        for name in feature_names
    ]

    df = pd.concat([df, data["CHRONIQUE"]], axis=1)
    df.dropna(inplace=True)

    return df

In [None]:
# Build the lagged ("de-temporalized") variant of each data set using a
# look-back of two rows.
CHR2_DeTemp, CHR8_DeTemp, CHRt_DeTemp = [
    DeTemporalizationGeneration(frame, window=2)
    for frame in (CHR2, CHR8, CHRt)
]

In [None]:
# Raw strings keep the Windows path byte-for-byte while avoiding the invalid
# escape sequences (\C, \J, \D) that newer Python versions warn about.
CHR2_DeTemp.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR2_DeTemp.csv")
CHR8_DeTemp.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR8_DeTemp.csv")
CHRt_DeTemp.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHRt_DeTemp.csv")

# Polynomial Generation

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
import copy

def PolyGene(X):
    """Generate polynomial features for ``X`` and list those with non-zero variance.

    Rows containing missing values are ignored. All columns except ``DR`` and
    ``CHRONIQUE`` are handed to ``functions.functions.Polynomial_Features``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data; must contain the ``DR`` and ``CHRONIQUE`` columns. It is
        not modified by this function.

    Returns
    -------
    Poly_X : pandas.DataFrame
        The generated features, re-indexed with the NaN-free row labels of
        ``X``. Zero-variance columns are NOT removed from this frame.
    Poly_cols : list
        Names of the columns of ``Poly_X`` whose variance is strictly positive.
    """
    # Drop incomplete rows on a new object instead of mutating the caller's frame.
    X = X.dropna()
    feature_names = X.columns.drop(["DR", "CHRONIQUE"])

    # The literal 2 is presumably the polynomial degree - confirm against
    # functions.functions.Polynomial_Features. The same frame is passed for
    # both data arguments and only the first returned value is used here.
    Poly_X, _, _, _ = functions.functions.Polynomial_Features(
        feature_names, 2, X, X, multi=False
    )
    Poly_X.index = X.index

    # Default VarianceThreshold: features taking the same value in all samples
    # have zero variance and are excluded from the returned name list.
    selector = VarianceThreshold()
    selector.fit(Poly_X)
    Poly_cols = list(Poly_X.columns[selector.variances_ > 0])

    return Poly_X, Poly_cols

def Poly_f_filtre(Poly_X, y, Poly_cols, alpha=0.01):
    """Keep the candidate columns whose univariate F-test p-value is below ``alpha``.

    Parameters
    ----------
    Poly_X : pandas.DataFrame
        Feature frame containing at least the columns in ``Poly_cols``.
    y : array-like
        Regression target, one value per row of ``Poly_X``.
    Poly_cols : list
        Candidate column names to test.
    alpha : float, default 0.01
        Significance threshold; a column is kept when its p-value is strictly
        smaller.

    Returns
    -------
    list
        The retained column names, in the order of ``Poly_cols``.
    """
    if not Poly_cols:
        return []

    # Score exactly the candidate columns. Scoring every column of Poly_X and
    # zipping the result with the (possibly shorter) Poly_cols would pair
    # p-values with the wrong names as soon as one column had been filtered out.
    _, p_values = f_regression(Poly_X[Poly_cols], y)

    return [
        colname
        for p_value, colname in zip(p_values, Poly_cols)
        if p_value < alpha
    ]

def Polyt_MI_filtre(Poly_X, y, Poly_cols, random_state=0):
    """Keep the candidate columns whose mutual information with ``y`` is not negligible.

    A column is kept when its estimated mutual information is strictly greater
    than 10% of the mean mutual information over all candidate columns.

    Parameters
    ----------
    Poly_X : pandas.DataFrame
        Feature frame containing at least the columns in ``Poly_cols``.
    y : array-like
        Regression target, one value per row of ``Poly_X``.
    Poly_cols : list
        Candidate column names to score.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression``. The estimator is
        randomized, so a fixed seed makes the selection reproducible between
        runs; pass ``None`` to get unseeded behaviour.

    Returns
    -------
    list
        The retained column names, in the order of ``Poly_cols``.
    """
    if not Poly_cols:
        return []

    mi_scores = mutual_info_regression(Poly_X[Poly_cols], y, random_state=random_state)
    mi_threshold = mi_scores.mean() * 0.1

    return [
        colname
        for score, colname in zip(mi_scores, Poly_cols)
        if score > mi_threshold
    ]

def PolyGene_pipeline(Data):
    """Append the polynomial features that pass both the F-test and the MI filter.

    Parameters
    ----------
    Data : pandas.DataFrame
        Input data set; must contain the column named by ``config.label``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``Data`` with the selected polynomial feature columns appended on the
        right (aligned on the index).
    """
    # Work on a deep copy so that nothing done downstream can touch `Data`.
    working = Data.copy(deep=True)
    Data_Poly, Poly_cols = PolyGene(working)

    # Target restricted to the complete rows.
    y = working.dropna()[config.label]
    f_reg_cols = Poly_f_filtre(Data_Poly, y, Poly_cols)
    mi_cols = set(Polyt_MI_filtre(Data_Poly, y, Poly_cols))

    # Ordered, de-duplicated intersection. Building the list from a set
    # intersection would make the output column order depend on string
    # hashing, i.e. change from one kernel session to the next.
    Poly_cols_select = [col for col in dict.fromkeys(f_reg_cols) if col in mi_cols]

    return pd.concat([Data, Data_Poly[Poly_cols_select]], axis=1)

For Normal Data

In [None]:
# Polynomial feature selection on the base data sets.
CHR2_Poly = PolyGene_pipeline(CHR2)
CHR8_Poly = PolyGene_pipeline(CHR8)
CHRt_Poly = PolyGene_pipeline(CHRt)
# Raw strings keep the Windows path byte-for-byte while avoiding the invalid
# escape sequences (\C, \J, \D) that newer Python versions warn about.
CHR2_Poly.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR2_Poly.csv")
CHR8_Poly.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR8_Poly.csv")
CHRt_Poly.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHRt_Poly.csv")

For De-temporalization Data

In [None]:
# Polynomial feature selection on the de-temporalized (lagged) data sets.
CHR2_Poly_DeTemp = PolyGene_pipeline(CHR2_DeTemp)
CHR8_Poly_DeTemp = PolyGene_pipeline(CHR8_DeTemp)
CHRt_Poly_DeTemp = PolyGene_pipeline(CHRt_DeTemp)
# Raw strings keep the Windows path byte-for-byte while avoiding the invalid
# escape sequences (\C, \J, \D) that newer Python versions warn about.
CHR2_Poly_DeTemp.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR2_Poly+DeTemp.csv")
CHR8_Poly_DeTemp.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHR8_Poly+DeTemp.csv")
CHRt_Poly_DeTemp.to_csv(r"D:\0 - Google Drive\Coding\JupyterNotebook\Drim Game\Data\CHRt_Poly+DeTemp.csv")