# Challenge ANAP

Quelques pistes pour exploiter le fichier `data2.csv`

In [9]:
import os 
data_folder = os.getcwd()

In [2]:
import pandas as pd
train_data = pd.read_csv(os.path.join(data_folder, 'data2.csv'), sep=';', low_memory=False, encoding='latin-1')
test_data = pd.read_csv(os.path.join(data_folder, 'test2.csv'), sep=';', low_memory=False, encoding='latin-1')

In [3]:
target = train_data['cible1']
train_data.drop('cible1', inplace=True, axis=1)
piv_train = train_data.shape[0]

train_data.columns = ['finess', 'Raison sociale', 'Provenance des patients (département)',
       'Domaines d activités', 'âge (deux classes >75 ans, <= 75 ans)',
       'Nombre de séjours/séances MCO des patients en ALD',
       'Nombre total de séjours/séances', 'annee']

test_data.columns = ['finess', 'Raison sociale', 'Provenance des patients (département)',
       'Domaines d activités', 'âge (deux classes >75 ans, <= 75 ans)',
       'Nombre de séjours/séances MCO des patients en ALD',
       'Nombre total de séjours/séances', 'annee']

In [4]:
train_data.shape

(1048575, 8)

### Processing the categorical variables 

#### One-hot-Encoding

In [113]:
import numpy as np
#First set : merge test and train data to get the same processing 
df_all_1 = pd.concat((train_data, test_data), axis=0, ignore_index=True)

# define the categorical columns 
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
non_numeric_columns = df_all_1.select_dtypes(exclude=numerics).columns.difference(['finess', 'Raison sociale' ])

# Mapping the categorical columns to numerical using one-hot-encoding 
for col in non_numeric_columns : 
    col_dummy = pd.get_dummies(df_all_1[col]).astype(np.int8)
    df_all_1 = pd.concat([df_all_1, col_dummy], axis=1)
    df_all_1.drop(col, axis=1, inplace=True)
    del col_dummy

# Recreate train / test
train_oe, test_oe = df_all_1[:piv_train], df_all_1[piv_train:]

#### Label Encoder 

In [114]:
from sklearn.preprocessing import Imputer, LabelEncoder
df_all_2 = pd.concat((train_data, test_data), axis=0, ignore_index=True)

# Mapping the categorical columns to numerical classes 
le = LabelEncoder()
for col in non_numeric_columns : 
        df_all_2[col] = le.fit_transform(df_all_2[col])
        
# Recreate train / test
train_le, test_le = df_all_2[:piv_train], df_all_2[piv_train:]

In [115]:
non_numeric_columns

Index(['Domaines d activités', 'Provenance des patients (département)',
       'âge (deux classes >75 ans, <= 75 ans)'],
      dtype='object')

#### Hash encoder 

In [117]:
def hash_function(modalites): 
    return [hash(m) % 2000000 for m in modalities]

In [118]:
from sklearn.preprocessing import Imputer, LabelEncoder
df_all_3 = pd.concat((train_data, test_data), axis=0, ignore_index=True)

# Mapping the categorical columns to numerical classes 
for col in non_numeric_columns : 
        modalities = df_all_3[col]
        df_all_3[col] = hash_function(modalities)
        
# Recreate train / test
train_he, test_he = df_all_3[:piv_train], df_all_3[piv_train:]

In [81]:
# check if there are some collisions 
for col in non_numeric_columns:
    if train_he[col].nunique() != train_data[col].nunique():
        print('encoding fail')

#### Aggregate the three data reprensations  

In [121]:
df = {
    'hash-encoding' : [train_he.drop(['finess', 'Raison sociale'], axis=1), test_he],
    'Label-encoding' : [train_le.drop(['finess', 'Raison sociale'], axis=1), test_le],
    'one-hot-encoding' : [train_oe.drop(['finess', 'Raison sociale'], axis=1), test_oe]
}

### Classification task : 

#### Random forest : 

In [130]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn import metrics

In [138]:
def training_benchmark(clf, X, y): 
    scores = np.sqrt(np.abs(cross_val_score(clf, X, y, scoring='mean_squared_error', cv=5, n_jobs=-1)))
    return scores.mean()

In [132]:
training_benchmark(RandomForestRegressor(), df['hash-encoding'][0], target)

0.047934019166062783

In [133]:
training_benchmark(RandomForestRegressor(), df['Label-encoding'][0], target)

0.047863614549113961

In [134]:
training_benchmark(RandomForestRegressor(), df['one-hot-encoding'][0], target)

0.046931932908946462

#### Gradient boosting :

In [139]:
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor(n_estimators=500, max_depth=5, random_state=42, 
                                learning_rate=0.01, min_samples_split=100, subsample=0.8)

In [140]:
training_benchmark(clf, df['hash-encoding'][0], target)

0.044713055405507497

In [141]:
training_benchmark(clf, df['Label-encoding'][0], target)

0.044758623891831814

In [142]:
training_benchmark(clf, df['one-hot-encoding'][0], target)

0.044749014471170488

#### Linear regression

In [150]:
from sklearn.linear_model import Lasso
lr = Lasso(random_state=42)

In [151]:
training_benchmark(lr, df['hash-encoding'][0], target)

0.14821722439913218

In [152]:
training_benchmark(lr, df['Label-encoding'][0], target)

0.15521217687086372

In [154]:
training_benchmark(lr, df['one-hot-encoding'][0], target)

0.15521217687086372