# ML Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import operator
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
train = pd.read_csv('../data/train.csv', sep=';')
test = pd.read_csv('../data/test.csv', sep=';')


def count_nan_rows(frame):
    """Return the number of rows of `frame` holding at least one null value."""
    return int(frame.isnull().any(axis=1).sum())


# check for null values
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print('NaN in train set %d' % count_nan_rows(train))
print('NaN in test set %d' % count_nan_rows(test))

# Replace NaN values with the per-column median of each set.
# numeric_only=True is spelled out because newer pandas versions no longer
# skip non-numeric columns silently when computing the median. Only the
# columns present in the median Series are filled, so nulls in non-numeric
# columns are left untouched.
# NOTE(review): the test set is filled with its own medians rather than the
# train medians -- confirm that this is intended.
train = train.fillna(train.median(numeric_only=True))
test = test.fillna(test.median(numeric_only=True))

# check for null values once more
print('NaN in train set after removing %d' % count_nan_rows(train))
print('NaN in test set after removing %d' % count_nan_rows(test))



In [None]:
def onehotencoding(data, features='all'):
    """One-hot encode the selected columns of a DataFrame into a sparse matrix.

    Each column listed in `features` is replaced by one indicator column per
    distinct value (distinct values are taken in sorted order, as returned by
    ``np.unique``). All remaining columns are cast to float32 and appended
    unchanged to the right of the indicator columns, in their original order.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.
    features : list of column names, single column name, or 'all'
        Columns to encode. 'all' (the default) selects every column whose
        dtype is object.

    Returns
    -------
    matrix : scipy.sparse.coo_matrix, dtype float32
        Indicator columns for each encoded feature (in the order given by
        `features`), followed by the untouched columns.
    new_cols : list
        One name per matrix column: '<feature>_<value>' for indicator columns,
        the original column name for the untouched ones.

    Raises
    ------
    KeyError
        If a name in `features` is not a column of `data`.
    ValueError
        If a column that is not encoded cannot be converted to float.
    """
    # scipy is not imported at notebook level, so bring it in locally.
    from scipy import sparse

    if isinstance(features, str):
        if features == 'all':
            # the documented default: encode every object-typed column
            features = [col for col in data.columns if data[col].dtype == 'O']
        else:
            # a bare column name would otherwise be iterated character by character
            features = [features]
    else:
        features = list(features)

    n_rows = len(data)
    row_idx = np.arange(n_rows)

    blocks = []
    new_cols = []

    for name in features:
        # `classes` are the sorted distinct values, `codes` maps every row to
        # the position of its value inside `classes`.
        classes, codes = np.unique(data[name].values, return_inverse=True)
        indicator = sparse.coo_matrix(
            (np.ones(n_rows, dtype='float32'), (row_idx, np.ravel(codes))),
            shape=(n_rows, len(classes)))
        blocks.append(indicator)
        new_cols.extend(name + '_' + str(label) for label in classes)

    # Every column that was not encoded is passed through as-is. Deriving the
    # list from `features` (rather than from the dtype) keeps names and matrix
    # columns aligned even when only a subset of columns is encoded.
    encoded = set(features)
    passthrough = [col for col in data.columns if col not in encoded]
    if passthrough:
        blocks.append(sparse.coo_matrix(data[passthrough].values.astype('float32')))
        new_cols.extend(passthrough)

    if not blocks:
        # nothing to stack: frame without columns
        return sparse.coo_matrix((n_rows, 0), dtype='float32'), new_cols

    matrix = sparse.hstack(blocks, format='coo', dtype='float32')
    return matrix, new_cols

In [None]:
# keep only the first two characters of the postal code; they are kept in
# `codepostal` because the column itself is dropped from the features below
train.codepostal = [str(i)[:2] for i in train.codepostal]
codepostal = train.codepostal.astype('str')

# prepare data for encoding

# first save the target and the test ids in separate variables, then drop the
# columns that are not used as features
y = train.prime_tot_ttc
train.drop(['prime_tot_ttc', 'id', 'codepostal'], axis=1, inplace=True)
test_ids = test.id
test.drop(['id', 'codepostal'], axis=1, inplace=True)

# remember where the train rows end so the stacked matrix can be split back
# without relying on a hard-coded row count
n_train = len(train)

# stack train and test sets so both are encoded with the same categories
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# find categorical variables for encoding
cat_features = [col for col in data.columns if data[col].dtype == 'O']

# apply the one-hot encoding function
ohe_encoded_data, col_names = onehotencoding(data, cat_features)
del data

# divide the encoded matrix back into train and test sets
# (convert to CSR once; CSR supports row slicing)
ohe_encoded_data = ohe_encoded_data.tocsr()
ohe_encoded_train = ohe_encoded_data[:n_train]
ohe_encoded_test = ohe_encoded_data[n_train:]

del ohe_encoded_data

print('train set shape is %s' % (ohe_encoded_train.shape,))
print('test set shape is %s' % (ohe_encoded_test.shape,))

In [None]:
# validation: hold out every row whose two-character postal prefix is in `val`

val = ['10', '90', '91', '92', '93', '94', '95', '97']
# set membership is constant-time; the list above is kept for readability
val_codes = set(val)

# Row positions of the validation rows and of all remaining (training) rows.
# The name `train` is deliberately not rebound here, so it keeps referring to
# the training DataFrame.
val_indexes = [i for i, code in enumerate(codepostal) if code in val_codes]
train_indexes = [i for i, code in enumerate(codepostal) if code not in val_codes]

# divide into train and validation sets

train_matrix = ohe_encoded_train.tocsr()
ohe_encoded_val = train_matrix[val_indexes]
ohe_encoded_train_train = train_matrix[train_indexes]

In [None]:
# xgboost training configuration, declared in one place
param = {
    'objective': 'reg:linear',
    'max_depth': 5,
    'eta': .1,
    'colsample_bytree': .7,
    'subsample': .7,
}
# the same settings as a list of (name, value) pairs
plst = list(param.items())
# number of boosting rounds
num_round = 50

In [None]:
#create mape metric for xgboost
# metric mape
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Reference values. A zero entry makes the corresponding term (and
        therefore the mean) infinite or NaN, following numpy's division rules.
    y_pred : array-like
        Predicted values, same length as `y_true`.

    Returns
    -------
    numpy.float64
        mean(|y_true - y_pred| / |y_true|) * 100
    """
    # Converting to float arrays lets plain lists through, compares pandas
    # Series by position rather than by index label, and keeps integer inputs
    # from being floor-divided under Python 2.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def xgb_mape(preds, df):
    """Evaluation callback for xgboost that reports MAPE under the name 'error'.

    `preds` holds the predictions; `df` is the evaluated data set, whose
    true values are read through its `get_label()` method.
    Returns the pair ('error', <MAPE in percent>).
    """
    truth = np.array(df.get_label())
    guesses = np.array(preds)
    # predictions and labels must pair up one to one
    assert len(guesses) == len(truth)

    return 'error', mape(truth, guesses)

In [None]:
# run xgboost on the training split, evaluating MAPE on both the training
# split and the held-out validation split after every round
evals_result = {}
xgb_train = xgb.DMatrix(ohe_encoded_train_train,
                        label=y[train_indexes],
                        feature_names=col_names)
xgb_val = xgb.DMatrix(ohe_encoded_val,
                      label=y[val_indexes],
                      feature_names=col_names)
evallist = [(xgb_train, 'train'), (xgb_val, 'val')]
bst = xgb.train(
    param,
    xgb_train,
    num_boost_round=num_round,
    evals=evallist,
    feval=xgb_mape,
    early_stopping_rounds=50,
    evals_result=evals_result,
)

In [None]:
# Validation error recorded after each boosting round. A list is built
# explicitly because map() returns a lazy iterator on Python 3, not a list.
val_errors = [float(err) for err in evals_result['val']['error']]
error_df = pd.DataFrame({'error': val_errors,
                         'iter': list(range(1, len(val_errors) + 1))})
error_df.plot(x='iter', y='error', figsize=(20,20))

In [None]:
# wrap the encoded test matrix for xgboost, reusing the training column names,
# and score it with the trained booster
xgb_test = xgb.DMatrix(data=ohe_encoded_test, feature_names=col_names)
predicted_values = bst.predict(data=xgb_test)

In [None]:
# pair each test id with its prediction and write the submission file,
# with the ID column first
answers = pd.DataFrame({'COTIS': predicted_values, 'ID': test_ids})
answers.to_csv('../submissions/submission.csv',
               sep=';',
               index=False,
               columns=['ID', 'COTIS'])