# Running different algorithms

### Importing libraries

In [39]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json
from os import listdir

### Defining path to data

In [40]:
path = 'C:/Users/izaou/Documents/ML_Project-master/clean_data/'

In [41]:
print(listdir(path))

['Parameters', 'targets.csv', 'targets_flat.csv', 'test.csv', 'train.csv']


### Loading train data

In [42]:
train = pd.read_csv(path + 'train.csv', index_col='user_id')
targets = pd.read_csv(path + 'targets_flat.csv', dtype={"user_id":object, "country_destination":object}, index_col=['user_id'])
targets_train = targets[targets.country_destination.notnull()].values.reshape(-1,)

In [43]:
np.unique(targets_train)

array(['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US',
       'other'], dtype=object)

## Predicting classes by taking out age and gender

### Defining the train dataset

In [44]:
to_drop= ['age', 'gender__MALE', 'gender__FEMALE', 'gender__OTHER','language__no','signup_method__google','first_affiliate_tracked__local ops','affiliate_provider__naver','affiliate_provider__baidu','affiliate_provider__yandex','language__id','language__hu','affiliate_provider__wayn','language__da','language__tr','language__th','language__fi','date_account_created_hour_sin','language__is','language__is','date_account_created_hour_cos','language__ca','language__cs','affiliate_provider__daum','language__hr','signup_method__weibo']

df1_train = train.drop(to_drop, axis=1)

In [None]:
train_age=pd.read_csv(path +'train_age_only_filled.csv', index_col='user_id')
train_gender=pd.read_csv(path +'complete_gradboost_GENDER.csv', index_col='user_id')
train_age=train_age.astype(np.float)
train_gender=train_gender.astype(np.float)
df_train = pd.concat([df1_train, train_gender, train_age], axis=1, join_axes=[df1_train.index])

### Importing libraries

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score

import timeit
from sklearn.metrics import accuracy_score

from sklearn.cross_validation import train_test_split #for splitting data into train and test



In [46]:
X_train, X_test, y_train, y_test=train_test_split(df_train,targets_train,test_size=0.2,random_state=1)

### Defining classifiers

# Optimisation with gridsearch

In [47]:
from sklearn.model_selection import GridSearchCV

path_rf = path + 'Parameters/random_forest_parameters.json'
path_dt = path + 'Parameters/decision_tree_parameters.json'
path_gb = path + 'Parameters/gradient_boosting_parameters.json'
path_xgb = path + 'Parameters/XGB_parameters.json'

In [48]:
#parameters for RF, GB, XGB
param_grid_RF = {'n_estimators' : range(10,30,10),
                 'min_samples_split' : range(2, 5),
                 'max_leaf_nodes' : range(8, 10),
                 'max_depth' : range(10, 14),
                'criterion':['gini', 'entropy']}

param_grid_GB = {'n_estimators':range(10, 201, 50),
                 'min_samples_split':range(2,5),
                 'max_depth':range(3,10,5)}

param_grid_DT = {'min_samples_split':range(2, 15),  
                 'min_samples_leaf':range(2,15),
                 'max_leaf_nodes':range(10, 20)}
param_grid_XGB = {'n_estimators': range(50,251,50)
                 'max_depth':8}

In [49]:
#xgb.XGBClassifier().get_params().keys()

In [52]:
def optim_save_best_params(parameters, estimator, path_ = path, cv = 10):
    
    my_estimator = estimator
    my_cv = GridSearchCV(estimator = my_estimator, param_grid = parameters, cv = cv, n_jobs = 1, verbose = 2)
    my_cv.fit(X_train, y_train)
    with open(path_ , 'w') as f:
        data= my_cv.best_params_
        json.dump(data, f)
    f.close()
    print(my_cv.best_params_)
    
def read_parameters(path_):
    with open(path_, 'r') as f:
        data = json.load(f)
    return data

In [53]:
#if trained == False:
#optim_save_best_params(parameters = param_grid_RF, 
                       #estimator=RandomForestClassifier(),
                       #path_ = path_rf)
#optim_save_best_params(parameters = param_grid_GB, 
                       #estimator=GradientBoostingClassifier(),
                       #path_ = path_gb)
#optim_save_best_params(parameters = param_grid_DT, 
                       #estimator=DecisionTreeClassifier(),
                       #path_ = path_dt)
optim_save_best_params(parameters = param_grid_XGB, 
                       estimator=xgb.XGBClassifier(),
                       path_ = path_xgb)

#trained = True

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 4.7min
[CV] n_estimators=50, max_depth=2 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.8min remaining:    0.0s


[CV] ..................... n_estimators=50, max_depth=2, total= 7.8min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 7.9min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 7.8min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 7.4min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 7.5min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 5.4min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] ..................... n_estimators=50, max_depth=2, total= 4.7min
[CV] n_estimators=50, max_depth=2 ....................................
[CV] .

KeyboardInterrupt: 

In [None]:
opti_mod=dict()

#opti_mod['Random Forest opti']=RandomForestClassifier().set_params(**read_parameters(path_rf))
opti_mod['Extrem Gradient Boosting Classifier opti'] = xgb.XGBClassifier().set_params(**read_parameters(path_xgb))
#opti_mod['Gradient Boosting Classifier opti'] = GradientBoostingClassifier().set_params(**read_parameters(path_gb))
#opti_mod['Decision Tree Classifier opti'] = DecisionTreeClassifier().set_params(**read_parameters(path_dt))

In [None]:
def fun_cross_val_score(OptiModel, algo, cv=20):
    scores = cross_val_score(OptiModel, X_train, y_train, cv=cv)
    print("Accuracy %s: %0.4f (+/- %0.2f)" % (algo ,scores.mean(), scores.std() * 2))
    return

In [None]:
for key, value in opti_mod.items():
    fun_cross_val_score(value, key)
    