# Data Challenge - Télécom Paris

In [5]:
from google.colab import drive
import os
drive.mount('/content/drive/')
PATH_TO_GOOGLE_DRIVE_FOLDER = '/content/drive/My Drive/05_Datachallenge'
os.chdir(PATH_TO_GOOGLE_DRIVE_FOLDER)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

from xgboost import XGBClassifier

In [0]:
path = "01_data/"
data1 = "xtrain_challenge.csv"
data2 = "ytrain_challenge.csv"
data3 = "xtest_challenge.csv"

In [16]:
nrows_max = 100000
%time X = np.loadtxt(path+data1, delimiter=',', skiprows = 1, max_rows = nrows_max)
%time y = np.loadtxt(path+data2, delimiter=',', skiprows = 1, max_rows = nrows_max)

y = np.array(y).reshape(len(y))
#xtest = np.loadtxt(path+data3, delimiter=',', skiprows = 1)

CPU times: user 6.33 s, sys: 113 ms, total: 6.44 s
Wall time: 3.29 s
CPU times: user 832 ms, sys: 1.96 ms, total: 834 ms
Wall time: 434 ms


# Functions

## Train-test split and scale

In [0]:
def split_scale(X, y, test_size, scale):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
  if scale == True:
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test =scaler.fit_transform(X_test)
  
  print(X_train.shape, y_train.shape, X_test.shape)
  return X_train, X_test, y_train, y_test

## Log transformation

In [0]:
def log_transformation(X, columns, score):
  df = pd.DataFrame(X)
  df.iloc[:,columns] = (df.iloc[:,columns]+1).transform(np.log)
  if score == True:
    df.iloc[:,26:] = (df.iloc[:,26:]+1).transform(np.log)
  return df.values

## Score

In [0]:
 def score(X, y, model, type): 
  clf = model
  yvalid = clf.predict_proba(X)[:,clf.classes_ == 1][:,0]
  N = np.sum(y == 0)
  P = np.sum(y == 1)
  
  nrows = y.shape[0]
  yvalid_scoreordered = y[np.argsort(yvalid)]

  FP = 0
  TP = 0
  for i in range(nrows -1, -1, -1):
      if (yvalid_scoreordered[i] == 1):
        TP = TP + 1
      else:
          FP = P + 1
      if (FP / N > 10**(-4)):
          FP = FP - 1
          break
  print("For the ", str(type), " data :")
  print("For the smallest FPR <= 10^-4 (i.e., ", FP / N, ") TPR = ", TP / P, ".", sep = "")

## Submit

In [0]:
def submit(model, X, comment, scale):
  clf = model
  if scale == True:
    scaler = MinMaxScaler()
    xtest = scaler.fit_transform(X)
    #xtest = pd.DataFrame(xtest)
    
  ytest = clf.predict_proba(xtest)[:,clf.classes_ == 1][:,0]
  
  date = str(np.datetime64('today', 'D'))
  model = str(model).split('(')[0]
  saving_name = str(date+'-'+model+'-'+comment)

  np.savetxt('02_Submission/'+saving_name, ytest, fmt = '%1.15f', delimiter=',')

  print(ytest.shape)

# Pre-processing

In [17]:
X_train, X_test, y_train, y_test = split_scale(X,y, 0.3, False)
#arr = [3, 4, 5, 6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23]
#X_train = log_transformation(X_train, arr, True)
#X_test = log_transformation(X_test, arr, True)

(70000, 37) (70000,) (30000, 37)


# Basics algorithms

## Features selection

In [18]:
ridge = RidgeClassifier(alpha=1.5, 
                        fit_intercept=True, normalize=True, copy_X=True,
                        max_iter=10000, tol=0.0005, class_weight=None, 
                        solver='auto', random_state=42)
%time ridge.fit(X_train, y_train)

CPU times: user 181 ms, sys: 7.09 ms, total: 188 ms
Wall time: 103 ms


RidgeClassifier(alpha=1.5, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=10000, normalize=True, random_state=42, solver='auto',
                tol=0.0005)

## Logistic Regression

In [19]:
lr = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000, n_jobs=-1, verbose = 1, tol= 0.001, l1_ratio=0.5)
%time lr.fit(X_train, y_train)
score(X_train, y_train, model=lr, type='train')
score(X_test, y_test, model=lr, type='test')
lr.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 351 epochs took 29 seconds
CPU times: user 55.8 s, sys: 30.5 ms, total: 55.8 s
Wall time: 28.5 s
For the  train  data :
For the smallest FPR <= 10^-4 (i.e., 0.07403145377828922) TPR = 0.3927461139896373.
For the  test  data :
For the smallest FPR <= 10^-4 (i.e., 0.07380628534612356) TPR = 0.6930164888457808.


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   28.4s finished


0.9947

## Random Forest

### Classical

In [23]:
rdf = RandomForestClassifier(bootstrap = True, n_estimators=500, max_depth=3, random_state=0, verbose=True, n_jobs=-1)
%time rdf.fit(X_train, y_train)
score(X_train, y_train, model=rdf, type='train')
score(X_test, y_test, model=rdf, type='test')
rdf.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   38.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


CPU times: user 1min 14s, sys: 1.94 s, total: 1min 16s
Wall time: 38.7 s


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    2.0s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    2.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s


For the  train  data :
For the smallest FPR <= 10^-4 (i.e., 0.07403145377828922) TPR = 0.5593782383419689.


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


For the  test  data :
For the smallest FPR <= 10^-4 (i.e., 0.07380628534612356) TPR = 0.24539282250242483.


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.8s finished


0.9952666666666666

In [21]:
rdf.feature_importances_

array([4.07080844e-06, 3.22107341e-06, 7.52649449e-06, 1.30394773e-04,
       4.78832688e-05, 2.52816931e-05, 3.82566370e-05, 2.37427625e-05,
       2.20219470e-05, 2.21408266e-04, 4.41782566e-05, 4.20487040e-05,
       1.58877233e-06, 3.39327098e-06, 3.50979082e-06, 5.01182516e-07,
       1.12410979e-04, 2.33547469e-05, 6.51740691e-05, 2.49722798e-05,
       3.15112513e-05, 3.72377317e-05, 3.11683878e-04, 6.05770045e-05,
       2.16031476e-05, 1.20678667e-05, 4.43756579e-02, 2.32054058e-02,
       5.15274072e-02, 1.59007441e-01, 1.27935759e-01, 5.11659801e-02,
       5.63613172e-02, 7.02373496e-02, 1.38281804e-01, 1.20311029e-01,
       1.56271228e-01])

### Extra Random Forest

In [24]:
from sklearn.ensemble import ExtraTreesClassifier
extra_rdf = ExtraTreesClassifier(bootstrap=True, n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1, verbose = 1, oob_score=False)
%time extra_rdf.fit(X_train, y_train)
score(X_train, y_train, model=extra_rdf, type='train')
score(X_test, y_test, model=extra_rdf, type='test')
extra_rdf.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   41.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


CPU times: user 1min 19s, sys: 2.76 s, total: 1min 22s
Wall time: 42.2 s


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    3.3s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    5.9s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    7.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


For the  train  data :
For the smallest FPR <= 10^-4 (i.e., 0.07403145377828922) TPR = 1.0.


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    2.7s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    3.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


For the  test  data :
For the smallest FPR <= 10^-4 (i.e., 0.07380628534612356) TPR = 0.29776915615906885.


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    2.6s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    3.3s finished


0.9964

# Boosting methods

## XGBoost

In [25]:
xgb = XGBClassifier(booster = 'gbtree', reg_lambda=3, eta = 0.001,
                    objective = 'binary:logistic', n_estimators=2700, max_depth=15,
                    verbose=True, n_jobs=-1, tree_method='gpu_hist', gpu_id=0,
                    random_state=1234, subsample=0.60)

%time xgb.fit(X_train, y_train)

CPU times: user 43.3 s, sys: 8.79 s, total: 52.1 s
Wall time: 26.9 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.001, gamma=0,
              gpu_id=0, learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=2700, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=1234,
              reg_alpha=0, reg_lambda=3, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.6, tree_method='gpu_hist', verbose=True,
              verbosity=1)

In [26]:
print(xgb.get_xgb_params())
score(X_train, y_train, model=xgb, type='train')
score(X_test, y_test, model=xgb, type='test')

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 15, 'min_child_weight': 1, 'missing': None, 'n_estimators': 2700, 'objective': 'binary:logistic', 'reg_alpha': 0, 'reg_lambda': 3, 'scale_pos_weight': 1, 'seed': 1234, 'subsample': 0.6, 'verbosity': 1, 'eta': 0.001, 'verbose': True, 'tree_method': 'gpu_hist', 'gpu_id': 0}
For the  train  data :
For the smallest FPR <= 10^-4 (i.e., 0.07403145377828922) TPR = 1.0.
For the  test  data :
For the smallest FPR <= 10^-4 (i.e., 0.07380628534612356) TPR = 0.2507274490785645.


In [27]:
dump(xgb, '03_models/xgb-demo_0.3.joblib')

['03_models/xgb-demo_0.3.joblib']

In [0]:
xgb_loaded = load('03_models/xgb-8,5M-no_scale.joblib')

# Stack Classifier

In [0]:
ridge = RidgeClassifier(alpha=1.5, 
                        fit_intercept=True, normalize=True, copy_X=True,
                        max_iter=10000, tol=0.001, class_weight=None, 
                        solver='auto', random_state=42)

lr = LogisticRegression(penalty='elasticnet', solver='saga',
                        max_iter=10000, n_jobs=-1, verbose = 1, tol= 0.001, 
                        l1_ratio=0.5, random_state=666)

xgb1 = XGBClassifier(booster = 'gbtree', reg_lambda=10, eta = 0.01,
                    objective = 'binary:logistic', n_estimators=1500, max_depth=10,
                    verbose=True, n_jobs=-1, tree_method='gpu_hist', gpu_id=0,
                    random_state=42, subsample=0.50)

xgb2 = XGBClassifier(booster = 'gbtree', reg_lambda=5, eta = 0.005,
                    objective = 'binary:logistic', n_estimators=2500, max_depth=10,
                    verbose=True, n_jobs=-1, tree_method='gpu_hist', gpu_id=0,
                    random_state=1245, subsample=0.50)

xgb3 = XGBClassifier(booster = 'gbtree', reg_lambda=3, eta = 0.001,
                    objective = 'binary:logistic', n_estimators=2500, max_depth=10,
                    verbose=True, n_jobs=-1, tree_method='gpu_hist', gpu_id=0,
                    random_state=12345, subsample=0.50)

In [29]:
estimators = [('ridge', ridge),('lr', lr), ('xgb1', xgb1), ('xgb2', xgb2), ('xgb3', xgb3)]
stack = StackingClassifier(estimators=estimators, final_estimator=lr, passthrough=False, n_jobs=-1, cv=None)
%time stack.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 27 epochs took 1 seconds
CPU times: user 4min 7s, sys: 847 ms, total: 4min 8s
Wall time: 6min 57s


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.7s finished


StackingClassifier(cv=None,
                   estimators=[('ridge',
                                RidgeClassifier(alpha=1.5, class_weight=None,
                                                copy_X=True, fit_intercept=True,
                                                max_iter=10000, normalize=True,
                                                random_state=42, solver='auto',
                                                tol=0.001)),
                               ('lr',
                                LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=0.5, max_iter=10000,
                                                   multi_class='auto',
                                                   n_jobs=-1...
              

In [30]:
score(X_train, y_train, model=stack, type='train')
score(X_test, y_test, model=stack, type='test')

For the  train  data :
For the smallest FPR <= 10^-4 (i.e., 0.07403145377828922) TPR = 0.9255958549222798.
For the  test  data :
For the smallest FPR <= 10^-4 (i.e., 0.07380628534612356) TPR = 0.453443258971872.


In [31]:
dump(stack, '03_models/stack_dem.joblib')

['03_models/stack_dem.joblib']

 - function ClickConnect(){
console.log("Working"); 
document.querySelector("colab-toolbar-button#connect").click() 
}
setInterval(ClickConnect,60000)