In [1]:
# Column-name prefixes; every prefix has a '.no.transactions' and a '.total.spend' column.
column_prefixes = ('express', 'metro', 'superstore', 'extra', 'fandf', 'petrol', 'direct')
column_suffixes = ('no.transactions', 'total.spend')

# Numerical feature columns, grouped prefix by prefix (transactions first, then spend).
numerical = [prefix + '.' + suffix
             for prefix in column_prefixes
             for suffix in column_suffixes]

# Categorical feature columns.
categorical = ['gender', 'affluency', 'county', 'content']

# Pickled data set that the loading cell reads with pandas.read_pickle.
data_file = 'data.pickle'

In [2]:
import sklearn.model_selection
import pandas

# Seed for the stratified split below, so the same rows are drawn on every run.
random_state = 88

# NOTE: unpickling can execute arbitrary code; only load data_file from a trusted source.
data = pandas.read_pickle(data_file)

# Model selection and basic parameter tuning run on a fraction of the data only:
# even with just 10% of the rows used for training the search takes a few hours,
# so more is not affordable at the moment.
# Downsampling the negative class (click == 0) would be another way to balance the classes;
# stratified sampling is used here instead.
splitter = sklearn.model_selection.StratifiedShuffleSplit(
    n_splits=1, train_size=0.1, test_size=0.03, random_state=random_state)

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(splitter.split(data, data['click']))

features = data[numerical + categorical]
target = data['click']

X_train, y_train = features.iloc[train_index], target.iloc[train_index]
X_test, y_test = features.iloc[test_index], target.iloc[test_index]

# Sanity check: feature and label partitions must have matching lengths.
for subset in (X_train, y_train, X_test, y_test):
    print(len(subset))

37347
37347
11205
11205


In [3]:
# using modified methods from https://gist.github.com/miguelmalvarez/07e622357b089fee7f21
import numpy
import sklearn.preprocessing 
import sklearn.pipeline
import sklearn.ensemble 
import sklearn.svm 
import sklearn.neighbors 
import sklearn.utils
import sklearn.decomposition
import xgboost 
import sklearn_pandas
import prince


def best_config(model_info, parameters, X_train, y_train):
    [name, model] = model_info
    print('Grid search for ', name)
    # cv=5: integer, to specify the number of folds in a (Stratified)KFold
    # roc_auc should be safe here for inbalanced classes
    clf = sklearn.model_selection.GridSearchCV(model, parameters, cv=5, scoring="roc_auc", n_jobs=-1, verbose=4)
    %time clf.fit(X_train, y_train)
    best_estimator = clf.best_estimator_
    print('Best parameters: ', str(clf.best_params_))
    print('scored: ', clf.best_score_)
 
    return [str(clf.best_params_), clf.best_score_, best_estimator]

def best_model(classifier_families, X_train, y_train):
    """Grid-search every candidate family and return the best estimator overall.

    Parameters
    ----------
    classifier_families : iterable of ``(name, model, parameters)``
        Each entry is tuned with ``best_config``.
    X_train, y_train
        Training features and labels forwarded to ``best_config``.

    Returns
    -------
    The best estimator of the family with the highest cross-validated score.

    Raises
    ------
    ValueError
        If ``classifier_families`` is empty or no family produced a comparable
        (non-NaN) score, so that no winner can be selected.
    """
    # Start below any possible score so that a family scoring exactly 0.0 can still win.
    best_score = float('-inf')
    best_name = None
    best_result = None

    for name, model, parameters in classifier_families:
        # result is [parameters as string, score, fitted estimator]
        result = best_config([name, model], parameters, X_train, y_train)
        if result[1] > best_score:
            best_score = result[1]
            best_name, best_result = name, result

    if best_result is None:
        # Fail with a clear message instead of a TypeError from subscripting None.
        raise ValueError('no classifier family produced a usable score')

    print('#'*150)
    print('Best classifier: ', best_name)
    print('scored: ', best_result[1])
    print('with parameters: ', best_result[0])
    return best_result[2]


# I focus on classical models for binary classification, tuned with class_weight.
# An interesting alternative would be to use anomaly-detection models to detect the positive class (click == 1),
# such as LOF, isolation forest, or autoencoders.
def candidate_families(y_train):
    """Build the model families that take part in the grid search.

    Parameters
    ----------
    y_train
        Training labels; only used to derive the negative/positive ratio
        passed to XGBoost as ``scale_pos_weight``.

    Returns
    -------
    list
        One ``[name, estimator, parameter_grid]`` entry per family, in the
        order SVM, kNN, RandomForest, XGBClassifier.
    """
    svm_family = [
        "SVM",
        sklearn.svm.SVC(gamma='scale', probability=True),
        [{'kernel': ['linear', 'rbf'],
          'class_weight': ['balanced'],
          'C': numpy.linspace(1, 6, 3)}],
    ]

    knn_family = [
        'kNN',
        sklearn.neighbors.KNeighborsClassifier(),
        [{'n_neighbors': [2, 3, 5, 15, 25]}],
    ]

    forest_family = [
        'RandomForest',
        sklearn.ensemble.RandomForestClassifier(class_weight='balanced'),
        [{'n_estimators': [100, 200]}],
    ]

    # Class imbalance expressed as (#label 0) / (#label 1).
    negatives = y_train[y_train == 0].count()
    positives = y_train[y_train == 1].count()
    xgb_family = [
        'XGBClassifier',
        xgboost.XGBClassifier(scale_pos_weight=negatives / positives),
        [{'max_depth': [5, 10, 15],
          'n_estimators': [50, 100, 200, 300]}],
    ]

    return [svm_family, knn_family, forest_family, xgb_family]

In [4]:
def feature_scaller(categorical,numerical):
    """Create the (unfitted) DataFrameMapper that prepares the feature matrix.

    Parameters
    ----------
    categorical : list of column names handed together to ``prince.MCA``.
    numerical : list of column names handed together to ``StandardScaler``.

    Returns
    -------
    sklearn_pandas.DataFrameMapper
        Mapper holding the categorical definition followed by the numerical one.
    """
    # I'm just playing with MCA here; LabelBinarizer would be the standard choice.
    mca_spec = {'class': prince.MCA,
                'n_components': 25,  # this should be tuned
                'n_iter': 5,
                'engine': 'auto'}

    # The whole categorical list is wrapped in a list, i.e. passed as ONE column selector.
    categorical_features = sklearn_pandas.gen_features(columns=[categorical],
                                                       classes=[mca_spec])
    numerical_features = [(numerical, sklearn.preprocessing.StandardScaler())]

    return sklearn_pandas.DataFrameMapper(categorical_features + numerical_features)

In [5]:
# Build the mapper and fit it on the training split only; the same fitted mapper
# is applied to the test split further down.
mapper = feature_scaller(categorical,numerical)
%time X_train_transformed = mapper.fit_transform(X_train)

Wall time: 3min 10s


In [6]:
# Grid-search every candidate family on the transformed training data and keep the
# best estimator overall (long-running: see the wall times in the output).
%time best_classifier = best_model(candidate_families(y_train), X_train_transformed, y_train)

Grid search for  SVM
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 118.0min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 278.4min finished


Wall time: 5h 38min 35s
Best parameters:  {'C': 3.5, 'class_weight': 'balanced', 'kernel': 'linear'}
scored:  0.5995005723216651
Grid search for  kNN
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 21.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 21.9min finished


Wall time: 21min 56s
Best parameters:  {'n_neighbors': 25}
scored:  0.5170691050604254
Grid search for  RandomForest
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   40.6s remaining:   27.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   51.7s finished


Wall time: 1min 11s
Best parameters:  {'n_estimators': 200}
scored:  0.6056348068626191
Grid search for  XGBClassifier
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 13.6min finished


Wall time: 15min 3s
Best parameters:  {'max_depth': 15, 'n_estimators': 300}
scored:  0.6256088222379651
######################################################################################################################################################
Best classifier:  XGBClassifier
scored:  0.6256088222379651
with parameters:  {'max_depth': 15, 'n_estimators': 300}
Wall time: 6h 16min 46s


In [7]:
import sklearn.metrics
# Transform only (no refit): the test split goes through the mapper fitted on the training split.
%time X_test_transformed= mapper.transform(X_test)

Wall time: 155 ms


In [9]:
# Hard class labels, used by the threshold-dependent metrics below.
%time y_test_pred = best_classifier.predict(X_test_transformed)
# Second column of predict_proba, i.e. the score for click == 1 given 0/1 labels;
# used by the ranking metrics below.
%time y_test_proba = best_classifier.predict_proba(X_test_transformed)[:,1]

Wall time: 534 ms
Wall time: 465 ms


In [10]:
# Per-class precision / recall / F1 of the hard predictions on the test split.
print(sklearn.metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11061
           1       0.12      0.01      0.01       144

   micro avg       0.99      0.99      0.99     11205
   macro avg       0.56      0.50      0.50     11205
weighted avg       0.98      0.99      0.98     11205



In [11]:
# Confusion matrix: rows are true classes, columns are predicted classes.
print(sklearn.metrics.confusion_matrix(y_test, y_test_pred))

[[11054     7]
 [  143     1]]


In [12]:
# ROC AUC on the test split, the same metric the grid search optimised.
sklearn.metrics.roc_auc_score(y_test, y_test_proba)

0.6153175822961556

In [13]:
# Area under the precision-recall curve, integrated from the curve points.
precision, recall, thresholds = sklearn.metrics.precision_recall_curve(y_test, y_test_proba)
sklearn.metrics.auc(recall, precision)

0.031424745989432144

In [14]:
# Average precision: a second summary of the precision-recall curve, computed from the same scores.
sklearn.metrics.average_precision_score(y_test, y_test_proba)

0.03393958376581965

In [15]:
# Balanced accuracy of the hard predictions; adjusted=True rescales it so chance level scores 0.
sklearn.metrics.balanced_accuracy_score(y_test, y_test_pred, adjusted=True)

0.006311590272127354

In [16]:
# Cohen's kappa between the true labels and the hard predictions.
sklearn.metrics.cohen_kappa_score(y_test, y_test_pred)

0.011821106548702387

In [17]:
# Actual positives (click == 1) in the test split.
y_test[y_test == 1]

297266    1
338899    1
47176     1
261753    1
198042    1
167534    1
246714    1
314869    1
18120     1
166606    1
350678    1
335472    1
367961    1
248632    1
137179    1
73218     1
230715    1
312192    1
326168    1
246772    1
29409     1
191120    1
260815    1
250270    1
246057    1
201517    1
46062     1
272000    1
257237    1
281124    1
         ..
143991    1
216061    1
47833     1
60839     1
109243    1
132661    1
59249     1
303449    1
71919     1
200374    1
318102    1
242929    1
54545     1
34091     1
235539    1
334580    1
124781    1
282700    1
212415    1
367702    1
60098     1
156134    1
201436    1
9520      1
352295    1
363955    1
339629    1
324346    1
255663    1
259278    1
Name: click, Length: 144, dtype: int64

In [18]:
# Predicted positives, for comparison with the actual positives shown above.
y_test_pred[y_test_pred == 1]

array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)