In [18]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
import warnings
import tensorflow as tf
from time import time
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from tffm import TFFMClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from sklearn import preprocessing
from random import random
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
%matplotlib inline

In [9]:
# preprocessing
data = pd.read_csv(r'...\data_net.csv', sep = ';')
data.drop('Unnamed: 0', axis = 1,inplace = True)

In [10]:
# features & labels
labels = data['Click']
features = data.drop('Click', axis = 1)
features

Unnamed: 0,Browsname,Searcher,TypeCon,Country,deviceType,Model,ModelCompany,NewID,Reversed,Type,System,FirstSearcher,Version,Sex,Age,TimeSpent,InternalCode,id_transf
0,Chrome,Google,wi-fi,ita,SmartPhone,Nokia 2240,Nokia,yes,no,mobile,Android,Yahoo,5.0,male,21,1.02,fergie,25465885
1,Firefox,Yahoo,cable,fra,SmartPhone,Meizu 4 pro,Meizu,yes,no,TV,Android,Yahoo,4.2,female,34,0.123,krag,65458971
2,Chrome,Yandex,wi-fi,rus,SmartPhone,Iphone 6,Iphone,yes,yes,mobile,Ios,Yahoo,9.0,male,53,0.14,leslie,21547895
3,IE,Google,cable,usa,SmartPhone,Iphone X,Iphone,no,no,web,Ios,Yahoo,9.0,male,30,0.01,fergie,32541568
4,IE,Google,wi-fi,usa,SmartPhone,Galaxy J4,SAMSUNG,no,no,TV,Android,Yahoo,5.5.1,female,15,1.1,frent,12023515


In [12]:
# create the dictionary of unique attributes' lists
unique_attr = {}
for attr in features.columns:
    unique_attr[attr] = features[attr].unique().tolist()

In [None]:
# OneHot encoder
encoder = preprocessing.OneHotEncoder(categories=[unique_attr[i] for i in unique_attr], sparse = False, handle_unknown='ignore')
features['CONCAT'] = features.values.tolist()
features['CONCAT'].head()

In [4]:
# encoding into the list
t0 = time()
features['ENCODED'] = [encoder.fit_transform([i]).flatten() for i in features['CONCAT']]
print ("time on encoding:", round(time()-t0, 3), "s")

time on encoding: 50.699 s


In [5]:
# create the array with feature vectors
features_list = [list(i) for i in features['ENCODED']]
features_list_array = np.array(features_list)
# create the array with label vector 
labels_list_array = np.array(labels.tolist())

In [7]:
# cross-validation
X_train, X_test, y_train, y_test = train_test_split(features_list_array, labels_list_array, random_state=35, test_size=0.2)

In [8]:
# Testing different classifiers
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier(silent=False)],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.96      9255
         1.0       0.31      0.11      0.16       745

   micro avg       0.92      0.92      0.92     10000
   macro avg       0.62      0.54      0.56     10000
weighted avg       0.89      0.92      0.90     10000

[[9075  180]
 [ 665   80]]
Accuracy is  91.55
Time on model's work: 7.456 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      9255
         1.0       0.98      0.06      0.12       745

   micro avg       0.93      0.93      0.93     10000
   macro avg       0.95      0.53      0.54     10000
weighted avg       0.93      0.93      0.90     10000

[[9254    1]
 [ 699   46]]
Accuracy is  93.0
Time on model's work: 476.008 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.96      9255
         1.0       0.30      0.11      0.16       745

   micro avg       0.91      0.91      0.91     10000
   macro avg       0.62      0.55      0.56     10000
weighted avg       0.89      0.91      0.90     10000

[[9065  190]
 [ 662   83]]
Accuracy is  91.47999999999999
Time on model's work: 14.262 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      9255
         1.0       0.98      0.06      0.11       745

   micro avg       0.93      0.93      0.93     10000
   macro avg       0.95      0.53      0.54     10000
weighted avg       0.93      0.93      0.90     10000

[[9254    1]
 [ 700   45]]
Accuracy is  92.99
Time on model's work: 105.722 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      9255
         1.0



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      9255
         1.0       0.27      0.14      0.19       745

   micro avg       0.91      0.91      0.91     10000
   macro avg       0.60      0.56      0.57     10000
weighted avg       0.88      0.91      0.89     10000

[[8961  294]
 [ 638  107]]
Accuracy is  90.68
Time on model's work: 1155.684 s
[16:32:39] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:32:42] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:32:45] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:32:48] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pr

3:	learn: 0.5014626	total: 1.37s	remaining: 5m 40s
4:	learn: 0.4716411	total: 1.64s	remaining: 5m 26s
5:	learn: 0.4325288	total: 2s	remaining: 5m 30s
6:	learn: 0.4116941	total: 2.25s	remaining: 5m 20s
7:	learn: 0.3921239	total: 2.53s	remaining: 5m 13s
8:	learn: 0.3694596	total: 2.92s	remaining: 5m 21s
9:	learn: 0.3555392	total: 3.19s	remaining: 5m 15s
10:	learn: 0.3431922	total: 3.46s	remaining: 5m 11s
11:	learn: 0.3237243	total: 3.82s	remaining: 5m 14s
12:	learn: 0.3138143	total: 4.09s	remaining: 5m 10s
13:	learn: 0.3056673	total: 4.35s	remaining: 5m 6s
14:	learn: 0.2918893	total: 4.73s	remaining: 5m 10s
15:	learn: 0.2853170	total: 5.01s	remaining: 5m 8s
16:	learn: 0.2795128	total: 5.28s	remaining: 5m 5s
17:	learn: 0.2695613	total: 5.63s	remaining: 5m 7s
18:	learn: 0.2644742	total: 5.91s	remaining: 5m 4s
19:	learn: 0.2608495	total: 6.17s	remaining: 5m 2s
20:	learn: 0.2549968	total: 6.52s	remaining: 5m 3s
21:	learn: 0.2522301	total: 6.8s	remaining: 5m 2s
22:	learn: 0.2489211	total: 7.1

162:	learn: 0.2064280	total: 50.7s	remaining: 4m 20s
163:	learn: 0.2063871	total: 51s	remaining: 4m 20s
164:	learn: 0.2063443	total: 51.3s	remaining: 4m 19s
165:	learn: 0.2063376	total: 51.6s	remaining: 4m 19s
166:	learn: 0.2063248	total: 51.9s	remaining: 4m 18s
167:	learn: 0.2063194	total: 52.1s	remaining: 4m 18s
168:	learn: 0.2063094	total: 52.4s	remaining: 4m 17s
169:	learn: 0.2062956	total: 52.7s	remaining: 4m 17s
170:	learn: 0.2062918	total: 53s	remaining: 4m 16s
171:	learn: 0.2062742	total: 53.2s	remaining: 4m 16s
172:	learn: 0.2062661	total: 53.5s	remaining: 4m 15s
173:	learn: 0.2062595	total: 53.8s	remaining: 4m 15s
174:	learn: 0.2062530	total: 54.1s	remaining: 4m 14s
175:	learn: 0.2062067	total: 54.4s	remaining: 4m 14s
176:	learn: 0.2061669	total: 54.8s	remaining: 4m 14s
177:	learn: 0.2061613	total: 55s	remaining: 4m 14s
178:	learn: 0.2061569	total: 55.3s	remaining: 4m 13s
179:	learn: 0.2061521	total: 55.5s	remaining: 4m 12s
180:	learn: 0.2061489	total: 55.8s	remaining: 4m 12s

316:	learn: 0.2037123	total: 1m 36s	remaining: 3m 28s
317:	learn: 0.2036810	total: 1m 37s	remaining: 3m 28s
318:	learn: 0.2036632	total: 1m 37s	remaining: 3m 27s
319:	learn: 0.2036434	total: 1m 37s	remaining: 3m 27s
320:	learn: 0.2036029	total: 1m 38s	remaining: 3m 27s
321:	learn: 0.2035976	total: 1m 38s	remaining: 3m 26s
322:	learn: 0.2035665	total: 1m 38s	remaining: 3m 26s
323:	learn: 0.2035197	total: 1m 39s	remaining: 3m 26s
324:	learn: 0.2035131	total: 1m 39s	remaining: 3m 26s
325:	learn: 0.2034826	total: 1m 39s	remaining: 3m 26s
326:	learn: 0.2034737	total: 1m 39s	remaining: 3m 25s
327:	learn: 0.2034579	total: 1m 40s	remaining: 3m 25s
328:	learn: 0.2034033	total: 1m 40s	remaining: 3m 25s
329:	learn: 0.2033838	total: 1m 40s	remaining: 3m 25s
330:	learn: 0.2033762	total: 1m 41s	remaining: 3m 24s
331:	learn: 0.2033637	total: 1m 41s	remaining: 3m 24s
332:	learn: 0.2033382	total: 1m 41s	remaining: 3m 23s
333:	learn: 0.2033315	total: 1m 42s	remaining: 3m 23s
334:	learn: 0.2033133	total:

470:	learn: 0.2013875	total: 2m 21s	remaining: 2m 39s
471:	learn: 0.2013781	total: 2m 22s	remaining: 2m 38s
472:	learn: 0.2013725	total: 2m 22s	remaining: 2m 38s
473:	learn: 0.2013698	total: 2m 22s	remaining: 2m 38s
474:	learn: 0.2013571	total: 2m 23s	remaining: 2m 38s
475:	learn: 0.2013528	total: 2m 23s	remaining: 2m 37s
476:	learn: 0.2013420	total: 2m 23s	remaining: 2m 37s
477:	learn: 0.2013381	total: 2m 23s	remaining: 2m 37s
478:	learn: 0.2013225	total: 2m 24s	remaining: 2m 36s
479:	learn: 0.2012984	total: 2m 24s	remaining: 2m 36s
480:	learn: 0.2012886	total: 2m 25s	remaining: 2m 36s
481:	learn: 0.2012564	total: 2m 25s	remaining: 2m 36s
482:	learn: 0.2012530	total: 2m 25s	remaining: 2m 35s
483:	learn: 0.2012406	total: 2m 25s	remaining: 2m 35s
484:	learn: 0.2012341	total: 2m 26s	remaining: 2m 35s
485:	learn: 0.2012282	total: 2m 26s	remaining: 2m 34s
486:	learn: 0.2011997	total: 2m 26s	remaining: 2m 34s
487:	learn: 0.2011916	total: 2m 27s	remaining: 2m 34s
488:	learn: 0.2011627	total:

624:	learn: 0.1995361	total: 3m 10s	remaining: 1m 54s
625:	learn: 0.1995181	total: 3m 10s	remaining: 1m 54s
626:	learn: 0.1995079	total: 3m 11s	remaining: 1m 53s
627:	learn: 0.1995046	total: 3m 11s	remaining: 1m 53s
628:	learn: 0.1994973	total: 3m 11s	remaining: 1m 53s
629:	learn: 0.1994812	total: 3m 11s	remaining: 1m 52s
630:	learn: 0.1994704	total: 3m 12s	remaining: 1m 52s
631:	learn: 0.1994663	total: 3m 12s	remaining: 1m 52s
632:	learn: 0.1994613	total: 3m 12s	remaining: 1m 51s
633:	learn: 0.1994547	total: 3m 13s	remaining: 1m 51s
634:	learn: 0.1994340	total: 3m 13s	remaining: 1m 51s
635:	learn: 0.1994234	total: 3m 13s	remaining: 1m 50s
636:	learn: 0.1994140	total: 3m 13s	remaining: 1m 50s
637:	learn: 0.1994077	total: 3m 14s	remaining: 1m 50s
638:	learn: 0.1993928	total: 3m 14s	remaining: 1m 49s
639:	learn: 0.1993814	total: 3m 14s	remaining: 1m 49s
640:	learn: 0.1993475	total: 3m 15s	remaining: 1m 49s
641:	learn: 0.1993308	total: 3m 15s	remaining: 1m 48s
642:	learn: 0.1993268	total:

776:	learn: 0.1978104	total: 3m 58s	remaining: 1m 8s
777:	learn: 0.1978075	total: 3m 59s	remaining: 1m 8s
778:	learn: 0.1978034	total: 3m 59s	remaining: 1m 7s
779:	learn: 0.1977894	total: 3m 59s	remaining: 1m 7s
780:	learn: 0.1977877	total: 4m	remaining: 1m 7s
781:	learn: 0.1977653	total: 4m	remaining: 1m 7s
782:	learn: 0.1977621	total: 4m	remaining: 1m 6s
783:	learn: 0.1977397	total: 4m 1s	remaining: 1m 6s
784:	learn: 0.1977381	total: 4m 1s	remaining: 1m 6s
785:	learn: 0.1977255	total: 4m 1s	remaining: 1m 5s
786:	learn: 0.1977171	total: 4m 2s	remaining: 1m 5s
787:	learn: 0.1977031	total: 4m 2s	remaining: 1m 5s
788:	learn: 0.1976945	total: 4m 2s	remaining: 1m 4s
789:	learn: 0.1976887	total: 4m 3s	remaining: 1m 4s
790:	learn: 0.1976763	total: 4m 3s	remaining: 1m 4s
791:	learn: 0.1976674	total: 4m 3s	remaining: 1m 3s
792:	learn: 0.1976620	total: 4m 3s	remaining: 1m 3s
793:	learn: 0.1976421	total: 4m 4s	remaining: 1m 3s
794:	learn: 0.1976327	total: 4m 4s	remaining: 1m 3s
795:	learn: 0.197

933:	learn: 0.1963884	total: 4m 52s	remaining: 20.7s
934:	learn: 0.1963722	total: 4m 52s	remaining: 20.4s
935:	learn: 0.1963565	total: 4m 53s	remaining: 20s
936:	learn: 0.1963475	total: 4m 53s	remaining: 19.7s
937:	learn: 0.1963354	total: 4m 53s	remaining: 19.4s
938:	learn: 0.1963329	total: 4m 54s	remaining: 19.1s
939:	learn: 0.1963304	total: 4m 54s	remaining: 18.8s
940:	learn: 0.1963191	total: 4m 54s	remaining: 18.5s
941:	learn: 0.1963139	total: 4m 55s	remaining: 18.2s
942:	learn: 0.1963082	total: 4m 55s	remaining: 17.9s
943:	learn: 0.1962885	total: 4m 55s	remaining: 17.5s
944:	learn: 0.1962817	total: 4m 56s	remaining: 17.2s
945:	learn: 0.1962779	total: 4m 56s	remaining: 16.9s
946:	learn: 0.1962754	total: 4m 56s	remaining: 16.6s
947:	learn: 0.1962630	total: 4m 57s	remaining: 16.3s
948:	learn: 0.1962613	total: 4m 57s	remaining: 16s
949:	learn: 0.1962502	total: 4m 57s	remaining: 15.7s
950:	learn: 0.1962395	total: 4m 58s	remaining: 15.4s
951:	learn: 0.1962278	total: 4m 58s	remaining: 15s



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      9255
         1.0       0.98      0.06      0.12       745

   micro avg       0.93      0.93      0.93     10000
   macro avg       0.95      0.53      0.54     10000
weighted avg       0.93      0.93      0.90     10000

[[9254    1]
 [ 699   46]]
Accuracy is  93.0
Time on model's work: 1.156 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      9255
         1.0       1.00      0.06      0.11       745

   micro avg       0.93      0.93      0.93     10000
   macro avg       0.96      0.53      0.54     10000
weighted avg       0.93      0.93      0.90     10000

[[9255    0]
 [ 700   45]]
Accuracy is  93.0
Time on model's work: 0.846 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      9255
         1.0       0.65      0.05      0.09       745

   micro avg       0.93      0.93      0.93     10000
   macro avg       0.79      0.52      0.53     10000
weighted avg       0.91      0.93      0.90     10000

[[9235   20]
 [ 708   37]]
Accuracy is  92.72
Time on model's work: 137.038 s
TOTAL TIME SPENT:  2634.366 s


All models were not able to predict imbalanced clas '1' correctly. Let's use some methods to deal with imbalanced classes.

### 1. Resampling variant
a) delete instances from the over-represented class - under-sampling

In [3]:
# create 2 dataframe with all clicked rows and select the same number of rows as in '1' from '0' data 
data_1 = data[data['Is_lp_click'] == 1]
data_0 = data[data['Is_lp_click'] == 0].sample(35059, random_state = 3)
# concat 2 dataframes and shuffle it
data_concat = pd.concat([data_1, data_0])
data_under = data_concat.sample(frac=1)

In [5]:
# features & labels
labels = data_under['Is_lp_click']
features = data_under.drop('Is_lp_click', axis = 1)
# create the dictionary of unique attributes' lists
unique_attr = {}
for attr in features.columns:
    unique_attr[attr] = features[attr].unique().tolist()
unique_attr
# features encoding
encoder = preprocessing.OneHotEncoder(categories=[unique_attr[i] for i in unique_attr], sparse = False, handle_unknown='ignore')
encoder
features['CONCAT'] = features.values.tolist()
features['CONCAT'].head()
t0 = time()
features['ENCODED'] = [encoder.fit_transform([i]).flatten() for i in features['CONCAT']]
print ("time on encoding:", round(time()-t0, 3), "s")
# create the array with feature vectors
features_list = [list(i) for i in features['ENCODED']]
features_list_array = np.array(features_list)
# create the array with label vector 
labels_list_array = np.array(labels.tolist())

time on encoding: 69.695 s


In [6]:
X_train, X_test, y_train, y_test = train_test_split(features_list_array, labels_list_array, random_state=35, test_size=0.2)

In [7]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.75      0.69      0.72      6986
         1.0       0.71      0.77      0.74      7038

   micro avg       0.73      0.73      0.73     14024
   macro avg       0.73      0.73      0.73     14024
weighted avg       0.73      0.73      0.73     14024

[[4825 2161]
 [1624 5414]]
Accuracy is  73.01055333713634
Time on model's work: 20.163 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.89      0.62      0.73      6986
         1.0       0.71      0.92      0.80      7038

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.80      0.77      0.76     14024
weighted avg       0.80      0.77      0.76     14024

[[4304 2682]
 [ 537 6501]]
Accuracy is  77.04649172846548
Time on model's work: 736.97 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.72      0.71      0.72      6986
         1.0       0.72      0.73      0.72      7038

   micro avg       0.72      0.72      0.72     14024
   macro avg       0.72      0.72      0.72     14024
weighted avg       0.72      0.72      0.72     14024

[[4948 2038]
 [1895 5143]]
Accuracy is  71.95521962350256
Time on model's work: 23.708 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.87      0.62      0.73      6986
         1.0       0.71      0.91      0.80      7038

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.79      0.77      0.76     14024
weighted avg       0.79      0.77      0.76     14024

[[4354 2632]
 [ 633 6405]]
Accuracy is  76.71848260125499
Time on model's work: 139.741 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.75      0.68      0.72      6986




MLPClassifier
              precision    recall  f1-score   support

         0.0       0.75      0.68      0.71      6986
         1.0       0.71      0.77      0.74      7038

   micro avg       0.73      0.73      0.73     14024
   macro avg       0.73      0.73      0.72     14024
weighted avg       0.73      0.73      0.72     14024

[[4743 2243]
 [1611 5427]]
Accuracy is  72.51853964632059
Time on model's work: 1768.842 s
[18:10:47] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[18:10:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[18:10:57] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[18:11:02] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra

3:	learn: 0.6073020	total: 1.86s	remaining: 7m 44s
4:	learn: 0.5929036	total: 2.31s	remaining: 7m 39s
5:	learn: 0.5807676	total: 2.71s	remaining: 7m 29s
6:	learn: 0.5704816	total: 3.12s	remaining: 7m 22s
7:	learn: 0.5614763	total: 3.51s	remaining: 7m 15s
8:	learn: 0.5534719	total: 3.9s	remaining: 7m 9s
9:	learn: 0.5465642	total: 4.29s	remaining: 7m 4s
10:	learn: 0.5403775	total: 4.64s	remaining: 6m 57s
11:	learn: 0.5347500	total: 5.07s	remaining: 6m 57s
12:	learn: 0.5300839	total: 5.47s	remaining: 6m 55s
13:	learn: 0.5261025	total: 5.81s	remaining: 6m 49s
14:	learn: 0.5225917	total: 6.22s	remaining: 6m 48s
15:	learn: 0.5193816	total: 6.59s	remaining: 6m 45s
16:	learn: 0.5164804	total: 7s	remaining: 6m 44s
17:	learn: 0.5139630	total: 7.35s	remaining: 6m 41s
18:	learn: 0.5117275	total: 7.69s	remaining: 6m 37s
19:	learn: 0.5096184	total: 8.08s	remaining: 6m 35s
20:	learn: 0.5078931	total: 8.43s	remaining: 6m 33s
21:	learn: 0.5062018	total: 8.82s	remaining: 6m 32s
22:	learn: 0.5046138	tota

161:	learn: 0.4848040	total: 51s	remaining: 4m 23s
162:	learn: 0.4847738	total: 51.3s	remaining: 4m 23s
163:	learn: 0.4847216	total: 51.6s	remaining: 4m 23s
164:	learn: 0.4846974	total: 51.9s	remaining: 4m 22s
165:	learn: 0.4846378	total: 52.2s	remaining: 4m 22s
166:	learn: 0.4846175	total: 52.5s	remaining: 4m 21s
167:	learn: 0.4845995	total: 52.7s	remaining: 4m 21s
168:	learn: 0.4845735	total: 53.1s	remaining: 4m 20s
169:	learn: 0.4845582	total: 53.3s	remaining: 4m 20s
170:	learn: 0.4845465	total: 53.5s	remaining: 4m 19s
171:	learn: 0.4845393	total: 53.7s	remaining: 4m 18s
172:	learn: 0.4845261	total: 54s	remaining: 4m 17s
173:	learn: 0.4845077	total: 54.2s	remaining: 4m 17s
174:	learn: 0.4844643	total: 54.5s	remaining: 4m 17s
175:	learn: 0.4844475	total: 54.8s	remaining: 4m 16s
176:	learn: 0.4844131	total: 55.1s	remaining: 4m 16s
177:	learn: 0.4843956	total: 55.3s	remaining: 4m 15s
178:	learn: 0.4843829	total: 55.5s	remaining: 4m 14s
179:	learn: 0.4843682	total: 55.7s	remaining: 4m 1

315:	learn: 0.4814318	total: 1m 33s	remaining: 3m 21s
316:	learn: 0.4814203	total: 1m 33s	remaining: 3m 21s
317:	learn: 0.4813978	total: 1m 33s	remaining: 3m 20s
318:	learn: 0.4813840	total: 1m 33s	remaining: 3m 20s
319:	learn: 0.4813726	total: 1m 34s	remaining: 3m 19s
320:	learn: 0.4813555	total: 1m 34s	remaining: 3m 19s
321:	learn: 0.4813428	total: 1m 34s	remaining: 3m 19s
322:	learn: 0.4813327	total: 1m 34s	remaining: 3m 18s
323:	learn: 0.4813144	total: 1m 35s	remaining: 3m 18s
324:	learn: 0.4813016	total: 1m 35s	remaining: 3m 17s
325:	learn: 0.4812762	total: 1m 35s	remaining: 3m 17s
326:	learn: 0.4812591	total: 1m 35s	remaining: 3m 17s
327:	learn: 0.4812166	total: 1m 36s	remaining: 3m 16s
328:	learn: 0.4812051	total: 1m 36s	remaining: 3m 16s
329:	learn: 0.4811978	total: 1m 36s	remaining: 3m 16s
330:	learn: 0.4811807	total: 1m 36s	remaining: 3m 15s
331:	learn: 0.4811578	total: 1m 37s	remaining: 3m 15s
332:	learn: 0.4811309	total: 1m 37s	remaining: 3m 15s
333:	learn: 0.4810925	total:

469:	learn: 0.4791794	total: 2m 13s	remaining: 2m 30s
470:	learn: 0.4791645	total: 2m 13s	remaining: 2m 30s
471:	learn: 0.4790972	total: 2m 14s	remaining: 2m 29s
472:	learn: 0.4790878	total: 2m 14s	remaining: 2m 29s
473:	learn: 0.4790814	total: 2m 14s	remaining: 2m 29s
474:	learn: 0.4790575	total: 2m 14s	remaining: 2m 28s
475:	learn: 0.4790503	total: 2m 15s	remaining: 2m 28s
476:	learn: 0.4790439	total: 2m 15s	remaining: 2m 28s
477:	learn: 0.4790347	total: 2m 15s	remaining: 2m 28s
478:	learn: 0.4790221	total: 2m 15s	remaining: 2m 27s
479:	learn: 0.4790137	total: 2m 16s	remaining: 2m 27s
480:	learn: 0.4790035	total: 2m 16s	remaining: 2m 27s
481:	learn: 0.4789832	total: 2m 16s	remaining: 2m 26s
482:	learn: 0.4789730	total: 2m 16s	remaining: 2m 26s
483:	learn: 0.4789654	total: 2m 17s	remaining: 2m 26s
484:	learn: 0.4789568	total: 2m 17s	remaining: 2m 26s
485:	learn: 0.4789402	total: 2m 17s	remaining: 2m 25s
486:	learn: 0.4789215	total: 2m 18s	remaining: 2m 25s
487:	learn: 0.4789135	total:

622:	learn: 0.4770739	total: 2m 58s	remaining: 1m 47s
623:	learn: 0.4770671	total: 2m 58s	remaining: 1m 47s
624:	learn: 0.4770502	total: 2m 59s	remaining: 1m 47s
625:	learn: 0.4770359	total: 2m 59s	remaining: 1m 47s
626:	learn: 0.4770248	total: 2m 59s	remaining: 1m 46s
627:	learn: 0.4770198	total: 3m	remaining: 1m 46s
628:	learn: 0.4770151	total: 3m	remaining: 1m 46s
629:	learn: 0.4770035	total: 3m	remaining: 1m 46s
630:	learn: 0.4769957	total: 3m 1s	remaining: 1m 46s
631:	learn: 0.4769461	total: 3m 1s	remaining: 1m 45s
632:	learn: 0.4769377	total: 3m 1s	remaining: 1m 45s
633:	learn: 0.4769295	total: 3m 2s	remaining: 1m 45s
634:	learn: 0.4769128	total: 3m 2s	remaining: 1m 44s
635:	learn: 0.4769072	total: 3m 2s	remaining: 1m 44s
636:	learn: 0.4768923	total: 3m 3s	remaining: 1m 44s
637:	learn: 0.4768834	total: 3m 3s	remaining: 1m 44s
638:	learn: 0.4768712	total: 3m 3s	remaining: 1m 43s
639:	learn: 0.4768628	total: 3m 3s	remaining: 1m 43s
640:	learn: 0.4768387	total: 3m 4s	remaining: 1m 4

775:	learn: 0.4752065	total: 3m 43s	remaining: 1m 4s
776:	learn: 0.4751937	total: 3m 43s	remaining: 1m 4s
777:	learn: 0.4751860	total: 3m 43s	remaining: 1m 3s
778:	learn: 0.4751765	total: 3m 43s	remaining: 1m 3s
779:	learn: 0.4751537	total: 3m 44s	remaining: 1m 3s
780:	learn: 0.4751386	total: 3m 44s	remaining: 1m 2s
781:	learn: 0.4751298	total: 3m 44s	remaining: 1m 2s
782:	learn: 0.4751209	total: 3m 45s	remaining: 1m 2s
783:	learn: 0.4751136	total: 3m 45s	remaining: 1m 2s
784:	learn: 0.4751037	total: 3m 45s	remaining: 1m 1s
785:	learn: 0.4750920	total: 3m 45s	remaining: 1m 1s
786:	learn: 0.4750770	total: 3m 46s	remaining: 1m 1s
787:	learn: 0.4750690	total: 3m 46s	remaining: 1m
788:	learn: 0.4750594	total: 3m 46s	remaining: 1m
789:	learn: 0.4750525	total: 3m 46s	remaining: 1m
790:	learn: 0.4750404	total: 3m 47s	remaining: 1m
791:	learn: 0.4750298	total: 3m 47s	remaining: 59.7s
792:	learn: 0.4750060	total: 3m 47s	remaining: 59.5s
793:	learn: 0.4749946	total: 3m 48s	remaining: 59.2s
794:	

932:	learn: 0.4734175	total: 4m 26s	remaining: 19.2s
933:	learn: 0.4733987	total: 4m 27s	remaining: 18.9s
934:	learn: 0.4733912	total: 4m 27s	remaining: 18.6s
935:	learn: 0.4733775	total: 4m 27s	remaining: 18.3s
936:	learn: 0.4733671	total: 4m 28s	remaining: 18s
937:	learn: 0.4733570	total: 4m 28s	remaining: 17.7s
938:	learn: 0.4733511	total: 4m 28s	remaining: 17.5s
939:	learn: 0.4733459	total: 4m 29s	remaining: 17.2s
940:	learn: 0.4733149	total: 4m 29s	remaining: 16.9s
941:	learn: 0.4733088	total: 4m 29s	remaining: 16.6s
942:	learn: 0.4732921	total: 4m 30s	remaining: 16.3s
943:	learn: 0.4732793	total: 4m 30s	remaining: 16s
944:	learn: 0.4732761	total: 4m 30s	remaining: 15.8s
945:	learn: 0.4732722	total: 4m 30s	remaining: 15.5s
946:	learn: 0.4732610	total: 4m 31s	remaining: 15.2s
947:	learn: 0.4732488	total: 4m 31s	remaining: 14.9s
948:	learn: 0.4732428	total: 4m 31s	remaining: 14.6s
949:	learn: 0.4732351	total: 4m 32s	remaining: 14.3s
950:	learn: 0.4732288	total: 4m 32s	remaining: 14s



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73      6986
         1.0       0.71      0.90      0.80      7038

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.79      0.77      0.76     14024
weighted avg       0.79      0.77      0.76     14024

[[4427 2559]
 [ 693 6345]]
Accuracy is  76.81118083285796
Time on model's work: 3.187 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.56      0.92      0.70      6986
         1.0       0.78      0.28      0.41      7038

   micro avg       0.60      0.60      0.60     14024
   macro avg       0.67      0.60      0.55     14024
weighted avg       0.67      0.60      0.55     14024

[[6432  554]
 [5066 1972]]
Accuracy is  59.92584141471763
Time on model's work: 1.925 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73      6986
         1.0       0.71      0.89      0.79      7038

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.78      0.76      0.76     14024
weighted avg       0.78      0.76      0.76     14024

[[4434 2552]
 [ 749 6289]]
Accuracy is  76.46177980604678
Time on model's work: 229.59 s
TOTAL TIME SPENT:  3863.884 s


In [7]:
# TFFM sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)

In [12]:
# input_type='sparse' /// rank == 10
for order in [2, 3]:
    model = TFFMClassifier(
        order=order, 
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=512,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:26<00:00,  1.93epoch/s]


[order=2] accuracy: 0.7620507701083856
[[4428 2613]
 [ 724 6259]]
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73      7041
         1.0       0.71      0.90      0.79      6983

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.78      0.76      0.76     14024
weighted avg       0.78      0.76      0.76     14024



100%|██████████| 50/50 [00:49<00:00,  1.02epoch/s]


[order=3] accuracy: 0.761908157444381
[[4448 2593]
 [ 746 6237]]
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73      7041
         1.0       0.71      0.89      0.79      6983

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.78      0.76      0.76     14024
weighted avg       0.78      0.76      0.76     14024



In [22]:
# FtrlOptimizer
# input_type='sparse' /// rank == 10
model = TFFMClassifier(
    order=2, 
    rank=10, 
    optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()


  0%|          | 0/50 [00:00<?, ?epoch/s]
  2%|▏         | 1/50 [00:00<00:30,  1.63epoch/s]
  4%|▍         | 2/50 [00:01<00:27,  1.75epoch/s]
  6%|▌         | 3/50 [00:01<00:25,  1.84epoch/s]
  8%|▊         | 4/50 [00:02<00:24,  1.91epoch/s]
 10%|█         | 5/50 [00:02<00:22,  1.96epoch/s]
 12%|█▏        | 6/50 [00:02<00:21,  2.00epoch/s]
 14%|█▍        | 7/50 [00:03<00:21,  2.02epoch/s]
 16%|█▌        | 8/50 [00:03<00:20,  2.03epoch/s]
 18%|█▊        | 9/50 [00:04<00:20,  2.03epoch/s]
 20%|██        | 10/50 [00:04<00:19,  2.04epoch/s]
 22%|██▏       | 11/50 [00:05<00:18,  2.06epoch/s]
 24%|██▍       | 12/50 [00:05<00:18,  2.07epoch/s]
 26%|██▌       | 13/50 [00:06<00:17,  2.08epoch/s]
 28%|██▊       | 14/50 [00:06<00:17,  2.08epoch/s]
 30%|███       | 15/50 [00:07<00:16,  2.08epoch/s]
 32%|███▏      | 16/50 [00:07<00:16,  2.09epoch/s]
 34%|███▍      | 17/50 [00:08<00:15,  2.09epoch/s]
 36%|███▌      | 18/50 [00:08<00:15,  2.10epoch/s]
 38%|███▊      | 19/50 [00:09<00:15,  2.06epoch/

[order=3] accuracy: 0.7507130633200229
[[4336 2705]
 [ 791 6192]]
              precision    recall  f1-score   support

         0.0       0.85      0.62      0.71      7041
         1.0       0.70      0.89      0.78      6983

   micro avg       0.75      0.75      0.75     14024
   macro avg       0.77      0.75      0.75     14024
weighted avg       0.77      0.75      0.75     14024



In [26]:
# sample_weight='balanced'
model = TFFMClassifier(
    order=2,
    sample_weight='balanced',
    rank=10, 
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()


  0%|          | 0/50 [00:00<?, ?epoch/s]
  2%|▏         | 1/50 [00:00<00:29,  1.68epoch/s]
  4%|▍         | 2/50 [00:01<00:26,  1.79epoch/s]
  6%|▌         | 3/50 [00:01<00:24,  1.88epoch/s]
  8%|▊         | 4/50 [00:02<00:23,  1.95epoch/s]
 10%|█         | 5/50 [00:02<00:22,  2.00epoch/s]
 12%|█▏        | 6/50 [00:02<00:21,  2.04epoch/s]
 14%|█▍        | 7/50 [00:03<00:20,  2.07epoch/s]
 16%|█▌        | 8/50 [00:03<00:20,  2.08epoch/s]
 18%|█▊        | 9/50 [00:04<00:19,  2.10epoch/s]
 20%|██        | 10/50 [00:04<00:19,  2.10epoch/s]
 22%|██▏       | 11/50 [00:05<00:18,  2.11epoch/s]
 24%|██▍       | 12/50 [00:05<00:17,  2.12epoch/s]
 26%|██▌       | 13/50 [00:06<00:17,  2.11epoch/s]
 28%|██▊       | 14/50 [00:06<00:17,  2.07epoch/s]
 30%|███       | 15/50 [00:07<00:17,  2.05epoch/s]
 32%|███▏      | 16/50 [00:07<00:16,  2.05epoch/s]
 34%|███▍      | 17/50 [00:08<00:16,  2.05epoch/s]
 36%|███▌      | 18/50 [00:08<00:15,  2.02epoch/s]
 38%|███▊      | 19/50 [00:09<00:15,  2.04epoch/

[order=3] accuracy: 0.7623359954363947
[[4427 2614]
 [ 719 6264]]
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73      7041
         1.0       0.71      0.90      0.79      6983

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.78      0.76      0.76     14024
weighted avg       0.78      0.76      0.76     14024



In [28]:
# weight - optional
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=20, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()


  0%|          | 0/20 [00:00<?, ?epoch/s]
  5%|▌         | 1/20 [00:00<00:11,  1.66epoch/s]
 10%|█         | 2/20 [00:01<00:10,  1.78epoch/s]
 15%|█▌        | 3/20 [00:01<00:09,  1.87epoch/s]
 20%|██        | 4/20 [00:02<00:08,  1.94epoch/s]
 25%|██▌       | 5/20 [00:02<00:07,  1.99epoch/s]
 30%|███       | 6/20 [00:02<00:06,  2.03epoch/s]
 35%|███▌      | 7/20 [00:03<00:06,  2.05epoch/s]
 40%|████      | 8/20 [00:03<00:05,  2.07epoch/s]
 45%|████▌     | 9/20 [00:04<00:05,  2.08epoch/s]
 50%|█████     | 10/20 [00:04<00:04,  2.10epoch/s]
 55%|█████▌    | 11/20 [00:05<00:04,  2.10epoch/s]
 60%|██████    | 12/20 [00:05<00:03,  2.07epoch/s]
 65%|██████▌   | 13/20 [00:06<00:03,  2.07epoch/s]
 70%|███████   | 14/20 [00:06<00:02,  2.08epoch/s]
 75%|███████▌  | 15/20 [00:07<00:02,  2.09epoch/s]
 80%|████████  | 16/20 [00:07<00:01,  2.10epoch/s]
 85%|████████▌ | 17/20 [00:08<00:01,  2.10epoch/s]
 90%|█████████ | 18/20 [00:08<00:00,  2.11epoch/s]
 95%|█████████▌| 19/20 [00:09<00:00,  2.11epoch/

[order=3] accuracy: 0.7638334284084427
[[4509 2532]
 [ 780 6203]]
              precision    recall  f1-score   support

         0.0       0.85      0.64      0.73      7041
         1.0       0.71      0.89      0.79      6983

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.78      0.76      0.76     14024
weighted avg       0.78      0.76      0.76     14024




  0%|          | 0/20 [00:00<?, ?epoch/s]
  5%|▌         | 1/20 [00:00<00:11,  1.67epoch/s]
 10%|█         | 2/20 [00:01<00:10,  1.77epoch/s]
 15%|█▌        | 3/20 [00:01<00:09,  1.87epoch/s]
 20%|██        | 4/20 [00:02<00:08,  1.94epoch/s]
 25%|██▌       | 5/20 [00:02<00:07,  1.99epoch/s]
 30%|███       | 6/20 [00:02<00:06,  2.03epoch/s]
 35%|███▌      | 7/20 [00:03<00:06,  2.06epoch/s]
 40%|████      | 8/20 [00:03<00:05,  2.08epoch/s]
 45%|████▌     | 9/20 [00:04<00:05,  2.09epoch/s]
 50%|█████     | 10/20 [00:04<00:04,  2.07epoch/s]
 55%|█████▌    | 11/20 [00:05<00:04,  2.09epoch/s]
 60%|██████    | 12/20 [00:05<00:03,  2.10epoch/s]
 65%|██████▌   | 13/20 [00:06<00:03,  2.11epoch/s]
 70%|███████   | 14/20 [00:06<00:02,  2.11epoch/s]
 75%|███████▌  | 15/20 [00:07<00:02,  2.09epoch/s]
 80%|████████  | 16/20 [00:07<00:01,  2.09epoch/s]
 85%|████████▌ | 17/20 [00:08<00:01,  2.07epoch/s]
 90%|█████████ | 18/20 [00:08<00:01,  1.93epoch/s]
 95%|█████████▌| 19/20 [00:09<00:00,  1.97epoch/

[order=3] accuracy: 0.7651169423844837
[[4292 2749]
 [ 545 6438]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72      7041
         1.0       0.70      0.92      0.80      6983

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.79      0.77      0.76     14024
weighted avg       0.79      0.77      0.76     14024




  0%|          | 0/20 [00:00<?, ?epoch/s]
  5%|▌         | 1/20 [00:00<00:11,  1.69epoch/s]
 10%|█         | 2/20 [00:01<00:09,  1.81epoch/s]
 15%|█▌        | 3/20 [00:01<00:09,  1.86epoch/s]
 20%|██        | 4/20 [00:02<00:08,  1.84epoch/s]
 25%|██▌       | 5/20 [00:02<00:07,  1.88epoch/s]
 30%|███       | 6/20 [00:03<00:07,  1.89epoch/s]
 35%|███▌      | 7/20 [00:03<00:06,  1.93epoch/s]
 40%|████      | 8/20 [00:04<00:06,  1.98epoch/s]
 45%|████▌     | 9/20 [00:04<00:05,  1.99epoch/s]
 50%|█████     | 10/20 [00:05<00:05,  1.98epoch/s]
 55%|█████▌    | 11/20 [00:05<00:04,  1.99epoch/s]
 60%|██████    | 12/20 [00:06<00:04,  1.86epoch/s]
 65%|██████▌   | 13/20 [00:06<00:03,  1.93epoch/s]
 70%|███████   | 14/20 [00:07<00:03,  1.97epoch/s]
 75%|███████▌  | 15/20 [00:07<00:02,  2.02epoch/s]
 80%|████████  | 16/20 [00:08<00:01,  2.05epoch/s]
 85%|████████▌ | 17/20 [00:08<00:01,  2.08epoch/s]
 90%|█████████ | 18/20 [00:09<00:00,  2.10epoch/s]
 95%|█████████▌| 19/20 [00:09<00:00,  2.11epoch/

[order=3] accuracy: 0.7600541928123218
[[4155 2886]
 [ 479 6504]]
              precision    recall  f1-score   support

         0.0       0.90      0.59      0.71      7041
         1.0       0.69      0.93      0.79      6983

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.79      0.76      0.75     14024
weighted avg       0.80      0.76      0.75     14024




  0%|          | 0/20 [00:00<?, ?epoch/s]
  5%|▌         | 1/20 [00:00<00:13,  1.42epoch/s]
 10%|█         | 2/20 [00:01<00:11,  1.56epoch/s]
 15%|█▌        | 3/20 [00:01<00:10,  1.66epoch/s]
 20%|██        | 4/20 [00:02<00:09,  1.76epoch/s]
 25%|██▌       | 5/20 [00:02<00:08,  1.84epoch/s]
 30%|███       | 6/20 [00:03<00:07,  1.91epoch/s]
 35%|███▌      | 7/20 [00:03<00:06,  1.94epoch/s]
 40%|████      | 8/20 [00:04<00:06,  1.96epoch/s]
 45%|████▌     | 9/20 [00:04<00:05,  2.00epoch/s]
 50%|█████     | 10/20 [00:05<00:04,  2.03epoch/s]
 55%|█████▌    | 11/20 [00:05<00:04,  2.04epoch/s]
 60%|██████    | 12/20 [00:06<00:03,  2.04epoch/s]
 65%|██████▌   | 13/20 [00:06<00:03,  2.02epoch/s]
 70%|███████   | 14/20 [00:07<00:02,  2.03epoch/s]
 75%|███████▌  | 15/20 [00:07<00:02,  2.05epoch/s]
 80%|████████  | 16/20 [00:08<00:01,  2.07epoch/s]
 85%|████████▌ | 17/20 [00:08<00:01,  2.04epoch/s]
 90%|█████████ | 18/20 [00:09<00:00,  2.06epoch/s]
 95%|█████████▌| 19/20 [00:09<00:00,  2.07epoch/

[order=3] accuracy: 0.7501426126640046
[[3960 3081]
 [ 423 6560]]
              precision    recall  f1-score   support

         0.0       0.90      0.56      0.69      7041
         1.0       0.68      0.94      0.79      6983

   micro avg       0.75      0.75      0.75     14024
   macro avg       0.79      0.75      0.74     14024
weighted avg       0.79      0.75      0.74     14024



In [39]:
# weight - 2.0 best
model = TFFMClassifier(
    order=2,
    pos_class_weight=2.0,
    rank=10, 
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
    n_epochs=100, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()


  0%|          | 0/100 [00:00<?, ?epoch/s]
  1%|          | 1/100 [00:00<01:00,  1.63epoch/s]
  2%|▏         | 2/100 [00:01<00:56,  1.74epoch/s]
  3%|▎         | 3/100 [00:01<00:52,  1.84epoch/s]
  4%|▍         | 4/100 [00:02<00:50,  1.91epoch/s]
  5%|▌         | 5/100 [00:02<00:48,  1.97epoch/s]
  6%|▌         | 6/100 [00:03<00:47,  1.99epoch/s]
  7%|▋         | 7/100 [00:03<00:46,  2.01epoch/s]
  8%|▊         | 8/100 [00:03<00:45,  2.03epoch/s]
  9%|▉         | 9/100 [00:04<00:44,  2.05epoch/s]
 10%|█         | 10/100 [00:04<00:43,  2.07epoch/s]
 11%|█         | 11/100 [00:05<00:43,  2.07epoch/s]
 12%|█▏        | 12/100 [00:05<00:42,  2.07epoch/s]
 13%|█▎        | 13/100 [00:06<00:42,  2.06epoch/s]
 14%|█▍        | 14/100 [00:06<00:41,  2.07epoch/s]
 15%|█▌        | 15/100 [00:07<00:40,  2.09epoch/s]
 16%|█▌        | 16/100 [00:07<00:40,  2.08epoch/s]
 17%|█▋        | 17/100 [00:08<00:39,  2.09epoch/s]
 18%|█▊        | 18/100 [00:08<00:39,  2.09epoch/s]
 19%|█▉        | 19/100 [00:0

[order=3] accuracy: 0.7580576155162578
[[4204 2837]
 [ 556 6427]]
              precision    recall  f1-score   support

         0.0       0.88      0.60      0.71      7041
         1.0       0.69      0.92      0.79      6983

   micro avg       0.76      0.76      0.76     14024
   macro avg       0.79      0.76      0.75     14024
weighted avg       0.79      0.76      0.75     14024



In [42]:
# BEST OPTIMIZED TFFM
model = TFFMClassifier(
    order=2, 
    rank=10,
    pos_class_weight=2.0,
    optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
    n_epochs=100, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()


  0%|          | 0/100 [00:00<?, ?epoch/s]
  1%|          | 1/100 [00:00<01:02,  1.58epoch/s]
  2%|▏         | 2/100 [00:01<01:01,  1.60epoch/s]
  3%|▎         | 3/100 [00:01<00:57,  1.69epoch/s]
  4%|▍         | 4/100 [00:02<00:53,  1.79epoch/s]
  5%|▌         | 5/100 [00:02<00:50,  1.87epoch/s]
  6%|▌         | 6/100 [00:03<00:48,  1.93epoch/s]
  7%|▋         | 7/100 [00:03<00:47,  1.97epoch/s]
  8%|▊         | 8/100 [00:04<00:46,  2.00epoch/s]
  9%|▉         | 9/100 [00:04<00:45,  2.01epoch/s]
 10%|█         | 10/100 [00:05<00:44,  2.02epoch/s]
 11%|█         | 11/100 [00:05<00:43,  2.03epoch/s]
 12%|█▏        | 12/100 [00:06<00:43,  2.03epoch/s]
 13%|█▎        | 13/100 [00:06<00:42,  2.03epoch/s]
 14%|█▍        | 14/100 [00:07<00:42,  2.03epoch/s]
 15%|█▌        | 15/100 [00:07<00:41,  2.05epoch/s]
 16%|█▌        | 16/100 [00:08<00:41,  2.01epoch/s]
 17%|█▋        | 17/100 [00:08<00:41,  1.99epoch/s]
 18%|█▊        | 18/100 [00:09<00:40,  2.00epoch/s]
 19%|█▉        | 19/100 [00:0

[order=3] accuracy: 0.7678265830005705
[[4326 2715]
 [ 541 6442]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.73      7041
         1.0       0.70      0.92      0.80      6983

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.80      0.77      0.76     14024
weighted avg       0.80      0.77      0.76     14024



In [44]:
# KERAS
model = Sequential()
model.add(Dense(128, input_dim=2454, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


Epoch 19/20
Epoch 20/20
[0.49118000042826127, 0.767969195528569]


b) add copies of instances from the under-represented class - over-sampling 

In [3]:
# create 2 dataframe with all clicked rows and select the same number of rows as in '1' from '0' data 
data_1 = data[data['Is_lp_click'] == 1]
data_1_add = data[data['Is_lp_click'] == 1].sample(15000, random_state = 32)
data_0 = data[data['Is_lp_click'] == 0].sample(50000, random_state = 12)
# concat 2 dataframes and shuffle it
data_concat = pd.concat([data_1, data_1_add, data_0])
data_under = data_concat.sample(frac=1)

In [5]:
# features & labels
labels = data_under['Is_lp_click']
features = data_under.drop('Is_lp_click', axis = 1)
# create the dictionary of unique attributes' lists
unique_attr = {}
for attr in features.columns:
    unique_attr[attr] = features[attr].unique().tolist()
unique_attr
# features encoding
encoder = preprocessing.OneHotEncoder(categories=[unique_attr[i] for i in unique_attr], sparse = False, handle_unknown='ignore')
encoder
features['CONCAT'] = features.values.tolist()
features['CONCAT'].head()
t0 = time()
features['ENCODED'] = [encoder.fit_transform([i]).flatten() for i in features['CONCAT']]
print ("time on encoding:", round(time()-t0, 3), "s")
# create the array with feature vectors
features_list = [list(i) for i in features['ENCODED']]
features_list_array = np.array(features_list)
# create the array with label vector 
labels_list_array = np.array(labels.tolist())

time on encoding: 104.521 s


In [6]:
X_train, X_test, y_train, y_test = train_test_split(features_list_array, labels_list_array, random_state=35, test_size=0.3)

In [7]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.78      0.69      0.73     15014
         1.0       0.72      0.81      0.76     15004

   micro avg       0.75      0.75      0.75     30018
   macro avg       0.75      0.75      0.75     30018
weighted avg       0.75      0.75      0.75     30018

[[10350  4664]
 [ 2894 12110]]
Accuracy is  74.82177360250516
Time on model's work: 20.968 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018

[[ 9306  5708]
 [ 1217 13787]]
Accuracy is  76.93050836164967
Time on model's work: 1198.169 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.76      0.71      0.73     15014
         1.0       0.73      0.77      0.75     15004

   micro avg       0.74      0.74      0.74     30018
   macro avg       0.74      0.74      0.74     30018
weighted avg       0.74      0.74      0.74     30018

[[10698  4316]
 [ 3405 11599]]
Accuracy is  74.27876607368911
Time on model's work: 45.195 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.63      0.73     15014
         1.0       0.71      0.91      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018

[[ 9384  5630]
 [ 1338 13666]]
Accuracy is  76.78726097674729
Time on model's work: 462.168 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.79      0.69      0.73   



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.76      0.71      0.73     15014
         1.0       0.73      0.77      0.75     15004

   micro avg       0.74      0.74      0.74     30018
   macro avg       0.74      0.74      0.74     30018
weighted avg       0.74      0.74      0.74     30018

[[10708  4306]
 [ 3453 11551]]
Accuracy is  74.1521753614498
Time on model's work: 2549.128 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018

[[ 9299  5715]
 [ 1212 13792]]
Accuracy is  76.92384569258445
Time on model's work: 561.64 s
0:	learn: 0.6642920	total: 567ms	remaining: 9m 26s
1:	learn: 0.6401349	total: 1.05s	remaining: 8m 45s
2:	learn: 0.6194234	total: 1.

138:	learn: 0.4837375	total: 49.2s	remaining: 5m 4s
139:	learn: 0.4837155	total: 49.4s	remaining: 5m 3s
140:	learn: 0.4836834	total: 49.7s	remaining: 5m 2s
141:	learn: 0.4836093	total: 50.1s	remaining: 5m 2s
142:	learn: 0.4835482	total: 50.5s	remaining: 5m 2s
143:	learn: 0.4835140	total: 50.8s	remaining: 5m 1s
144:	learn: 0.4834770	total: 51.1s	remaining: 5m 1s
145:	learn: 0.4834445	total: 51.4s	remaining: 5m
146:	learn: 0.4834176	total: 51.7s	remaining: 4m 59s
147:	learn: 0.4833954	total: 52s	remaining: 4m 59s
148:	learn: 0.4833802	total: 52.2s	remaining: 4m 58s
149:	learn: 0.4833616	total: 52.4s	remaining: 4m 57s
150:	learn: 0.4833560	total: 52.7s	remaining: 4m 56s
151:	learn: 0.4833229	total: 52.9s	remaining: 4m 55s
152:	learn: 0.4832516	total: 53.3s	remaining: 4m 55s
153:	learn: 0.4832073	total: 53.7s	remaining: 4m 54s
154:	learn: 0.4831749	total: 54s	remaining: 4m 54s
155:	learn: 0.4831533	total: 54.2s	remaining: 4m 53s
156:	learn: 0.4831363	total: 54.5s	remaining: 4m 52s
157:	lea

293:	learn: 0.4803354	total: 1m 31s	remaining: 3m 38s
294:	learn: 0.4803097	total: 1m 31s	remaining: 3m 38s
295:	learn: 0.4803002	total: 1m 31s	remaining: 3m 38s
296:	learn: 0.4802835	total: 1m 32s	remaining: 3m 38s
297:	learn: 0.4802666	total: 1m 32s	remaining: 3m 37s
298:	learn: 0.4802399	total: 1m 32s	remaining: 3m 37s
299:	learn: 0.4802152	total: 1m 32s	remaining: 3m 36s
300:	learn: 0.4802043	total: 1m 33s	remaining: 3m 36s
301:	learn: 0.4801861	total: 1m 33s	remaining: 3m 36s
302:	learn: 0.4801756	total: 1m 33s	remaining: 3m 35s
303:	learn: 0.4801657	total: 1m 34s	remaining: 3m 35s
304:	learn: 0.4801578	total: 1m 34s	remaining: 3m 34s
305:	learn: 0.4801197	total: 1m 34s	remaining: 3m 34s
306:	learn: 0.4800960	total: 1m 35s	remaining: 3m 34s
307:	learn: 0.4800737	total: 1m 35s	remaining: 3m 34s
308:	learn: 0.4800673	total: 1m 35s	remaining: 3m 33s
309:	learn: 0.4800492	total: 1m 35s	remaining: 3m 33s
310:	learn: 0.4800169	total: 1m 36s	remaining: 3m 33s
311:	learn: 0.4800055	total:

447:	learn: 0.4778118	total: 2m 14s	remaining: 2m 45s
448:	learn: 0.4777935	total: 2m 14s	remaining: 2m 45s
449:	learn: 0.4777876	total: 2m 14s	remaining: 2m 44s
450:	learn: 0.4777435	total: 2m 15s	remaining: 2m 44s
451:	learn: 0.4777258	total: 2m 15s	remaining: 2m 44s
452:	learn: 0.4777171	total: 2m 15s	remaining: 2m 43s
453:	learn: 0.4777005	total: 2m 16s	remaining: 2m 43s
454:	learn: 0.4776853	total: 2m 16s	remaining: 2m 43s
455:	learn: 0.4776756	total: 2m 16s	remaining: 2m 42s
456:	learn: 0.4776674	total: 2m 16s	remaining: 2m 42s
457:	learn: 0.4776558	total: 2m 17s	remaining: 2m 42s
458:	learn: 0.4776486	total: 2m 17s	remaining: 2m 41s
459:	learn: 0.4776277	total: 2m 17s	remaining: 2m 41s
460:	learn: 0.4776186	total: 2m 17s	remaining: 2m 41s
461:	learn: 0.4775528	total: 2m 18s	remaining: 2m 40s
462:	learn: 0.4775355	total: 2m 18s	remaining: 2m 40s
463:	learn: 0.4775096	total: 2m 18s	remaining: 2m 40s
464:	learn: 0.4775010	total: 2m 19s	remaining: 2m 39s
465:	learn: 0.4774861	total:

600:	learn: 0.4755396	total: 2m 56s	remaining: 1m 57s
601:	learn: 0.4755310	total: 2m 57s	remaining: 1m 57s
602:	learn: 0.4755196	total: 2m 57s	remaining: 1m 56s
603:	learn: 0.4755143	total: 2m 57s	remaining: 1m 56s
604:	learn: 0.4754827	total: 2m 57s	remaining: 1m 56s
605:	learn: 0.4754510	total: 2m 58s	remaining: 1m 55s
606:	learn: 0.4754436	total: 2m 58s	remaining: 1m 55s
607:	learn: 0.4754340	total: 2m 58s	remaining: 1m 55s
608:	learn: 0.4754233	total: 2m 58s	remaining: 1m 54s
609:	learn: 0.4754085	total: 2m 59s	remaining: 1m 54s
610:	learn: 0.4753954	total: 2m 59s	remaining: 1m 54s
611:	learn: 0.4753915	total: 2m 59s	remaining: 1m 53s
612:	learn: 0.4753681	total: 3m	remaining: 1m 53s
613:	learn: 0.4753613	total: 3m	remaining: 1m 53s
614:	learn: 0.4753493	total: 3m	remaining: 1m 53s
615:	learn: 0.4753409	total: 3m	remaining: 1m 52s
616:	learn: 0.4753266	total: 3m 1s	remaining: 1m 52s
617:	learn: 0.4753094	total: 3m 1s	remaining: 1m 52s
618:	learn: 0.4752897	total: 3m 1s	remaining: 

753:	learn: 0.4736116	total: 3m 40s	remaining: 1m 11s
754:	learn: 0.4735992	total: 3m 40s	remaining: 1m 11s
755:	learn: 0.4735886	total: 3m 40s	remaining: 1m 11s
756:	learn: 0.4735818	total: 3m 41s	remaining: 1m 10s
757:	learn: 0.4735782	total: 3m 41s	remaining: 1m 10s
758:	learn: 0.4735662	total: 3m 41s	remaining: 1m 10s
759:	learn: 0.4735611	total: 3m 41s	remaining: 1m 10s
760:	learn: 0.4735549	total: 3m 41s	remaining: 1m 9s
761:	learn: 0.4735487	total: 3m 42s	remaining: 1m 9s
762:	learn: 0.4735436	total: 3m 42s	remaining: 1m 9s
763:	learn: 0.4735239	total: 3m 42s	remaining: 1m 8s
764:	learn: 0.4735180	total: 3m 43s	remaining: 1m 8s
765:	learn: 0.4735127	total: 3m 43s	remaining: 1m 8s
766:	learn: 0.4734564	total: 3m 43s	remaining: 1m 7s
767:	learn: 0.4734504	total: 3m 43s	remaining: 1m 7s
768:	learn: 0.4734375	total: 3m 44s	remaining: 1m 7s
769:	learn: 0.4734296	total: 3m 44s	remaining: 1m 7s
770:	learn: 0.4734230	total: 3m 44s	remaining: 1m 6s
771:	learn: 0.4733770	total: 3m 44s	rem

909:	learn: 0.4717107	total: 4m 23s	remaining: 26.1s
910:	learn: 0.4717044	total: 4m 23s	remaining: 25.8s
911:	learn: 0.4717013	total: 4m 23s	remaining: 25.5s
912:	learn: 0.4716944	total: 4m 24s	remaining: 25.2s
913:	learn: 0.4716879	total: 4m 24s	remaining: 24.9s
914:	learn: 0.4716543	total: 4m 24s	remaining: 24.6s
915:	learn: 0.4716474	total: 4m 24s	remaining: 24.3s
916:	learn: 0.4716411	total: 4m 25s	remaining: 24s
917:	learn: 0.4716287	total: 4m 25s	remaining: 23.7s
918:	learn: 0.4716219	total: 4m 25s	remaining: 23.4s
919:	learn: 0.4716119	total: 4m 25s	remaining: 23.1s
920:	learn: 0.4716025	total: 4m 26s	remaining: 22.8s
921:	learn: 0.4715980	total: 4m 26s	remaining: 22.5s
922:	learn: 0.4715876	total: 4m 26s	remaining: 22.3s
923:	learn: 0.4715756	total: 4m 27s	remaining: 22s
924:	learn: 0.4715558	total: 4m 27s	remaining: 21.7s
925:	learn: 0.4715512	total: 4m 27s	remaining: 21.4s
926:	learn: 0.4715465	total: 4m 27s	remaining: 21.1s
927:	learn: 0.4715340	total: 4m 28s	remaining: 20.



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.87      0.63      0.73     15014
         1.0       0.71      0.90      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018

[[ 9529  5485]
 [ 1478 13526]]
Accuracy is  76.80391764941035
Time on model's work: 3.672 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018

[[ 9337  5677]
 [ 1265 13739]]
Accuracy is  76.87387567459524
Time on model's work: 2.009 s


MemoryError: 

In [8]:
# TFFM sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)

In [9]:
# input_type='sparse' /// rank == 10
for order in [2, 3]:
    model = TFFMClassifier(
        order=order, 
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=512,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:33<00:00,  1.51epoch/s]


[order=2] accuracy: 0.7661736291558399
[[ 9576  5438]
 [ 1581 13423]]
              precision    recall  f1-score   support

         0.0       0.86      0.64      0.73     15014
         1.0       0.71      0.89      0.79     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.78      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018



100%|██████████| 50/50 [01:08<00:00,  1.31s/epoch]


[order=3] accuracy: 0.7636085015657272
[[ 9817  5197]
 [ 1899 13105]]
              precision    recall  f1-score   support

         0.0       0.84      0.65      0.73     15014
         1.0       0.72      0.87      0.79     15004

   micro avg       0.76      0.76      0.76     30018
   macro avg       0.78      0.76      0.76     30018
weighted avg       0.78      0.76      0.76     30018



In [10]:
# FtrlOptimizer
# input_type='sparse' /// rank == 10
model = TFFMClassifier(
    order=2, 
    rank=10, 
    optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()

100%|██████████| 50/50 [00:30<00:00,  1.67epoch/s]


[order=3] accuracy: 0.7656073022852955
[[ 9270  5744]
 [ 1292 13712]]
              precision    recall  f1-score   support

         0.0       0.88      0.62      0.72     15014
         1.0       0.70      0.91      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018



In [11]:
# sample_weight='balanced'
model = TFFMClassifier(
    order=2,
    sample_weight='balanced',
    rank=10, 
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()

100%|██████████| 50/50 [00:29<00:00,  1.69epoch/s]


[order=3] accuracy: 0.765240855486708
[[ 9499  5515]
 [ 1532 13472]]
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73     15014
         1.0       0.71      0.90      0.79     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018



In [12]:
# weight - optional
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=20, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 20/20 [00:12<00:00,  1.67epoch/s]


[order=3] accuracy: 0.7667399560263841
[[ 9483  5531]
 [ 1471 13533]]
              precision    recall  f1-score   support

         0.0       0.87      0.63      0.73     15014
         1.0       0.71      0.90      0.79     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018



100%|██████████| 20/20 [00:12<00:00,  1.65epoch/s]


[order=3] accuracy: 0.7668065827170365
[[ 9148  5866]
 [ 1134 13870]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72     15014
         1.0       0.70      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018



100%|██████████| 20/20 [00:12<00:00,  1.68epoch/s]


[order=3] accuracy: 0.7590112599107203
[[ 8781  6233]
 [ 1001 14003]]
              precision    recall  f1-score   support

         0.0       0.90      0.58      0.71     15014
         1.0       0.69      0.93      0.79     15004

   micro avg       0.76      0.76      0.76     30018
   macro avg       0.79      0.76      0.75     30018
weighted avg       0.79      0.76      0.75     30018



100%|██████████| 20/20 [00:12<00:00,  1.67epoch/s]


[order=3] accuracy: 0.7485508694783131
[[ 8341  6673]
 [  875 14129]]
              precision    recall  f1-score   support

         0.0       0.91      0.56      0.69     15014
         1.0       0.68      0.94      0.79     15004

   micro avg       0.75      0.75      0.75     30018
   macro avg       0.79      0.75      0.74     30018
weighted avg       0.79      0.75      0.74     30018



In [13]:
# weight - 2.0 best
model = TFFMClassifier(
    order=2,
    pos_class_weight=2.0,
    rank=10, 
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
    n_epochs=100, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()

100%|██████████| 100/100 [01:01<00:00,  1.68epoch/s]


[order=3] accuracy: 0.7647411553068159
[[ 9134  5880]
 [ 1182 13822]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72     15014
         1.0       0.70      0.92      0.80     15004

   micro avg       0.76      0.76      0.76     30018
   macro avg       0.79      0.76      0.76     30018
weighted avg       0.79      0.76      0.76     30018



In [14]:
# BEST OPTIMIZED TFFM
model = TFFMClassifier(
    order=2, 
    rank=10,
    pos_class_weight=2.0,
    optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
    n_epochs=100, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()

100%|██████████| 100/100 [01:00<00:00,  1.65epoch/s]


[order=3] accuracy: 0.7683056832567127
[[ 9243  5771]
 [ 1184 13820]]
              precision    recall  f1-score   support

         0.0       0.89      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018



In [16]:
# KERAS
model = Sequential()
model.add(Dense(128, input_dim=2626, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.4910358894124674, 0.7691052034929136]


Best results: 
1. Undersampling
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.89      0.62      0.73      6986
         1.0       0.71      0.92      0.80      7038

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.80      0.77      0.76     14024
weighted avg       0.80      0.77      0.76     14024

[[4304 2682]
 [ 537 6501]]
Accuracy is  77.04649172846548
Time on model's work: 736.97 s
============================
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.73      6986
         1.0       0.71      0.93      0.80      7038

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.80      0.77      0.76     14024
weighted avg       0.80      0.77      0.76     14024

[[4288 2698]
 [ 524 6514]]
Accuracy is  77.02509982886481
Time on model's work: 480.744 s
============================
TFFMClassifier
[order=2] accuracy: 0.7678265830005705
[[4326 2715]
 [ 541 6442]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.73      7041
         1.0       0.70      0.92      0.80      6983

   micro avg       0.77      0.77      0.77     14024
   macro avg       0.80      0.77      0.76     14024
weighted avg       0.80      0.77      0.76     14024

2. Oversampling
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018

[[ 9306  5708]
 [ 1217 13787]]
Accuracy is  76.93050836164967
Time on model's work: 1198.169 s
============================
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018

[[ 9299  5715]
 [ 1212 13792]]
Accuracy is  76.92384569258445
Time on model's work: 561.64 s
============================
CatBoostClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.63      0.73     15014
         1.0       0.71      0.91      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.79      0.77      0.76     30018
weighted avg       0.79      0.77      0.76     30018

[[ 9406  5608]
 [ 1304 13700]]
Accuracy is  76.97381571057366
Time on model's work: 318.586 s
============================
TFFMClassifier
[order=2] accuracy: 0.7683056832567127
[[ 9243  5771]
 [ 1184 13820]]
              precision    recall  f1-score   support

         0.0       0.89      0.62      0.73     15014
         1.0       0.71      0.92      0.80     15004

   micro avg       0.77      0.77      0.77     30018
   macro avg       0.80      0.77      0.76     30018
weighted avg       0.80      0.77      0.76     30018



### Undersampling strategy shows better results than oversampling by time, accuracy, precision, recall, f1-score

 ## 2. Imbalanced variant

In [10]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import balanced_accuracy_score
t = time()
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
bbc.fit(X_train, y_train) 
predictions = bbc.predict(X_test)
print('Balanced accuracy', balanced_accuracy_score(y_test, predictions)) 
print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
print ("Time on model's work:", round(time()-t, 3), "s")

Balanced accuracy 0.7042175138171767
Accuracy: 0.7163
[[6662 2612]
 [ 225  501]]
              precision    recall  f1-score   support

         0.0       0.97      0.72      0.82      9274
         1.0       0.16      0.69      0.26       726

   micro avg       0.72      0.72      0.72     10000
   macro avg       0.56      0.70      0.54     10000
weighted avg       0.91      0.72      0.78     10000

Time on model's work: 53.088 s


In [12]:
from imblearn.ensemble import BalancedRandomForestClassifier
t = time()
brf = BalancedBaggingClassifier(n_estimators=100, random_state=0)
brf.fit(X_train, y_train) 
predictions = brf.predict(X_test)
print('Balanced accuracy', balanced_accuracy_score(y_test, predictions)) 
print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
print ("Time on model's work:", round(time()-t, 3), "s")

Balanced accuracy 0.729588214570668
Accuracy: 0.6986
[[6430 2844]
 [ 170  556]]
              precision    recall  f1-score   support

         0.0       0.97      0.69      0.81      9274
         1.0       0.16      0.77      0.27       726

   micro avg       0.70      0.70      0.70     10000
   macro avg       0.57      0.73      0.54     10000
weighted avg       0.92      0.70      0.77     10000

Time on model's work: 496.02 s


In [13]:
from imblearn.ensemble import RUSBoostClassifier
t = time()
rusboost = RUSBoostClassifier(random_state=10)
rusboost.fit(X_train, y_train)  
predictions = rusboost.predict(X_test)
print('Balanced accuracy', balanced_accuracy_score(y_test, predictions)) 
print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
print ("Time on model's work:", round(time()-t, 3), "s")

Balanced accuracy 0.5633828630770227
Accuracy: 0.7824
[[7601 1673]
 [ 503  223]]
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87      9274
         1.0       0.12      0.31      0.17       726

   micro avg       0.78      0.78      0.78     10000
   macro avg       0.53      0.56      0.52     10000
weighted avg       0.88      0.78      0.82     10000

Time on model's work: 42.791 s


In [14]:
from imblearn.ensemble import EasyEnsembleClassifier
t = time()
eec = EasyEnsembleClassifier(random_state=11)
eec.fit(X_train, y_train) 
predictions = eec.predict(X_test)
print('Balanced accuracy', balanced_accuracy_score(y_test, predictions)) 
print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
print ("Time on model's work:", round(time()-t, 3), "s")

Balanced accuracy 0.7680685241657266
Accuracy: 0.6487
[[5828 3446]
 [  67  659]]
              precision    recall  f1-score   support

         0.0       0.99      0.63      0.77      9274
         1.0       0.16      0.91      0.27       726

   micro avg       0.65      0.65      0.65     10000
   macro avg       0.57      0.77      0.52     10000
weighted avg       0.93      0.65      0.73     10000

Time on model's work: 135.531 s


In [15]:
print(sorted(Counter(labels_list_array).items()))

[(0.0, 46269), (1.0, 3731)]


## RandomUnderSampler

In [16]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled).items()))

[(0.0, 3731), (1.0, 3731)]


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=35, test_size=0.2)

In [18]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.77      0.72      0.74       760
         1.0       0.73      0.77      0.75       733

   micro avg       0.75      0.75      0.75      1493
   macro avg       0.75      0.75      0.75      1493
weighted avg       0.75      0.75      0.75      1493

[[546 214]
 [165 568]]
Accuracy is  74.61486939048895
Time on model's work: 0.85 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.64      0.74       760
         1.0       0.71      0.91      0.79       733

   micro avg       0.77      0.77      0.77      1493
   macro avg       0.79      0.77      0.77      1493
weighted avg       0.79      0.77      0.77      1493

[[484 276]
 [ 69 664]]
Accuracy is  76.8921634293369
Time on model's work: 36.23 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.75      0.72      0.73       760
         1.0       0.72      0.74      0.73       733

   micro avg       0.73      0.73      0.73      1493
   macro avg       0.73      0.73      0.73      1493
weighted avg       0.73      0.73      0.73      1493

[[550 210]
 [188 545]]
Accuracy is  73.34226389819156
Time on model's work: 1.31 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.86      0.66      0.75       760
         1.0       0.72      0.89      0.79       733

   micro avg       0.77      0.77      0.77      1493
   macro avg       0.79      0.78      0.77      1493
weighted avg       0.79      0.77      0.77      1493

[[504 256]
 [ 82 651]]
Accuracy is  77.36101808439383
Time on model's work: 8.838 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.75      0.71      0.73       760
         1.0



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.74      0.67      0.70       760
         1.0       0.69      0.75      0.72       733

   micro avg       0.71      0.71      0.71      1493
   macro avg       0.71      0.71      0.71      1493
weighted avg       0.71      0.71      0.71      1493

[[511 249]
 [180 553]]
Accuracy is  71.26590756865372
Time on model's work: 116.734 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.64      0.74       760
         1.0       0.71      0.91      0.80       733

   micro avg       0.77      0.77      0.77      1493
   macro avg       0.80      0.77      0.77      1493
weighted avg       0.80      0.77      0.77      1493

[[483 277]
 [ 64 669]]
Accuracy is  77.16008037508372
Time on model's work: 40.978 s
0:	learn: 0.6801145	total: 111ms	remaining: 1m 50s
1:	learn: 0.6679328	total: 172ms	remaining: 1m 25s
2:	learn: 0.6566767	total: 231ms	remaining: 1

141:	learn: 0.4933810	total: 8.69s	remaining: 52.5s
142:	learn: 0.4932554	total: 8.74s	remaining: 52.4s
143:	learn: 0.4931723	total: 8.81s	remaining: 52.4s
144:	learn: 0.4931048	total: 8.88s	remaining: 52.3s
145:	learn: 0.4930253	total: 8.95s	remaining: 52.3s
146:	learn: 0.4929187	total: 9.02s	remaining: 52.3s
147:	learn: 0.4928799	total: 9.07s	remaining: 52.2s
148:	learn: 0.4928128	total: 9.13s	remaining: 52.2s
149:	learn: 0.4927473	total: 9.2s	remaining: 52.2s
150:	learn: 0.4926888	total: 9.27s	remaining: 52.1s
151:	learn: 0.4926768	total: 9.34s	remaining: 52.1s
152:	learn: 0.4926024	total: 9.4s	remaining: 52s
153:	learn: 0.4925505	total: 9.46s	remaining: 51.9s
154:	learn: 0.4924370	total: 9.52s	remaining: 51.9s
155:	learn: 0.4923651	total: 9.57s	remaining: 51.8s
156:	learn: 0.4923028	total: 9.63s	remaining: 51.7s
157:	learn: 0.4922485	total: 9.68s	remaining: 51.6s
158:	learn: 0.4921704	total: 9.74s	remaining: 51.5s
159:	learn: 0.4921220	total: 9.79s	remaining: 51.4s
160:	learn: 0.49

300:	learn: 0.4857744	total: 18.2s	remaining: 42.2s
301:	learn: 0.4857711	total: 18.2s	remaining: 42.1s
302:	learn: 0.4857171	total: 18.3s	remaining: 42s
303:	learn: 0.4856918	total: 18.3s	remaining: 41.9s
304:	learn: 0.4856772	total: 18.4s	remaining: 41.9s
305:	learn: 0.4856430	total: 18.4s	remaining: 41.8s
306:	learn: 0.4856031	total: 18.5s	remaining: 41.7s
307:	learn: 0.4855250	total: 18.5s	remaining: 41.6s
308:	learn: 0.4854745	total: 18.6s	remaining: 41.6s
309:	learn: 0.4854435	total: 18.6s	remaining: 41.5s
310:	learn: 0.4853953	total: 18.7s	remaining: 41.4s
311:	learn: 0.4853701	total: 18.8s	remaining: 41.4s
312:	learn: 0.4853476	total: 18.8s	remaining: 41.3s
313:	learn: 0.4853265	total: 18.9s	remaining: 41.3s
314:	learn: 0.4853066	total: 18.9s	remaining: 41.2s
315:	learn: 0.4852809	total: 19s	remaining: 41.1s
316:	learn: 0.4852734	total: 19.1s	remaining: 41s
317:	learn: 0.4852247	total: 19.1s	remaining: 41s
318:	learn: 0.4851663	total: 19.2s	remaining: 40.9s
319:	learn: 0.485129

459:	learn: 0.4774000	total: 27.5s	remaining: 32.3s
460:	learn: 0.4773601	total: 27.5s	remaining: 32.2s
461:	learn: 0.4771399	total: 27.6s	remaining: 32.2s
462:	learn: 0.4770914	total: 27.7s	remaining: 32.1s
463:	learn: 0.4770487	total: 27.8s	remaining: 32.1s
464:	learn: 0.4769844	total: 27.8s	remaining: 32s
465:	learn: 0.4769263	total: 27.9s	remaining: 31.9s
466:	learn: 0.4768528	total: 27.9s	remaining: 31.9s
467:	learn: 0.4768082	total: 28s	remaining: 31.8s
468:	learn: 0.4767891	total: 28s	remaining: 31.8s
469:	learn: 0.4767177	total: 28.1s	remaining: 31.7s
470:	learn: 0.4766124	total: 28.2s	remaining: 31.7s
471:	learn: 0.4765602	total: 28.2s	remaining: 31.6s
472:	learn: 0.4764654	total: 28.3s	remaining: 31.5s
473:	learn: 0.4763658	total: 28.4s	remaining: 31.5s
474:	learn: 0.4763216	total: 28.4s	remaining: 31.4s
475:	learn: 0.4762818	total: 28.5s	remaining: 31.3s
476:	learn: 0.4762082	total: 28.5s	remaining: 31.3s
477:	learn: 0.4761767	total: 28.6s	remaining: 31.2s
478:	learn: 0.4761

618:	learn: 0.4698167	total: 37s	remaining: 22.7s
619:	learn: 0.4697288	total: 37s	remaining: 22.7s
620:	learn: 0.4697010	total: 37.1s	remaining: 22.6s
621:	learn: 0.4696515	total: 37.1s	remaining: 22.6s
622:	learn: 0.4696050	total: 37.2s	remaining: 22.5s
623:	learn: 0.4695582	total: 37.2s	remaining: 22.4s
624:	learn: 0.4695124	total: 37.3s	remaining: 22.4s
625:	learn: 0.4694933	total: 37.4s	remaining: 22.3s
626:	learn: 0.4693546	total: 37.4s	remaining: 22.3s
627:	learn: 0.4693005	total: 37.5s	remaining: 22.2s
628:	learn: 0.4692683	total: 37.5s	remaining: 22.1s
629:	learn: 0.4691403	total: 37.6s	remaining: 22.1s
630:	learn: 0.4691148	total: 37.6s	remaining: 22s
631:	learn: 0.4690822	total: 37.7s	remaining: 22s
632:	learn: 0.4690409	total: 37.8s	remaining: 21.9s
633:	learn: 0.4690215	total: 37.8s	remaining: 21.8s
634:	learn: 0.4689965	total: 37.9s	remaining: 21.8s
635:	learn: 0.4689778	total: 37.9s	remaining: 21.7s
636:	learn: 0.4688750	total: 38s	remaining: 21.7s
637:	learn: 0.4688273	

777:	learn: 0.4644164	total: 47.3s	remaining: 13.5s
778:	learn: 0.4643623	total: 47.3s	remaining: 13.4s
779:	learn: 0.4643512	total: 47.4s	remaining: 13.4s
780:	learn: 0.4643165	total: 47.4s	remaining: 13.3s
781:	learn: 0.4642967	total: 47.5s	remaining: 13.2s
782:	learn: 0.4642919	total: 47.6s	remaining: 13.2s
783:	learn: 0.4642708	total: 47.6s	remaining: 13.1s
784:	learn: 0.4642480	total: 47.7s	remaining: 13.1s
785:	learn: 0.4642121	total: 47.8s	remaining: 13s
786:	learn: 0.4641990	total: 47.8s	remaining: 12.9s
787:	learn: 0.4641622	total: 47.9s	remaining: 12.9s
788:	learn: 0.4641289	total: 47.9s	remaining: 12.8s
789:	learn: 0.4641180	total: 48s	remaining: 12.8s
790:	learn: 0.4641005	total: 48s	remaining: 12.7s
791:	learn: 0.4640799	total: 48.1s	remaining: 12.6s
792:	learn: 0.4640395	total: 48.1s	remaining: 12.6s
793:	learn: 0.4640302	total: 48.2s	remaining: 12.5s
794:	learn: 0.4640090	total: 48.3s	remaining: 12.4s
795:	learn: 0.4639912	total: 48.3s	remaining: 12.4s
796:	learn: 0.4639

936:	learn: 0.4601396	total: 57.2s	remaining: 3.85s
937:	learn: 0.4601085	total: 57.3s	remaining: 3.79s
938:	learn: 0.4600601	total: 57.4s	remaining: 3.73s
939:	learn: 0.4600427	total: 57.4s	remaining: 3.66s
940:	learn: 0.4600327	total: 57.5s	remaining: 3.6s
941:	learn: 0.4599979	total: 57.5s	remaining: 3.54s
942:	learn: 0.4599841	total: 57.6s	remaining: 3.48s
943:	learn: 0.4599754	total: 57.6s	remaining: 3.42s
944:	learn: 0.4599459	total: 57.7s	remaining: 3.36s
945:	learn: 0.4599291	total: 57.8s	remaining: 3.3s
946:	learn: 0.4598938	total: 57.8s	remaining: 3.23s
947:	learn: 0.4598587	total: 57.9s	remaining: 3.17s
948:	learn: 0.4597615	total: 57.9s	remaining: 3.11s
949:	learn: 0.4597471	total: 58s	remaining: 3.05s
950:	learn: 0.4597312	total: 58s	remaining: 2.99s
951:	learn: 0.4597102	total: 58.1s	remaining: 2.93s
952:	learn: 0.4596861	total: 58.2s	remaining: 2.87s
953:	learn: 0.4596746	total: 58.2s	remaining: 2.81s
954:	learn: 0.4596570	total: 58.3s	remaining: 2.75s
955:	learn: 0.4596



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.86      0.66      0.75       760
         1.0       0.72      0.89      0.79       733

   micro avg       0.77      0.77      0.77      1493
   macro avg       0.79      0.78      0.77      1493
weighted avg       0.79      0.77      0.77      1493

[[504 256]
 [ 81 652]]
Accuracy is  77.42799732083054
Time on model's work: 0.144 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.71      0.65      0.68       760
         1.0       0.67      0.73      0.70       733

   micro avg       0.69      0.69      0.69      1493
   macro avg       0.69      0.69      0.69      1493
weighted avg       0.69      0.69      0.69      1493

[[492 268]
 [197 536]]
Accuracy is  68.85465505693234
Time on model's work: 0.182 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.79      0.68      0.73       760
         1.0       0.71      0.81      0.76       733

   micro avg       0.75      0.75      0.75      1493
   macro avg       0.75      0.75      0.74      1493
weighted avg       0.75      0.75      0.74      1493

[[519 241]
 [139 594]]
Accuracy is  74.54789015405224
Time on model's work: 18.646 s
TOTAL TIME SPENT:  296.608 s


In [22]:
# TFFM sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# BEST OPTIMIZED TFFM
model = TFFMClassifier(
    order=2, 
    rank=10,
    pos_class_weight=2.0,
    optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
    n_epochs=100, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    #log_dir='./tmp/logs',
    #verbose=1,
    seed=42
)
model.fit(X_train_sparse, y_train, show_progress=True)
predictions = model.predict(X_test_sparse)
print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
# this will close tf.Session and free resources
print(confusion_matrix(y_test,predictions)) 
print(classification_report(y_test, predictions))
model.destroy()

100%|██████████| 100/100 [00:05<00:00, 19.61epoch/s]


Accuracy: 0.7387809778968519
[[456 304]
 [ 86 647]]
              precision    recall  f1-score   support

         0.0       0.84      0.60      0.70       760
         1.0       0.68      0.88      0.77       733

   micro avg       0.74      0.74      0.74      1493
   macro avg       0.76      0.74      0.73      1493
weighted avg       0.76      0.74      0.73      1493



## NearMiss (version = 1)

In [4]:
from imblearn.under_sampling import NearMiss
nm1 = NearMiss(version=1)
X_resampled_nm1, y_resampled1 = nm1.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled1).items()))

[(0.0, 3726), (1.0, 3726)]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_nm1, y_resampled1, random_state=35, test_size=0.2)

In [6]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88       763
         1.0       0.92      0.79      0.85       728

   micro avg       0.87      0.87      0.87      1491
   macro avg       0.87      0.86      0.87      1491
weighted avg       0.87      0.87      0.87      1491

[[714  49]
 [150 578]]
Accuracy is  86.65325285043595
Time on model's work: 0.767 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.87      0.87      0.87       763
         1.0       0.86      0.86      0.86       728

   micro avg       0.86      0.86      0.86      1491
   macro avg       0.86      0.86      0.86      1491
weighted avg       0.86      0.86      0.86      1491

[[661 102]
 [100 628]]
Accuracy is  86.45204560697518
Time on model's work: 31.618 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.82      0.96      0.88       763
         1.0       0.94      0.78      0.85       728

   micro avg       0.87      0.87      0.87      1491
   macro avg       0.88      0.87      0.87      1491
weighted avg       0.88      0.87      0.87      1491

[[730  33]
 [163 565]]
Accuracy is  86.85446009389672
Time on model's work: 1.506 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.86      0.86      0.86       763
         1.0       0.85      0.85      0.85       728

   micro avg       0.86      0.86      0.86      1491
   macro avg       0.86      0.86      0.86      1491
weighted avg       0.86      0.86      0.86      1491

[[656 107]
 [108 620]]
Accuracy is  85.58014755197854
Time on model's work: 9.615 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.82      0.94      0.87       763
         1.



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.83      0.92      0.88       763
         1.0       0.91      0.81      0.85       728

   micro avg       0.87      0.87      0.87      1491
   macro avg       0.87      0.86      0.87      1491
weighted avg       0.87      0.87      0.87      1491

[[704  59]
 [141 587]]
Accuracy is  86.58618376928237
Time on model's work: 121.631 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.89      0.85      0.87       763
         1.0       0.85      0.89      0.87       728

   micro avg       0.87      0.87      0.87      1491
   macro avg       0.87      0.87      0.87      1491
weighted avg       0.87      0.87      0.87      1491

[[649 114]
 [ 82 646]]
Accuracy is  86.85446009389672
Time on model's work: 43.209 s
0:	learn: 0.6791880	total: 102ms	remaining: 1m 41s
1:	learn: 0.6662516	total: 181ms	remaining: 1m 30s
2:	learn: 0.6509329	total: 253ms	remaining: 1

141:	learn: 0.3513225	total: 9.19s	remaining: 55.5s
142:	learn: 0.3510411	total: 9.24s	remaining: 55.4s
143:	learn: 0.3508829	total: 9.3s	remaining: 55.3s
144:	learn: 0.3505285	total: 9.35s	remaining: 55.1s
145:	learn: 0.3502733	total: 9.41s	remaining: 55s
146:	learn: 0.3501829	total: 9.46s	remaining: 54.9s
147:	learn: 0.3496840	total: 9.52s	remaining: 54.8s
148:	learn: 0.3494905	total: 9.57s	remaining: 54.7s
149:	learn: 0.3488890	total: 9.63s	remaining: 54.6s
150:	learn: 0.3485076	total: 9.69s	remaining: 54.5s
151:	learn: 0.3483814	total: 9.74s	remaining: 54.4s
152:	learn: 0.3483428	total: 9.8s	remaining: 54.2s
153:	learn: 0.3482157	total: 9.85s	remaining: 54.1s
154:	learn: 0.3476590	total: 9.91s	remaining: 54s
155:	learn: 0.3474480	total: 9.97s	remaining: 53.9s
156:	learn: 0.3472572	total: 10s	remaining: 53.9s
157:	learn: 0.3471438	total: 10.1s	remaining: 53.9s
158:	learn: 0.3469168	total: 10.2s	remaining: 53.8s
159:	learn: 0.3465948	total: 10.2s	remaining: 53.7s
160:	learn: 0.346365

300:	learn: 0.3227087	total: 18.5s	remaining: 43s
301:	learn: 0.3226231	total: 18.6s	remaining: 43s
302:	learn: 0.3224588	total: 18.7s	remaining: 42.9s
303:	learn: 0.3221291	total: 18.7s	remaining: 42.9s
304:	learn: 0.3220876	total: 18.8s	remaining: 42.8s
305:	learn: 0.3220567	total: 18.8s	remaining: 42.7s
306:	learn: 0.3220140	total: 18.9s	remaining: 42.6s
307:	learn: 0.3218906	total: 18.9s	remaining: 42.5s
308:	learn: 0.3216571	total: 19s	remaining: 42.5s
309:	learn: 0.3214431	total: 19s	remaining: 42.4s
310:	learn: 0.3212921	total: 19.1s	remaining: 42.3s
311:	learn: 0.3211574	total: 19.2s	remaining: 42.3s
312:	learn: 0.3210193	total: 19.2s	remaining: 42.2s
313:	learn: 0.3208479	total: 19.3s	remaining: 42.1s
314:	learn: 0.3207815	total: 19.3s	remaining: 42s
315:	learn: 0.3205829	total: 19.4s	remaining: 42s
316:	learn: 0.3205239	total: 19.4s	remaining: 41.9s
317:	learn: 0.3203518	total: 19.5s	remaining: 41.8s
318:	learn: 0.3203219	total: 19.6s	remaining: 41.8s
319:	learn: 0.3202362	to

459:	learn: 0.3012686	total: 28.6s	remaining: 33.6s
460:	learn: 0.3011333	total: 28.7s	remaining: 33.5s
461:	learn: 0.3010775	total: 28.7s	remaining: 33.5s
462:	learn: 0.3009366	total: 28.8s	remaining: 33.4s
463:	learn: 0.3008546	total: 28.9s	remaining: 33.3s
464:	learn: 0.3007408	total: 28.9s	remaining: 33.3s
465:	learn: 0.3006119	total: 29s	remaining: 33.2s
466:	learn: 0.3005690	total: 29.1s	remaining: 33.2s
467:	learn: 0.3004567	total: 29.1s	remaining: 33.1s
468:	learn: 0.3003810	total: 29.2s	remaining: 33s
469:	learn: 0.3002935	total: 29.3s	remaining: 33s
470:	learn: 0.3002239	total: 29.3s	remaining: 32.9s
471:	learn: 0.3001646	total: 29.4s	remaining: 32.9s
472:	learn: 0.3000687	total: 29.4s	remaining: 32.8s
473:	learn: 0.2999766	total: 29.5s	remaining: 32.7s
474:	learn: 0.2999206	total: 29.6s	remaining: 32.7s
475:	learn: 0.2998358	total: 29.6s	remaining: 32.6s
476:	learn: 0.2997554	total: 29.7s	remaining: 32.5s
477:	learn: 0.2997197	total: 29.7s	remaining: 32.5s
478:	learn: 0.2996

618:	learn: 0.2903434	total: 39.1s	remaining: 24.1s
619:	learn: 0.2903233	total: 39.2s	remaining: 24s
620:	learn: 0.2903010	total: 39.2s	remaining: 23.9s
621:	learn: 0.2902733	total: 39.3s	remaining: 23.9s
622:	learn: 0.2902269	total: 39.3s	remaining: 23.8s
623:	learn: 0.2902147	total: 39.4s	remaining: 23.7s
624:	learn: 0.2901901	total: 39.4s	remaining: 23.7s
625:	learn: 0.2901101	total: 39.5s	remaining: 23.6s
626:	learn: 0.2900368	total: 39.6s	remaining: 23.5s
627:	learn: 0.2899893	total: 39.6s	remaining: 23.5s
628:	learn: 0.2899563	total: 39.7s	remaining: 23.4s
629:	learn: 0.2899151	total: 39.8s	remaining: 23.4s
630:	learn: 0.2898911	total: 39.8s	remaining: 23.3s
631:	learn: 0.2898112	total: 39.9s	remaining: 23.2s
632:	learn: 0.2897792	total: 40s	remaining: 23.2s
633:	learn: 0.2897444	total: 40s	remaining: 23.1s
634:	learn: 0.2897254	total: 40.1s	remaining: 23s
635:	learn: 0.2897060	total: 40.1s	remaining: 23s
636:	learn: 0.2895935	total: 40.2s	remaining: 22.9s
637:	learn: 0.2894498	

777:	learn: 0.2845560	total: 48.9s	remaining: 14s
778:	learn: 0.2845412	total: 49s	remaining: 13.9s
779:	learn: 0.2845310	total: 49s	remaining: 13.8s
780:	learn: 0.2844947	total: 49.1s	remaining: 13.8s
781:	learn: 0.2844488	total: 49.1s	remaining: 13.7s
782:	learn: 0.2844300	total: 49.2s	remaining: 13.6s
783:	learn: 0.2844216	total: 49.3s	remaining: 13.6s
784:	learn: 0.2843870	total: 49.3s	remaining: 13.5s
785:	learn: 0.2843788	total: 49.4s	remaining: 13.4s
786:	learn: 0.2843653	total: 49.4s	remaining: 13.4s
787:	learn: 0.2843538	total: 49.5s	remaining: 13.3s
788:	learn: 0.2843281	total: 49.5s	remaining: 13.2s
789:	learn: 0.2843127	total: 49.6s	remaining: 13.2s
790:	learn: 0.2843031	total: 49.7s	remaining: 13.1s
791:	learn: 0.2842671	total: 49.7s	remaining: 13.1s
792:	learn: 0.2842120	total: 49.8s	remaining: 13s
793:	learn: 0.2842035	total: 49.8s	remaining: 12.9s
794:	learn: 0.2841863	total: 49.9s	remaining: 12.9s
795:	learn: 0.2841780	total: 50s	remaining: 12.8s
796:	learn: 0.2841672	

936:	learn: 0.2805619	total: 58.4s	remaining: 3.93s
937:	learn: 0.2805608	total: 58.5s	remaining: 3.86s
938:	learn: 0.2805528	total: 58.5s	remaining: 3.8s
939:	learn: 0.2805505	total: 58.6s	remaining: 3.74s
940:	learn: 0.2805488	total: 58.7s	remaining: 3.68s
941:	learn: 0.2805435	total: 58.7s	remaining: 3.62s
942:	learn: 0.2804272	total: 58.8s	remaining: 3.55s
943:	learn: 0.2804168	total: 58.9s	remaining: 3.49s
944:	learn: 0.2804098	total: 58.9s	remaining: 3.43s
945:	learn: 0.2804013	total: 59s	remaining: 3.37s
946:	learn: 0.2803941	total: 59.1s	remaining: 3.31s
947:	learn: 0.2803897	total: 59.1s	remaining: 3.24s
948:	learn: 0.2803713	total: 59.2s	remaining: 3.18s
949:	learn: 0.2803705	total: 59.3s	remaining: 3.12s
950:	learn: 0.2803648	total: 59.3s	remaining: 3.06s
951:	learn: 0.2803581	total: 59.4s	remaining: 2.99s
952:	learn: 0.2803505	total: 59.5s	remaining: 2.93s
953:	learn: 0.2803242	total: 59.6s	remaining: 2.87s
954:	learn: 0.2802738	total: 59.6s	remaining: 2.81s
955:	learn: 0.2



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87       763
         1.0       0.88      0.85      0.86       728

   micro avg       0.87      0.87      0.87      1491
   macro avg       0.87      0.87      0.87      1491
weighted avg       0.87      0.87      0.87      1491

[[676  87]
 [111 617]]
Accuracy is  86.72032193158954
Time on model's work: 0.131 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.58      0.70       763
         1.0       0.68      0.92      0.78       728

   micro avg       0.75      0.75      0.75      1491
   macro avg       0.78      0.75      0.74      1491
weighted avg       0.78      0.75      0.74      1491

[[442 321]
 [ 59 669]]
Accuracy is  74.51374916163648
Time on model's work: 0.159 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.84      0.91      0.88       763
         1.0       0.90      0.82      0.86       728

   micro avg       0.87      0.87      0.87      1491
   macro avg       0.87      0.87      0.87      1491
weighted avg       0.87      0.87      0.87      1491

[[695  68]
 [130 598]]
Accuracy is  86.72032193158954
Time on model's work: 19.6 s
TOTAL TIME SPENT:  298.635 s


In [8]:
# TFFM sparse - works worse with sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# weight - optional / AdamOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=20, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 20/20 [00:01<00:00, 17.37epoch/s]


accuracy: 0.8443997317236754
[[649 114]
 [118 610]]
              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       763
         1.0       0.84      0.84      0.84       728

   micro avg       0.84      0.84      0.84      1491
   macro avg       0.84      0.84      0.84      1491
weighted avg       0.84      0.84      0.84      1491



100%|██████████| 20/20 [00:01<00:00, 16.57epoch/s]


accuracy: 0.7665995975855131
[[458 305]
 [ 43 685]]
              precision    recall  f1-score   support

         0.0       0.91      0.60      0.72       763
         1.0       0.69      0.94      0.80       728

   micro avg       0.77      0.77      0.77      1491
   macro avg       0.80      0.77      0.76      1491
weighted avg       0.81      0.77      0.76      1491



100%|██████████| 20/20 [00:01<00:00, 16.73epoch/s]


accuracy: 0.6955063715627096
[[323 440]
 [ 14 714]]
              precision    recall  f1-score   support

         0.0       0.96      0.42      0.59       763
         1.0       0.62      0.98      0.76       728

   micro avg       0.70      0.70      0.70      1491
   macro avg       0.79      0.70      0.67      1491
weighted avg       0.79      0.70      0.67      1491



100%|██████████| 20/20 [00:01<00:00, 15.72epoch/s]


accuracy: 0.7015425888665325
[[330 433]
 [ 12 716]]
              precision    recall  f1-score   support

         0.0       0.96      0.43      0.60       763
         1.0       0.62      0.98      0.76       728

   micro avg       0.70      0.70      0.70      1491
   macro avg       0.79      0.71      0.68      1491
weighted avg       0.80      0.70      0.68      1491



In [9]:
# weight - optional / FtrlOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
        n_epochs=20, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 20/20 [00:01<00:00, 16.32epoch/s]


accuracy: 0.7002012072434608
[[384 379]
 [ 68 660]]
              precision    recall  f1-score   support

         0.0       0.85      0.50      0.63       763
         1.0       0.64      0.91      0.75       728

   micro avg       0.70      0.70      0.70      1491
   macro avg       0.74      0.70      0.69      1491
weighted avg       0.74      0.70      0.69      1491



100%|██████████| 20/20 [00:01<00:00, 16.28epoch/s]


accuracy: 0.7136150234741784
[[366 397]
 [ 30 698]]
              precision    recall  f1-score   support

         0.0       0.92      0.48      0.63       763
         1.0       0.64      0.96      0.77       728

   micro avg       0.71      0.71      0.71      1491
   macro avg       0.78      0.72      0.70      1491
weighted avg       0.78      0.71      0.70      1491



100%|██████████| 20/20 [00:01<00:00, 16.19epoch/s]


accuracy: 0.6747149564050973
[[292 471]
 [ 14 714]]
              precision    recall  f1-score   support

         0.0       0.95      0.38      0.55       763
         1.0       0.60      0.98      0.75       728

   micro avg       0.67      0.67      0.67      1491
   macro avg       0.78      0.68      0.65      1491
weighted avg       0.78      0.67      0.64      1491



100%|██████████| 20/20 [00:01<00:00, 16.94epoch/s]


accuracy: 0.6753856472166331
[[292 471]
 [ 13 715]]
              precision    recall  f1-score   support

         0.0       0.96      0.38      0.55       763
         1.0       0.60      0.98      0.75       728

   micro avg       0.68      0.68      0.68      1491
   macro avg       0.78      0.68      0.65      1491
weighted avg       0.78      0.68      0.64      1491



In [14]:
# KERAS
X_train.shape

(5961, 2226)

In [15]:
model = Sequential()
model.add(Dense(128, input_dim=2226, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.3419680971774697, 0.8524480174245009]


## NearMiss (version = 2)

In [16]:
nm2 = NearMiss(version=2)
X_resampled_nm2, y_resampled2 = nm2.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled2).items()))

[(0.0, 3726), (1.0, 3726)]


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_nm2, y_resampled2, random_state=35, test_size=0.2)

In [18]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.80      0.82      0.81       763
         1.0       0.81      0.78      0.79       728

   micro avg       0.80      0.80      0.80      1491
   macro avg       0.80      0.80      0.80      1491
weighted avg       0.80      0.80      0.80      1491

[[625 138]
 [157 571]]
Accuracy is  80.21462105969148
Time on model's work: 0.722 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.79      0.88      0.83       763
         1.0       0.86      0.75      0.80       728

   micro avg       0.82      0.82      0.82      1491
   macro avg       0.82      0.82      0.82      1491
weighted avg       0.82      0.82      0.82      1491

[[671  92]
 [181 547]]
Accuracy is  81.69014084507043
Time on model's work: 36.396 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.80      0.84      0.82       763
         1.0       0.83      0.78      0.80       728

   micro avg       0.81      0.81      0.81      1491
   macro avg       0.81      0.81      0.81      1491
weighted avg       0.81      0.81      0.81      1491

[[644 119]
 [160 568]]
Accuracy is  81.28772635814889
Time on model's work: 1.25 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.86      0.72      0.79       763
         1.0       0.75      0.88      0.81       728

   micro avg       0.80      0.80      0.80      1491
   macro avg       0.80      0.80      0.80      1491
weighted avg       0.81      0.80      0.80      1491

[[552 211]
 [ 91 637]]
Accuracy is  79.74513749161638
Time on model's work: 9.626 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.79      0.87      0.83       763
         1.0



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.81      0.80      0.81       763
         1.0       0.80      0.80      0.80       728

   micro avg       0.80      0.80      0.80      1491
   macro avg       0.80      0.80      0.80      1491
weighted avg       0.80      0.80      0.80      1491

[[613 150]
 [143 585]]
Accuracy is  80.34875922199866
Time on model's work: 129.26 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.81      0.89      0.85       763
         1.0       0.87      0.77      0.82       728

   micro avg       0.83      0.83      0.83      1491
   macro avg       0.84      0.83      0.83      1491
weighted avg       0.84      0.83      0.83      1491

[[680  83]
 [164 564]]
Accuracy is  83.43393695506371
Time on model's work: 42.938 s
0:	learn: 0.6516410	total: 66.6ms	remaining: 1m 6s
1:	learn: 0.6170341	total: 123ms	remaining: 1m 1s
2:	learn: 0.5852969	total: 192ms	remaining: 1m 

141:	learn: 0.3154715	total: 8.89s	remaining: 53.7s
142:	learn: 0.3154249	total: 8.96s	remaining: 53.7s
143:	learn: 0.3153435	total: 9.03s	remaining: 53.7s
144:	learn: 0.3152882	total: 9.11s	remaining: 53.7s
145:	learn: 0.3152407	total: 9.16s	remaining: 53.6s
146:	learn: 0.3151946	total: 9.22s	remaining: 53.5s
147:	learn: 0.3151699	total: 9.27s	remaining: 53.4s
148:	learn: 0.3150716	total: 9.34s	remaining: 53.4s
149:	learn: 0.3150413	total: 9.42s	remaining: 53.4s
150:	learn: 0.3150055	total: 9.51s	remaining: 53.5s
151:	learn: 0.3149511	total: 9.57s	remaining: 53.4s
152:	learn: 0.3148590	total: 9.62s	remaining: 53.3s
153:	learn: 0.3148128	total: 9.7s	remaining: 53.3s
154:	learn: 0.3147167	total: 9.79s	remaining: 53.4s
155:	learn: 0.3146521	total: 9.85s	remaining: 53.3s
156:	learn: 0.3145557	total: 9.92s	remaining: 53.3s
157:	learn: 0.3144589	total: 9.98s	remaining: 53.2s
158:	learn: 0.3143890	total: 10s	remaining: 53.1s
159:	learn: 0.3143357	total: 10.1s	remaining: 53s
160:	learn: 0.314

300:	learn: 0.3080180	total: 18.5s	remaining: 43s
301:	learn: 0.3079932	total: 18.6s	remaining: 42.9s
302:	learn: 0.3079661	total: 18.6s	remaining: 42.9s
303:	learn: 0.3079475	total: 18.7s	remaining: 42.8s
304:	learn: 0.3079127	total: 18.8s	remaining: 42.7s
305:	learn: 0.3079079	total: 18.8s	remaining: 42.7s
306:	learn: 0.3078988	total: 18.9s	remaining: 42.6s
307:	learn: 0.3078522	total: 18.9s	remaining: 42.5s
308:	learn: 0.3078481	total: 19s	remaining: 42.5s
309:	learn: 0.3077678	total: 19.1s	remaining: 42.4s
310:	learn: 0.3076946	total: 19.1s	remaining: 42.3s
311:	learn: 0.3076809	total: 19.2s	remaining: 42.3s
312:	learn: 0.3076618	total: 19.2s	remaining: 42.2s
313:	learn: 0.3076567	total: 19.3s	remaining: 42.1s
314:	learn: 0.3076552	total: 19.3s	remaining: 42s
315:	learn: 0.3075058	total: 19.4s	remaining: 42s
316:	learn: 0.3074937	total: 19.5s	remaining: 41.9s
317:	learn: 0.3074489	total: 19.5s	remaining: 41.9s
318:	learn: 0.3074092	total: 19.6s	remaining: 41.8s
319:	learn: 0.307382

459:	learn: 0.3001160	total: 29s	remaining: 34s
460:	learn: 0.3000794	total: 29s	remaining: 33.9s
461:	learn: 0.3000353	total: 29.1s	remaining: 33.9s
462:	learn: 0.3000006	total: 29.2s	remaining: 33.8s
463:	learn: 0.2999682	total: 29.2s	remaining: 33.8s
464:	learn: 0.2999544	total: 29.3s	remaining: 33.7s
465:	learn: 0.2999377	total: 29.4s	remaining: 33.6s
466:	learn: 0.2999219	total: 29.4s	remaining: 33.6s
467:	learn: 0.2998941	total: 29.5s	remaining: 33.5s
468:	learn: 0.2998799	total: 29.6s	remaining: 33.5s
469:	learn: 0.2996671	total: 29.6s	remaining: 33.4s
470:	learn: 0.2996306	total: 29.7s	remaining: 33.4s
471:	learn: 0.2996116	total: 29.8s	remaining: 33.3s
472:	learn: 0.2996024	total: 29.8s	remaining: 33.3s
473:	learn: 0.2995440	total: 29.9s	remaining: 33.2s
474:	learn: 0.2995152	total: 30s	remaining: 33.1s
475:	learn: 0.2994241	total: 30s	remaining: 33.1s
476:	learn: 0.2993420	total: 30.1s	remaining: 33s
477:	learn: 0.2992674	total: 30.2s	remaining: 32.9s
478:	learn: 0.2992287	to

618:	learn: 0.2891100	total: 39.5s	remaining: 24.3s
619:	learn: 0.2890808	total: 39.6s	remaining: 24.2s
620:	learn: 0.2890146	total: 39.6s	remaining: 24.2s
621:	learn: 0.2890059	total: 39.7s	remaining: 24.1s
622:	learn: 0.2889708	total: 39.7s	remaining: 24.1s
623:	learn: 0.2889275	total: 39.8s	remaining: 24s
624:	learn: 0.2888998	total: 39.9s	remaining: 23.9s
625:	learn: 0.2887982	total: 39.9s	remaining: 23.9s
626:	learn: 0.2887866	total: 40s	remaining: 23.8s
627:	learn: 0.2887689	total: 40.1s	remaining: 23.7s
628:	learn: 0.2887570	total: 40.1s	remaining: 23.7s
629:	learn: 0.2887419	total: 40.2s	remaining: 23.6s
630:	learn: 0.2887082	total: 40.2s	remaining: 23.5s
631:	learn: 0.2886877	total: 40.3s	remaining: 23.5s
632:	learn: 0.2886746	total: 40.4s	remaining: 23.4s
633:	learn: 0.2886411	total: 40.4s	remaining: 23.3s
634:	learn: 0.2886276	total: 40.5s	remaining: 23.3s
635:	learn: 0.2885650	total: 40.6s	remaining: 23.2s
636:	learn: 0.2885308	total: 40.6s	remaining: 23.2s
637:	learn: 0.28

777:	learn: 0.2836026	total: 50.2s	remaining: 14.3s
778:	learn: 0.2835619	total: 50.3s	remaining: 14.3s
779:	learn: 0.2835113	total: 50.3s	remaining: 14.2s
780:	learn: 0.2834878	total: 50.4s	remaining: 14.1s
781:	learn: 0.2834704	total: 50.5s	remaining: 14.1s
782:	learn: 0.2834414	total: 50.5s	remaining: 14s
783:	learn: 0.2834358	total: 50.6s	remaining: 13.9s
784:	learn: 0.2834259	total: 50.6s	remaining: 13.9s
785:	learn: 0.2834204	total: 50.7s	remaining: 13.8s
786:	learn: 0.2833956	total: 50.8s	remaining: 13.7s
787:	learn: 0.2833848	total: 50.8s	remaining: 13.7s
788:	learn: 0.2833739	total: 50.9s	remaining: 13.6s
789:	learn: 0.2832787	total: 50.9s	remaining: 13.5s
790:	learn: 0.2832711	total: 51s	remaining: 13.5s
791:	learn: 0.2832492	total: 51.1s	remaining: 13.4s
792:	learn: 0.2832363	total: 51.1s	remaining: 13.3s
793:	learn: 0.2832182	total: 51.2s	remaining: 13.3s
794:	learn: 0.2832103	total: 51.2s	remaining: 13.2s
795:	learn: 0.2831769	total: 51.3s	remaining: 13.1s
796:	learn: 0.28

937:	learn: 0.2793619	total: 1m	remaining: 4.03s
938:	learn: 0.2792465	total: 1m 1s	remaining: 3.96s
939:	learn: 0.2792380	total: 1m 1s	remaining: 3.9s
940:	learn: 0.2792269	total: 1m 1s	remaining: 3.83s
941:	learn: 0.2792213	total: 1m 1s	remaining: 3.77s
942:	learn: 0.2792029	total: 1m 1s	remaining: 3.71s
943:	learn: 0.2791362	total: 1m 1s	remaining: 3.64s
944:	learn: 0.2791292	total: 1m 1s	remaining: 3.58s
945:	learn: 0.2791112	total: 1m 1s	remaining: 3.51s
946:	learn: 0.2791054	total: 1m 1s	remaining: 3.45s
947:	learn: 0.2790948	total: 1m 1s	remaining: 3.38s
948:	learn: 0.2790210	total: 1m 1s	remaining: 3.32s
949:	learn: 0.2790054	total: 1m 1s	remaining: 3.25s
950:	learn: 0.2789956	total: 1m 1s	remaining: 3.19s
951:	learn: 0.2789875	total: 1m 1s	remaining: 3.12s
952:	learn: 0.2789685	total: 1m 2s	remaining: 3.06s
953:	learn: 0.2789524	total: 1m 2s	remaining: 2.99s
954:	learn: 0.2789328	total: 1m 2s	remaining: 2.93s
955:	learn: 0.2789261	total: 1m 2s	remaining: 2.86s
956:	learn: 0.27



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.84      0.80      0.82       763
         1.0       0.80      0.84      0.82       728

   micro avg       0.82      0.82      0.82      1491
   macro avg       0.82      0.82      0.82      1491
weighted avg       0.82      0.82      0.82      1491

[[608 155]
 [117 611]]
Accuracy is  81.75720992622401
Time on model's work: 0.127 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.92      0.58      0.72       763
         1.0       0.69      0.95      0.80       728

   micro avg       0.76      0.76      0.76      1491
   macro avg       0.80      0.77      0.76      1491
weighted avg       0.81      0.76      0.75      1491

[[446 317]
 [ 38 690]]
Accuracy is  76.19047619047619
Time on model's work: 0.172 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.83      0.85      0.84       763
         1.0       0.83      0.81      0.82       728

   micro avg       0.83      0.83      0.83      1491
   macro avg       0.83      0.83      0.83      1491
weighted avg       0.83      0.83      0.83      1491

[[645 118]
 [136 592]]
Accuracy is  82.9644533869886
Time on model's work: 20.84 s
TOTAL TIME SPENT:  314.363 s


In [22]:
# TFFM sparse - works worse with sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# weight - optional / AdamOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:02<00:00, 18.27epoch/s]


accuracy: 0.8356807511737089
[[658 105]
 [140 588]]
              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       763
         1.0       0.85      0.81      0.83       728

   micro avg       0.84      0.84      0.84      1491
   macro avg       0.84      0.84      0.84      1491
weighted avg       0.84      0.84      0.84      1491



100%|██████████| 50/50 [00:02<00:00, 17.32epoch/s]


accuracy: 0.7994634473507712
[[535 228]
 [ 71 657]]
              precision    recall  f1-score   support

         0.0       0.88      0.70      0.78       763
         1.0       0.74      0.90      0.81       728

   micro avg       0.80      0.80      0.80      1491
   macro avg       0.81      0.80      0.80      1491
weighted avg       0.81      0.80      0.80      1491



100%|██████████| 50/50 [00:02<00:00, 19.44epoch/s]


accuracy: 0.784037558685446
[[491 272]
 [ 50 678]]
              precision    recall  f1-score   support

         0.0       0.91      0.64      0.75       763
         1.0       0.71      0.93      0.81       728

   micro avg       0.78      0.78      0.78      1491
   macro avg       0.81      0.79      0.78      1491
weighted avg       0.81      0.78      0.78      1491



100%|██████████| 50/50 [00:02<00:00, 20.03epoch/s]


accuracy: 0.7739771965124078
[[460 303]
 [ 34 694]]
              precision    recall  f1-score   support

         0.0       0.93      0.60      0.73       763
         1.0       0.70      0.95      0.80       728

   micro avg       0.77      0.77      0.77      1491
   macro avg       0.81      0.78      0.77      1491
weighted avg       0.82      0.77      0.77      1491



In [23]:
# weight - optional / FtrlOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:02<00:00, 17.57epoch/s]


accuracy: 0.7323943661971831
[[456 307]
 [ 92 636]]
              precision    recall  f1-score   support

         0.0       0.83      0.60      0.70       763
         1.0       0.67      0.87      0.76       728

   micro avg       0.73      0.73      0.73      1491
   macro avg       0.75      0.74      0.73      1491
weighted avg       0.76      0.73      0.73      1491



100%|██████████| 50/50 [00:02<00:00, 18.23epoch/s]


accuracy: 0.7377598926894702
[[445 318]
 [ 73 655]]
              precision    recall  f1-score   support

         0.0       0.86      0.58      0.69       763
         1.0       0.67      0.90      0.77       728

   micro avg       0.74      0.74      0.74      1491
   macro avg       0.77      0.74      0.73      1491
weighted avg       0.77      0.74      0.73      1491



100%|██████████| 50/50 [00:02<00:00, 18.37epoch/s]


accuracy: 0.7431254191817572
[[444 319]
 [ 64 664]]
              precision    recall  f1-score   support

         0.0       0.87      0.58      0.70       763
         1.0       0.68      0.91      0.78       728

   micro avg       0.74      0.74      0.74      1491
   macro avg       0.77      0.75      0.74      1491
weighted avg       0.78      0.74      0.74      1491



100%|██████████| 50/50 [00:02<00:00, 18.25epoch/s]


accuracy: 0.7592219986586184
[[436 327]
 [ 32 696]]
              precision    recall  f1-score   support

         0.0       0.93      0.57      0.71       763
         1.0       0.68      0.96      0.79       728

   micro avg       0.76      0.76      0.76      1491
   macro avg       0.81      0.76      0.75      1491
weighted avg       0.81      0.76      0.75      1491



In [27]:
# KERAS
X_train.shape[1]

2226

In [28]:
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.3912385860239396, 0.8001341402010979]


## NearMiss (version = 3)

In [29]:
nm3 = NearMiss(version=3)
X_resampled_nm3, y_resampled3 = nm3.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled2).items()))

[(0.0, 3726), (1.0, 3726)]


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_nm3, y_resampled3, random_state=35, test_size=0.2)

In [31]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.61      0.69      0.65       763
         1.0       0.62      0.55      0.58       728

   micro avg       0.62      0.62      0.62      1491
   macro avg       0.62      0.62      0.62      1491
weighted avg       0.62      0.62      0.62      1491

[[524 239]
 [330 398]]
Accuracy is  61.837692823608315
Time on model's work: 0.907 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.62      0.74      0.67       763
         1.0       0.66      0.53      0.59       728

   micro avg       0.64      0.64      0.64      1491
   macro avg       0.64      0.63      0.63      1491
weighted avg       0.64      0.64      0.63      1491

[[562 201]
 [343 385]]
Accuracy is  63.51441985244802
Time on model's work: 35.264 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.59      0.66      0.62       763
         1.0       0.59      0.51      0.55       728

   micro avg       0.59      0.59      0.59      1491
   macro avg       0.59      0.58      0.58      1491
weighted avg       0.59      0.59      0.58      1491

[[500 263]
 [354 374]]
Accuracy is  58.61837692823608
Time on model's work: 1.553 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.62      0.70      0.66       763
         1.0       0.64      0.56      0.59       728

   micro avg       0.63      0.63      0.63      1491
   macro avg       0.63      0.63      0.63      1491
weighted avg       0.63      0.63      0.63      1491

[[532 231]
 [322 406]]
Accuracy is  62.91079812206573
Time on model's work: 9.505 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.62      0.75      0.68       763
         1.



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.61      0.55      0.58       763
         1.0       0.57      0.62      0.60       728

   micro avg       0.59      0.59      0.59      1491
   macro avg       0.59      0.59      0.59      1491
weighted avg       0.59      0.59      0.59      1491

[[421 342]
 [274 454]]
Accuracy is  58.68544600938967
Time on model's work: 118.156 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.62      0.74      0.67       763
         1.0       0.66      0.52      0.58       728

   micro avg       0.63      0.63      0.63      1491
   macro avg       0.64      0.63      0.63      1491
weighted avg       0.64      0.63      0.63      1491

[[563 200]
 [347 381]]
Accuracy is  63.31321260898726
Time on model's work: 41.343 s
0:	learn: 0.6892697	total: 69.8ms	remaining: 1m 9s
1:	learn: 0.6855122	total: 153ms	remaining: 1m 16s
2:	learn: 0.6823025	total: 224ms	remaining: 1

138:	learn: 0.6174709	total: 11.1s	remaining: 1m 8s
139:	learn: 0.6174350	total: 11.1s	remaining: 1m 8s
140:	learn: 0.6172956	total: 11.2s	remaining: 1m 8s
141:	learn: 0.6172124	total: 11.2s	remaining: 1m 7s
142:	learn: 0.6170515	total: 11.3s	remaining: 1m 7s
143:	learn: 0.6169332	total: 11.4s	remaining: 1m 7s
144:	learn: 0.6167651	total: 11.4s	remaining: 1m 7s
145:	learn: 0.6166714	total: 11.5s	remaining: 1m 7s
146:	learn: 0.6166256	total: 11.6s	remaining: 1m 7s
147:	learn: 0.6164988	total: 11.6s	remaining: 1m 7s
148:	learn: 0.6163757	total: 11.7s	remaining: 1m 6s
149:	learn: 0.6162858	total: 11.8s	remaining: 1m 6s
150:	learn: 0.6162117	total: 11.8s	remaining: 1m 6s
151:	learn: 0.6161325	total: 11.9s	remaining: 1m 6s
152:	learn: 0.6159820	total: 12s	remaining: 1m 6s
153:	learn: 0.6158477	total: 12s	remaining: 1m 6s
154:	learn: 0.6156839	total: 12.1s	remaining: 1m 6s
155:	learn: 0.6156033	total: 12.2s	remaining: 1m 5s
156:	learn: 0.6155354	total: 12.2s	remaining: 1m 5s
157:	learn: 0.61

298:	learn: 0.6036119	total: 21.7s	remaining: 50.9s
299:	learn: 0.6034671	total: 21.8s	remaining: 50.8s
300:	learn: 0.6034338	total: 21.8s	remaining: 50.7s
301:	learn: 0.6033851	total: 21.9s	remaining: 50.6s
302:	learn: 0.6033305	total: 22s	remaining: 50.5s
303:	learn: 0.6033122	total: 22s	remaining: 50.4s
304:	learn: 0.6032613	total: 22.1s	remaining: 50.4s
305:	learn: 0.6031080	total: 22.2s	remaining: 50.3s
306:	learn: 0.6030356	total: 22.2s	remaining: 50.2s
307:	learn: 0.6029953	total: 22.3s	remaining: 50.1s
308:	learn: 0.6029229	total: 22.4s	remaining: 50s
309:	learn: 0.6029106	total: 22.4s	remaining: 49.9s
310:	learn: 0.6028973	total: 22.5s	remaining: 49.8s
311:	learn: 0.6028416	total: 22.6s	remaining: 49.7s
312:	learn: 0.6028074	total: 22.6s	remaining: 49.7s
313:	learn: 0.6027421	total: 22.7s	remaining: 49.6s
314:	learn: 0.6026392	total: 22.8s	remaining: 49.5s
315:	learn: 0.6025229	total: 22.8s	remaining: 49.4s
316:	learn: 0.6024768	total: 22.9s	remaining: 49.4s
317:	learn: 0.6024

457:	learn: 0.5901406	total: 32.9s	remaining: 38.9s
458:	learn: 0.5901008	total: 32.9s	remaining: 38.8s
459:	learn: 0.5900307	total: 33s	remaining: 38.7s
460:	learn: 0.5899852	total: 33.1s	remaining: 38.7s
461:	learn: 0.5898565	total: 33.1s	remaining: 38.6s
462:	learn: 0.5898408	total: 33.2s	remaining: 38.5s
463:	learn: 0.5898232	total: 33.2s	remaining: 38.4s
464:	learn: 0.5897168	total: 33.3s	remaining: 38.3s
465:	learn: 0.5895158	total: 33.4s	remaining: 38.3s
466:	learn: 0.5894618	total: 33.5s	remaining: 38.2s
467:	learn: 0.5894175	total: 33.5s	remaining: 38.1s
468:	learn: 0.5893181	total: 33.6s	remaining: 38s
469:	learn: 0.5892635	total: 33.6s	remaining: 37.9s
470:	learn: 0.5892067	total: 33.7s	remaining: 37.9s
471:	learn: 0.5891740	total: 33.8s	remaining: 37.8s
472:	learn: 0.5891415	total: 33.8s	remaining: 37.7s
473:	learn: 0.5890855	total: 33.9s	remaining: 37.6s
474:	learn: 0.5890144	total: 34s	remaining: 37.6s
475:	learn: 0.5889382	total: 34s	remaining: 37.5s
476:	learn: 0.588836

616:	learn: 0.5813544	total: 43.4s	remaining: 26.9s
617:	learn: 0.5812389	total: 43.5s	remaining: 26.9s
618:	learn: 0.5812043	total: 43.5s	remaining: 26.8s
619:	learn: 0.5810824	total: 43.6s	remaining: 26.7s
620:	learn: 0.5810119	total: 43.7s	remaining: 26.7s
621:	learn: 0.5808386	total: 43.7s	remaining: 26.6s
622:	learn: 0.5806233	total: 43.8s	remaining: 26.5s
623:	learn: 0.5806050	total: 43.9s	remaining: 26.4s
624:	learn: 0.5805144	total: 43.9s	remaining: 26.4s
625:	learn: 0.5804725	total: 44s	remaining: 26.3s
626:	learn: 0.5804541	total: 44.1s	remaining: 26.2s
627:	learn: 0.5804153	total: 44.1s	remaining: 26.1s
628:	learn: 0.5803419	total: 44.2s	remaining: 26.1s
629:	learn: 0.5803042	total: 44.3s	remaining: 26s
630:	learn: 0.5802678	total: 44.3s	remaining: 25.9s
631:	learn: 0.5802208	total: 44.4s	remaining: 25.8s
632:	learn: 0.5801493	total: 44.4s	remaining: 25.8s
633:	learn: 0.5799163	total: 44.5s	remaining: 25.7s
634:	learn: 0.5798280	total: 44.6s	remaining: 25.6s
635:	learn: 0.57

775:	learn: 0.5733868	total: 54s	remaining: 15.6s
776:	learn: 0.5733314	total: 54s	remaining: 15.5s
777:	learn: 0.5733053	total: 54.1s	remaining: 15.4s
778:	learn: 0.5732751	total: 54.2s	remaining: 15.4s
779:	learn: 0.5732277	total: 54.2s	remaining: 15.3s
780:	learn: 0.5732134	total: 54.3s	remaining: 15.2s
781:	learn: 0.5729589	total: 54.4s	remaining: 15.2s
782:	learn: 0.5729465	total: 54.4s	remaining: 15.1s
783:	learn: 0.5729295	total: 54.5s	remaining: 15s
784:	learn: 0.5729161	total: 54.6s	remaining: 14.9s
785:	learn: 0.5728844	total: 54.7s	remaining: 14.9s
786:	learn: 0.5728701	total: 54.7s	remaining: 14.8s
787:	learn: 0.5728493	total: 54.8s	remaining: 14.7s
788:	learn: 0.5728373	total: 54.9s	remaining: 14.7s
789:	learn: 0.5728157	total: 54.9s	remaining: 14.6s
790:	learn: 0.5727557	total: 55s	remaining: 14.5s
791:	learn: 0.5727464	total: 55.1s	remaining: 14.5s
792:	learn: 0.5727180	total: 55.1s	remaining: 14.4s
793:	learn: 0.5726942	total: 55.2s	remaining: 14.3s
794:	learn: 0.572668

934:	learn: 0.5678095	total: 1m 6s	remaining: 4.63s
935:	learn: 0.5677453	total: 1m 6s	remaining: 4.56s
936:	learn: 0.5676420	total: 1m 6s	remaining: 4.49s
937:	learn: 0.5676173	total: 1m 6s	remaining: 4.42s
938:	learn: 0.5676035	total: 1m 6s	remaining: 4.35s
939:	learn: 0.5675919	total: 1m 6s	remaining: 4.27s
940:	learn: 0.5675682	total: 1m 7s	remaining: 4.2s
941:	learn: 0.5675423	total: 1m 7s	remaining: 4.13s
942:	learn: 0.5674986	total: 1m 7s	remaining: 4.06s
943:	learn: 0.5674316	total: 1m 7s	remaining: 3.99s
944:	learn: 0.5674041	total: 1m 7s	remaining: 3.92s
945:	learn: 0.5673762	total: 1m 7s	remaining: 3.85s
946:	learn: 0.5673639	total: 1m 7s	remaining: 3.77s
947:	learn: 0.5673618	total: 1m 7s	remaining: 3.7s
948:	learn: 0.5672606	total: 1m 7s	remaining: 3.63s
949:	learn: 0.5672333	total: 1m 7s	remaining: 3.56s
950:	learn: 0.5672122	total: 1m 7s	remaining: 3.49s
951:	learn: 0.5672005	total: 1m 7s	remaining: 3.42s
952:	learn: 0.5671820	total: 1m 7s	remaining: 3.35s
953:	learn: 0.



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.62      0.67      0.65       763
         1.0       0.63      0.57      0.60       728

   micro avg       0.62      0.62      0.62      1491
   macro avg       0.62      0.62      0.62      1491
weighted avg       0.62      0.62      0.62      1491

[[515 248]
 [313 415]]
Accuracy is  62.37424547283702
Time on model's work: 0.184 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.56      0.93      0.70       763
         1.0       0.76      0.22      0.35       728

   micro avg       0.59      0.59      0.59      1491
   macro avg       0.66      0.58      0.52      1491
weighted avg       0.66      0.59      0.53      1491

[[712  51]
 [565 163]]
Accuracy is  58.68544600938967
Time on model's work: 0.163 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.61      0.62      0.61       763
         1.0       0.59      0.58      0.58       728

   micro avg       0.60      0.60      0.60      1491
   macro avg       0.60      0.60      0.60      1491
weighted avg       0.60      0.60      0.60      1491

[[475 288]
 [308 420]]
Accuracy is  60.02682763246143
Time on model's work: 19.537 s
TOTAL TIME SPENT:  307.114 s


In [32]:
# TFFM sparse - works worse with sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# weight - optional / AdamOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:02<00:00, 17.89epoch/s]


accuracy: 0.6170355466130114
[[512 251]
 [320 408]]
              precision    recall  f1-score   support

         0.0       0.62      0.67      0.64       763
         1.0       0.62      0.56      0.59       728

   micro avg       0.62      0.62      0.62      1491
   macro avg       0.62      0.62      0.62      1491
weighted avg       0.62      0.62      0.62      1491



100%|██████████| 50/50 [00:02<00:00, 18.10epoch/s]


accuracy: 0.574111334674715
[[261 502]
 [133 595]]
              precision    recall  f1-score   support

         0.0       0.66      0.34      0.45       763
         1.0       0.54      0.82      0.65       728

   micro avg       0.57      0.57      0.57      1491
   macro avg       0.60      0.58      0.55      1491
weighted avg       0.60      0.57      0.55      1491



100%|██████████| 50/50 [00:02<00:00, 17.87epoch/s]


accuracy: 0.5197853789403085
[[117 646]
 [ 70 658]]
              precision    recall  f1-score   support

         0.0       0.63      0.15      0.25       763
         1.0       0.50      0.90      0.65       728

   micro avg       0.52      0.52      0.52      1491
   macro avg       0.57      0.53      0.45      1491
weighted avg       0.57      0.52      0.44      1491



100%|██████████| 50/50 [00:02<00:00, 18.27epoch/s]


accuracy: 0.5050301810865191
[[ 54 709]
 [ 29 699]]
              precision    recall  f1-score   support

         0.0       0.65      0.07      0.13       763
         1.0       0.50      0.96      0.65       728

   micro avg       0.51      0.51      0.51      1491
   macro avg       0.57      0.52      0.39      1491
weighted avg       0.58      0.51      0.38      1491



In [33]:
# weight - optional / FtrlOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:02<00:00, 17.59epoch/s]


accuracy: 0.5761234071093226
[[412 351]
 [281 447]]
              precision    recall  f1-score   support

         0.0       0.59      0.54      0.57       763
         1.0       0.56      0.61      0.59       728

   micro avg       0.58      0.58      0.58      1491
   macro avg       0.58      0.58      0.58      1491
weighted avg       0.58      0.58      0.58      1491



100%|██████████| 50/50 [00:02<00:00, 17.53epoch/s]


accuracy: 0.48826291079812206
[[  0 763]
 [  0 728]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       763
         1.0       0.49      1.00      0.66       728

   micro avg       0.49      0.49      0.49      1491
   macro avg       0.24      0.50      0.33      1491
weighted avg       0.24      0.49      0.32      1491



100%|██████████| 50/50 [00:02<00:00, 18.10epoch/s]


accuracy: 0.48826291079812206
[[  0 763]
 [  0 728]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       763
         1.0       0.49      1.00      0.66       728

   micro avg       0.49      0.49      0.49      1491
   macro avg       0.24      0.50      0.33      1491
weighted avg       0.24      0.49      0.32      1491



100%|██████████| 50/50 [00:02<00:00, 18.28epoch/s]


accuracy: 0.48826291079812206
[[  0 763]
 [  0 728]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       763
         1.0       0.49      1.00      0.66       728

   micro avg       0.49      0.49      0.49      1491
   macro avg       0.24      0.50      0.33      1491
weighted avg       0.24      0.49      0.32      1491



In [34]:
# KERAS
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.6452999041034102, 0.6351441986843853]


## Nearmiss (version = 1) shows the best results

In [4]:
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours()
X_resampled_enn, y_resampled_enn = enn.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled_enn).items()))

[(0.0, 38652), (1.0, 3726)]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_enn, y_resampled_enn, random_state=35, test_size=0.2)

In [6]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96      7748
         1.0       0.75      0.32      0.45       728

   micro avg       0.93      0.93      0.93      8476
   macro avg       0.84      0.65      0.71      8476
weighted avg       0.92      0.93      0.92      8476

[[7669   79]
 [ 496  232]]
Accuracy is  93.21613968853232
Time on model's work: 5.333 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96      7748
         1.0       0.92      0.07      0.13       728

   micro avg       0.92      0.92      0.92      8476
   macro avg       0.92      0.53      0.54      8476
weighted avg       0.92      0.92      0.89      8476

[[7744    4]
 [ 679   49]]
Accuracy is  91.9419537517697
Time on model's work: 308.262 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96      7748
         1.0       0.75      0.32      0.45       728

   micro avg       0.93      0.93      0.93      8476
   macro avg       0.85      0.66      0.71      8476
weighted avg       0.92      0.93      0.92      8476

[[7672   76]
 [ 495  233]]
Accuracy is  93.26333176026428
Time on model's work: 9.704 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96      7748
         1.0       0.83      0.07      0.13       728

   micro avg       0.92      0.92      0.92      8476
   macro avg       0.88      0.53      0.54      8476
weighted avg       0.91      0.92      0.89      8476

[[7738   10]
 [ 678   50]]
Accuracy is  91.88296366210477
Time on model's work: 72.002 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96      7748
  



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.95      0.96      0.96      7748
         1.0       0.54      0.44      0.49       728

   micro avg       0.92      0.92      0.92      8476
   macro avg       0.74      0.70      0.72      8476
weighted avg       0.91      0.92      0.92      8476

[[7474  274]
 [ 405  323]]
Accuracy is  91.98914582350164
Time on model's work: 1019.896 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96      7748
         1.0       0.96      0.06      0.12       728

   micro avg       0.92      0.92      0.92      8476
   macro avg       0.94      0.53      0.54      8476
weighted avg       0.92      0.92      0.89      8476

[[7746    2]
 [ 681   47]]
Accuracy is  91.9419537517697
Time on model's work: 265.292 s
0:	learn: 0.6249963	total: 444ms	remaining: 7m 23s
1:	learn: 0.5666980	total: 798ms	remaining: 6m 38s
2:	learn: 0.5169890	total: 1.15s	rem

138:	learn: 0.2136336	total: 49.4s	remaining: 5m 6s
139:	learn: 0.2136093	total: 49.7s	remaining: 5m 5s
140:	learn: 0.2136069	total: 50s	remaining: 5m 4s
141:	learn: 0.2135793	total: 50.3s	remaining: 5m 4s
142:	learn: 0.2135630	total: 50.6s	remaining: 5m 3s
143:	learn: 0.2135402	total: 50.9s	remaining: 5m 2s
144:	learn: 0.2134616	total: 51.2s	remaining: 5m 1s
145:	learn: 0.2134420	total: 51.5s	remaining: 5m
146:	learn: 0.2134211	total: 51.8s	remaining: 5m
147:	learn: 0.2134002	total: 52.1s	remaining: 4m 59s
148:	learn: 0.2133066	total: 52.6s	remaining: 5m
149:	learn: 0.2132858	total: 53.1s	remaining: 5m
150:	learn: 0.2132438	total: 53.6s	remaining: 5m 1s
151:	learn: 0.2131767	total: 54s	remaining: 5m 1s
152:	learn: 0.2131261	total: 54.5s	remaining: 5m 1s
153:	learn: 0.2131106	total: 54.9s	remaining: 5m 1s
154:	learn: 0.2130904	total: 55.3s	remaining: 5m 1s
155:	learn: 0.2130708	total: 55.6s	remaining: 5m 1s
156:	learn: 0.2130586	total: 56s	remaining: 5m
157:	learn: 0.2130442	total: 56.

292:	learn: 0.2088979	total: 1m 43s	remaining: 4m 8s
293:	learn: 0.2088604	total: 1m 43s	remaining: 4m 8s
294:	learn: 0.2088434	total: 1m 43s	remaining: 4m 8s
295:	learn: 0.2088261	total: 1m 44s	remaining: 4m 7s
296:	learn: 0.2088053	total: 1m 44s	remaining: 4m 7s
297:	learn: 0.2087905	total: 1m 44s	remaining: 4m 6s
298:	learn: 0.2087767	total: 1m 44s	remaining: 4m 6s
299:	learn: 0.2087596	total: 1m 45s	remaining: 4m 5s
300:	learn: 0.2087392	total: 1m 45s	remaining: 4m 5s
301:	learn: 0.2087079	total: 1m 45s	remaining: 4m 4s
302:	learn: 0.2086315	total: 1m 46s	remaining: 4m 4s
303:	learn: 0.2086177	total: 1m 46s	remaining: 4m 3s
304:	learn: 0.2085762	total: 1m 46s	remaining: 4m 3s
305:	learn: 0.2085580	total: 1m 47s	remaining: 4m 3s
306:	learn: 0.2085364	total: 1m 47s	remaining: 4m 2s
307:	learn: 0.2084976	total: 1m 47s	remaining: 4m 2s
308:	learn: 0.2084762	total: 1m 48s	remaining: 4m 1s
309:	learn: 0.2084507	total: 1m 48s	remaining: 4m 1s
310:	learn: 0.2084312	total: 1m 48s	remaining:

446:	learn: 0.2049190	total: 2m 32s	remaining: 3m 8s
447:	learn: 0.2048935	total: 2m 32s	remaining: 3m 8s
448:	learn: 0.2048238	total: 2m 33s	remaining: 3m 7s
449:	learn: 0.2047820	total: 2m 33s	remaining: 3m 7s
450:	learn: 0.2047718	total: 2m 33s	remaining: 3m 7s
451:	learn: 0.2047586	total: 2m 33s	remaining: 3m 6s
452:	learn: 0.2047482	total: 2m 34s	remaining: 3m 6s
453:	learn: 0.2047395	total: 2m 34s	remaining: 3m 5s
454:	learn: 0.2047312	total: 2m 34s	remaining: 3m 5s
455:	learn: 0.2047146	total: 2m 34s	remaining: 3m 4s
456:	learn: 0.2046941	total: 2m 35s	remaining: 3m 4s
457:	learn: 0.2046719	total: 2m 35s	remaining: 3m 4s
458:	learn: 0.2046612	total: 2m 35s	remaining: 3m 3s
459:	learn: 0.2046427	total: 2m 36s	remaining: 3m 3s
460:	learn: 0.2046257	total: 2m 36s	remaining: 3m 2s
461:	learn: 0.2046162	total: 2m 36s	remaining: 3m 2s
462:	learn: 0.2045882	total: 2m 36s	remaining: 3m 1s
463:	learn: 0.2045407	total: 2m 37s	remaining: 3m 1s
464:	learn: 0.2045349	total: 2m 37s	remaining:

599:	learn: 0.2013885	total: 3m 19s	remaining: 2m 12s
600:	learn: 0.2013240	total: 3m 19s	remaining: 2m 12s
601:	learn: 0.2013213	total: 3m 19s	remaining: 2m 12s
602:	learn: 0.2013114	total: 3m 20s	remaining: 2m 11s
603:	learn: 0.2012907	total: 3m 20s	remaining: 2m 11s
604:	learn: 0.2012868	total: 3m 20s	remaining: 2m 11s
605:	learn: 0.2012611	total: 3m 21s	remaining: 2m 10s
606:	learn: 0.2012522	total: 3m 21s	remaining: 2m 10s
607:	learn: 0.2011921	total: 3m 22s	remaining: 2m 10s
608:	learn: 0.2011443	total: 3m 22s	remaining: 2m 9s
609:	learn: 0.2011261	total: 3m 22s	remaining: 2m 9s
610:	learn: 0.2011193	total: 3m 23s	remaining: 2m 9s
611:	learn: 0.2011015	total: 3m 23s	remaining: 2m 8s
612:	learn: 0.2010964	total: 3m 23s	remaining: 2m 8s
613:	learn: 0.2010255	total: 3m 24s	remaining: 2m 8s
614:	learn: 0.2010145	total: 3m 24s	remaining: 2m 7s
615:	learn: 0.2009908	total: 3m 24s	remaining: 2m 7s
616:	learn: 0.2009662	total: 3m 24s	remaining: 2m 7s
617:	learn: 0.2009468	total: 3m 25s	r

752:	learn: 0.1986105	total: 4m 8s	remaining: 1m 21s
753:	learn: 0.1986065	total: 4m 8s	remaining: 1m 21s
754:	learn: 0.1985993	total: 4m 8s	remaining: 1m 20s
755:	learn: 0.1985771	total: 4m 9s	remaining: 1m 20s
756:	learn: 0.1985678	total: 4m 9s	remaining: 1m 20s
757:	learn: 0.1985649	total: 4m 9s	remaining: 1m 19s
758:	learn: 0.1985585	total: 4m 10s	remaining: 1m 19s
759:	learn: 0.1985244	total: 4m 10s	remaining: 1m 19s
760:	learn: 0.1984878	total: 4m 10s	remaining: 1m 18s
761:	learn: 0.1984673	total: 4m 11s	remaining: 1m 18s
762:	learn: 0.1984482	total: 4m 11s	remaining: 1m 18s
763:	learn: 0.1984426	total: 4m 11s	remaining: 1m 17s
764:	learn: 0.1984381	total: 4m 12s	remaining: 1m 17s
765:	learn: 0.1984337	total: 4m 12s	remaining: 1m 17s
766:	learn: 0.1984031	total: 4m 13s	remaining: 1m 16s
767:	learn: 0.1983508	total: 4m 13s	remaining: 1m 16s
768:	learn: 0.1982999	total: 4m 13s	remaining: 1m 16s
769:	learn: 0.1982867	total: 4m 14s	remaining: 1m 15s
770:	learn: 0.1982763	total: 4m 14

907:	learn: 0.1964215	total: 4m 58s	remaining: 30.2s
908:	learn: 0.1964021	total: 4m 58s	remaining: 29.9s
909:	learn: 0.1963979	total: 4m 58s	remaining: 29.5s
910:	learn: 0.1963696	total: 4m 58s	remaining: 29.2s
911:	learn: 0.1963628	total: 4m 59s	remaining: 28.9s
912:	learn: 0.1963617	total: 4m 59s	remaining: 28.5s
913:	learn: 0.1963543	total: 4m 59s	remaining: 28.2s
914:	learn: 0.1963526	total: 5m	remaining: 27.9s
915:	learn: 0.1963347	total: 5m	remaining: 27.6s
916:	learn: 0.1963109	total: 5m	remaining: 27.2s
917:	learn: 0.1963037	total: 5m 1s	remaining: 26.9s
918:	learn: 0.1962748	total: 5m 1s	remaining: 26.6s
919:	learn: 0.1962534	total: 5m 1s	remaining: 26.2s
920:	learn: 0.1962260	total: 5m 2s	remaining: 25.9s
921:	learn: 0.1962223	total: 5m 2s	remaining: 25.6s
922:	learn: 0.1962216	total: 5m 2s	remaining: 25.2s
923:	learn: 0.1962153	total: 5m 2s	remaining: 24.9s
924:	learn: 0.1962096	total: 5m 3s	remaining: 24.6s
925:	learn: 0.1962078	total: 5m 3s	remaining: 24.2s
926:	learn: 0.



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96      7748
         1.0       0.73      0.09      0.17       728

   micro avg       0.92      0.92      0.92      8476
   macro avg       0.83      0.55      0.56      8476
weighted avg       0.91      0.92      0.89      8476

[[7723   25]
 [ 659   69]]
Accuracy is  91.93015573383671
Time on model's work: 1.057 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      7748
         1.0       0.42      0.25      0.32       728

   micro avg       0.91      0.91      0.91      8476
   macro avg       0.68      0.61      0.63      8476
weighted avg       0.89      0.91      0.90      8476

[[7495  253]
 [ 544  184]]
Accuracy is  90.59697970740915
Time on model's work: 0.804 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95      7748
         1.0       0.51      0.13      0.21       728

   micro avg       0.91      0.91      0.91      8476
   macro avg       0.72      0.56      0.58      8476
weighted avg       0.89      0.91      0.89      8476

[[7659   89]
 [ 634   94]]
Accuracy is  91.47003303445021
Time on model's work: 119.171 s
TOTAL TIME SPENT:  2230.274 s


In [7]:
# TFFM sparse - works worse with sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# weight - optional / AdamOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:16<00:00,  3.36epoch/s]


accuracy: 0.9177678150070788
[[7662   86]
 [ 611  117]]
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      7748
         1.0       0.58      0.16      0.25       728

   micro avg       0.92      0.92      0.92      8476
   macro avg       0.75      0.57      0.60      8476
weighted avg       0.90      0.92      0.90      8476



100%|██████████| 50/50 [00:15<00:00,  3.44epoch/s]


accuracy: 0.9084473808400189
[[7495  253]
 [ 523  205]]
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      7748
         1.0       0.45      0.28      0.35       728

   micro avg       0.91      0.91      0.91      8476
   macro avg       0.69      0.62      0.65      8476
weighted avg       0.89      0.91      0.90      8476



100%|██████████| 50/50 [00:14<00:00,  3.44epoch/s]


accuracy: 0.8900424728645587
[[7231  517]
 [ 415  313]]
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94      7748
         1.0       0.38      0.43      0.40       728

   micro avg       0.89      0.89      0.89      8476
   macro avg       0.66      0.68      0.67      8476
weighted avg       0.90      0.89      0.89      8476



100%|██████████| 50/50 [00:14<00:00,  3.53epoch/s]


accuracy: 0.8671543180745634
[[6947  801]
 [ 325  403]]
              precision    recall  f1-score   support

         0.0       0.96      0.90      0.93      7748
         1.0       0.33      0.55      0.42       728

   micro avg       0.87      0.87      0.87      8476
   macro avg       0.65      0.73      0.67      8476
weighted avg       0.90      0.87      0.88      8476



In [8]:
# weight - optional / FtrlOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:14<00:00,  3.47epoch/s]


accuracy: 0.9141104294478528
[[7748    0]
 [ 728    0]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         0.0       0.91      1.00      0.96      7748
         1.0       0.00      0.00      0.00       728

   micro avg       0.91      0.91      0.91      8476
   macro avg       0.46      0.50      0.48      8476
weighted avg       0.84      0.91      0.87      8476



100%|██████████| 50/50 [00:14<00:00,  3.44epoch/s]


accuracy: 0.9137564889098632
[[7745    3]
 [ 728    0]]
              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      7748
         1.0       0.00      0.00      0.00       728

   micro avg       0.91      0.91      0.91      8476
   macro avg       0.46      0.50      0.48      8476
weighted avg       0.84      0.91      0.87      8476



100%|██████████| 50/50 [00:14<00:00,  3.49epoch/s]


accuracy: 0.8856772062293534
[[7313  435]
 [ 534  194]]
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      7748
         1.0       0.31      0.27      0.29       728

   micro avg       0.89      0.89      0.89      8476
   macro avg       0.62      0.61      0.61      8476
weighted avg       0.88      0.89      0.88      8476



100%|██████████| 50/50 [00:14<00:00,  3.41epoch/s]


accuracy: 0.8141812175554507
[[6442 1306]
 [ 269  459]]
              precision    recall  f1-score   support

         0.0       0.96      0.83      0.89      7748
         1.0       0.26      0.63      0.37       728

   micro avg       0.81      0.81      0.81      8476
   macro avg       0.61      0.73      0.63      8476
weighted avg       0.90      0.81      0.85      8476



In [9]:
# KERAS
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.20319069388932134, 0.9193015572539811]


In [4]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled_smote_enn, y_resampled_smote_enn = smote_enn.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled_smote_enn).items()))

[(0.0, 12603), (1.0, 15225)]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_smote_enn, y_resampled_smote_enn, random_state=35, test_size=0.2)

In [6]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2587
         1.0       0.98      0.99      0.98      2979

   micro avg       0.98      0.98      0.98      5566
   macro avg       0.98      0.98      0.98      5566
weighted avg       0.98      0.98      0.98      5566

[[2538   49]
 [  44 2935]]
Accuracy is  98.32914121451671
Time on model's work: 2.02 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.95      0.83      0.89      2587
         1.0       0.87      0.96      0.91      2979

   micro avg       0.90      0.90      0.90      5566
   macro avg       0.91      0.90      0.90      5566
weighted avg       0.91      0.90      0.90      5566

[[2149  438]
 [ 105 2874]]
Accuracy is  90.24434063959755
Time on model's work: 140.479 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98      2587
         1.0       0.98      0.99      0.99      2979

   micro avg       0.99      0.99      0.99      5566
   macro avg       0.99      0.98      0.99      5566
weighted avg       0.99      0.99      0.99      5566

[[2524   63]
 [  17 2962]]
Accuracy is  98.56270212001438
Time on model's work: 4.549 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87      2587
         1.0       0.86      0.95      0.90      2979

   micro avg       0.89      0.89      0.89      5566
   macro avg       0.90      0.88      0.89      5566
weighted avg       0.89      0.89      0.89      5566

[[2109  478]
 [ 142 2837]]
Accuracy is  88.8609414301114
Time on model's work: 35.168 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2587
   



MLPClassifier
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      2587
         1.0       0.97      0.99      0.98      2979

   micro avg       0.98      0.98      0.98      5566
   macro avg       0.98      0.98      0.98      5566
weighted avg       0.98      0.98      0.98      5566

[[2497   90]
 [  23 2956]]
Accuracy is  97.9698167445203
Time on model's work: 465.867 s
XGBClassifier
              precision    recall  f1-score   support

         0.0       0.95      0.82      0.88      2587
         1.0       0.86      0.97      0.91      2979

   micro avg       0.90      0.90      0.90      5566
   macro avg       0.91      0.89      0.90      5566
weighted avg       0.91      0.90      0.90      5566

[[2126  461]
 [ 101 2878]]
Accuracy is  89.90298239310097
Time on model's work: 117.317 s
0:	learn: 0.6583400	total: 287ms	remaining: 4m 46s
1:	learn: 0.6265091	total: 515ms	remaining: 4m 16s
2:	learn: 0.5994792	total: 728ms	rema

139:	learn: 0.2116318	total: 28.6s	remaining: 2m 55s
140:	learn: 0.2113064	total: 28.8s	remaining: 2m 55s
141:	learn: 0.2111223	total: 29s	remaining: 2m 55s
142:	learn: 0.2109607	total: 29.2s	remaining: 2m 54s
143:	learn: 0.2099574	total: 29.3s	remaining: 2m 54s
144:	learn: 0.2097866	total: 29.5s	remaining: 2m 54s
145:	learn: 0.2096651	total: 29.7s	remaining: 2m 53s
146:	learn: 0.2095745	total: 29.9s	remaining: 2m 53s
147:	learn: 0.2086767	total: 30.1s	remaining: 2m 53s
148:	learn: 0.2084267	total: 30.2s	remaining: 2m 52s
149:	learn: 0.2080643	total: 30.4s	remaining: 2m 52s
150:	learn: 0.2079191	total: 30.6s	remaining: 2m 52s
151:	learn: 0.2071393	total: 30.8s	remaining: 2m 51s
152:	learn: 0.2062555	total: 31s	remaining: 2m 51s
153:	learn: 0.2053842	total: 31.3s	remaining: 2m 51s
154:	learn: 0.2052411	total: 31.4s	remaining: 2m 51s
155:	learn: 0.2040973	total: 31.6s	remaining: 2m 51s
156:	learn: 0.2039477	total: 31.8s	remaining: 2m 50s
157:	learn: 0.2035948	total: 32s	remaining: 2m 50s

295:	learn: 0.1560774	total: 59.5s	remaining: 2m 21s
296:	learn: 0.1559852	total: 59.8s	remaining: 2m 21s
297:	learn: 0.1558816	total: 60s	remaining: 2m 21s
298:	learn: 0.1555807	total: 1m	remaining: 2m 21s
299:	learn: 0.1555136	total: 1m	remaining: 2m 20s
300:	learn: 0.1554384	total: 1m	remaining: 2m 20s
301:	learn: 0.1553610	total: 1m	remaining: 2m 20s
302:	learn: 0.1542484	total: 1m	remaining: 2m 20s
303:	learn: 0.1541097	total: 1m 1s	remaining: 2m 19s
304:	learn: 0.1538491	total: 1m 1s	remaining: 2m 19s
305:	learn: 0.1534202	total: 1m 1s	remaining: 2m 19s
306:	learn: 0.1528383	total: 1m 1s	remaining: 2m 19s
307:	learn: 0.1525760	total: 1m 1s	remaining: 2m 19s
308:	learn: 0.1525120	total: 1m 2s	remaining: 2m 18s
309:	learn: 0.1521892	total: 1m 2s	remaining: 2m 18s
310:	learn: 0.1519246	total: 1m 2s	remaining: 2m 18s
311:	learn: 0.1513583	total: 1m 2s	remaining: 2m 18s
312:	learn: 0.1508249	total: 1m 2s	remaining: 2m 18s
313:	learn: 0.1504005	total: 1m 3s	remaining: 2m 18s
314:	learn

450:	learn: 0.1248746	total: 1m 28s	remaining: 1m 48s
451:	learn: 0.1248214	total: 1m 28s	remaining: 1m 47s
452:	learn: 0.1245711	total: 1m 29s	remaining: 1m 47s
453:	learn: 0.1245462	total: 1m 29s	remaining: 1m 47s
454:	learn: 0.1245041	total: 1m 29s	remaining: 1m 47s
455:	learn: 0.1244658	total: 1m 29s	remaining: 1m 46s
456:	learn: 0.1244066	total: 1m 29s	remaining: 1m 46s
457:	learn: 0.1243811	total: 1m 29s	remaining: 1m 46s
458:	learn: 0.1243509	total: 1m 30s	remaining: 1m 46s
459:	learn: 0.1243078	total: 1m 30s	remaining: 1m 46s
460:	learn: 0.1242714	total: 1m 30s	remaining: 1m 45s
461:	learn: 0.1241344	total: 1m 30s	remaining: 1m 45s
462:	learn: 0.1235750	total: 1m 30s	remaining: 1m 45s
463:	learn: 0.1235479	total: 1m 31s	remaining: 1m 45s
464:	learn: 0.1234153	total: 1m 31s	remaining: 1m 44s
465:	learn: 0.1230330	total: 1m 31s	remaining: 1m 44s
466:	learn: 0.1230085	total: 1m 31s	remaining: 1m 44s
467:	learn: 0.1226429	total: 1m 31s	remaining: 1m 44s
468:	learn: 0.1226079	total:

602:	learn: 0.1066334	total: 1m 55s	remaining: 1m 16s
603:	learn: 0.1066169	total: 1m 56s	remaining: 1m 16s
604:	learn: 0.1064784	total: 1m 56s	remaining: 1m 15s
605:	learn: 0.1064579	total: 1m 56s	remaining: 1m 15s
606:	learn: 0.1064443	total: 1m 56s	remaining: 1m 15s
607:	learn: 0.1063975	total: 1m 56s	remaining: 1m 15s
608:	learn: 0.1063743	total: 1m 56s	remaining: 1m 15s
609:	learn: 0.1063537	total: 1m 57s	remaining: 1m 14s
610:	learn: 0.1063089	total: 1m 57s	remaining: 1m 14s
611:	learn: 0.1062952	total: 1m 57s	remaining: 1m 14s
612:	learn: 0.1062772	total: 1m 57s	remaining: 1m 14s
613:	learn: 0.1062515	total: 1m 57s	remaining: 1m 14s
614:	learn: 0.1062263	total: 1m 57s	remaining: 1m 13s
615:	learn: 0.1061943	total: 1m 58s	remaining: 1m 13s
616:	learn: 0.1058710	total: 1m 58s	remaining: 1m 13s
617:	learn: 0.1058479	total: 1m 58s	remaining: 1m 13s
618:	learn: 0.1055850	total: 1m 58s	remaining: 1m 13s
619:	learn: 0.1055711	total: 1m 58s	remaining: 1m 12s
620:	learn: 0.1055531	total:

758:	learn: 0.0963984	total: 2m 26s	remaining: 46.5s
759:	learn: 0.0963788	total: 2m 26s	remaining: 46.3s
760:	learn: 0.0963650	total: 2m 26s	remaining: 46.1s
761:	learn: 0.0963494	total: 2m 26s	remaining: 45.9s
762:	learn: 0.0960717	total: 2m 27s	remaining: 45.7s
763:	learn: 0.0960646	total: 2m 27s	remaining: 45.5s
764:	learn: 0.0959945	total: 2m 27s	remaining: 45.3s
765:	learn: 0.0959824	total: 2m 27s	remaining: 45.1s
766:	learn: 0.0959720	total: 2m 27s	remaining: 44.9s
767:	learn: 0.0959599	total: 2m 27s	remaining: 44.7s
768:	learn: 0.0959480	total: 2m 28s	remaining: 44.5s
769:	learn: 0.0959336	total: 2m 28s	remaining: 44.3s
770:	learn: 0.0958712	total: 2m 28s	remaining: 44.1s
771:	learn: 0.0957467	total: 2m 28s	remaining: 43.9s
772:	learn: 0.0957284	total: 2m 28s	remaining: 43.7s
773:	learn: 0.0954188	total: 2m 29s	remaining: 43.5s
774:	learn: 0.0953214	total: 2m 29s	remaining: 43.3s
775:	learn: 0.0951268	total: 2m 29s	remaining: 43.2s
776:	learn: 0.0951172	total: 2m 29s	remaining:

914:	learn: 0.0893955	total: 2m 56s	remaining: 16.4s
915:	learn: 0.0893610	total: 2m 56s	remaining: 16.2s
916:	learn: 0.0893508	total: 2m 56s	remaining: 16s
917:	learn: 0.0893421	total: 2m 57s	remaining: 15.8s
918:	learn: 0.0892201	total: 2m 57s	remaining: 15.6s
919:	learn: 0.0891807	total: 2m 57s	remaining: 15.4s
920:	learn: 0.0891573	total: 2m 57s	remaining: 15.2s
921:	learn: 0.0891464	total: 2m 57s	remaining: 15s
922:	learn: 0.0891228	total: 2m 57s	remaining: 14.8s
923:	learn: 0.0890563	total: 2m 58s	remaining: 14.7s
924:	learn: 0.0889708	total: 2m 58s	remaining: 14.5s
925:	learn: 0.0889651	total: 2m 58s	remaining: 14.3s
926:	learn: 0.0889507	total: 2m 58s	remaining: 14.1s
927:	learn: 0.0889378	total: 2m 58s	remaining: 13.9s
928:	learn: 0.0889330	total: 2m 59s	remaining: 13.7s
929:	learn: 0.0889219	total: 2m 59s	remaining: 13.5s
930:	learn: 0.0889113	total: 2m 59s	remaining: 13.3s
931:	learn: 0.0888994	total: 2m 59s	remaining: 13.1s
932:	learn: 0.0888901	total: 2m 59s	remaining: 12.



LogisticRegression
              precision    recall  f1-score   support

         0.0       0.94      0.81      0.87      2587
         1.0       0.85      0.96      0.90      2979

   micro avg       0.89      0.89      0.89      5566
   macro avg       0.90      0.88      0.89      5566
weighted avg       0.90      0.89      0.89      5566

[[2103  484]
 [ 130 2849]]
Accuracy is  88.96873877111031
Time on model's work: 0.633 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.92      0.83      0.87      2587
         1.0       0.86      0.93      0.90      2979

   micro avg       0.88      0.88      0.88      5566
   macro avg       0.89      0.88      0.88      5566
weighted avg       0.89      0.88      0.88      5566

[[2143  444]
 [ 197 2782]]
Accuracy is  88.48365073661516
Time on model's work: 0.382 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.95      0.82      0.88      2587
         1.0       0.86      0.96      0.91      2979

   micro avg       0.90      0.90      0.90      5566
   macro avg       0.90      0.89      0.89      5566
weighted avg       0.90      0.90      0.89      5566

[[2126  461]
 [ 123 2856]]
Accuracy is  89.50772547610492
Time on model's work: 52.556 s
TOTAL TIME SPENT:  1062.091 s


In [8]:
# TFFM sparse - works worse with sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# weight - optional / AdamOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:09<00:00,  5.15epoch/s]


accuracy: 0.8979518505210204
[[2140  447]
 [ 121 2858]]
              precision    recall  f1-score   support

         0.0       0.95      0.83      0.88      2587
         1.0       0.86      0.96      0.91      2979

   micro avg       0.90      0.90      0.90      5566
   macro avg       0.91      0.89      0.90      5566
weighted avg       0.90      0.90      0.90      5566



100%|██████████| 50/50 [00:09<00:00,  5.18epoch/s]


accuracy: 0.8981315127560187
[[2088  499]
 [  68 2911]]
              precision    recall  f1-score   support

         0.0       0.97      0.81      0.88      2587
         1.0       0.85      0.98      0.91      2979

   micro avg       0.90      0.90      0.90      5566
   macro avg       0.91      0.89      0.90      5566
weighted avg       0.91      0.90      0.90      5566



100%|██████████| 50/50 [00:09<00:00,  5.16epoch/s]


accuracy: 0.8889687387711103
[[2020  567]
 [  51 2928]]
              precision    recall  f1-score   support

         0.0       0.98      0.78      0.87      2587
         1.0       0.84      0.98      0.90      2979

   micro avg       0.89      0.89      0.89      5566
   macro avg       0.91      0.88      0.89      5566
weighted avg       0.90      0.89      0.89      5566



100%|██████████| 50/50 [00:09<00:00,  5.16epoch/s]


accuracy: 0.8842975206611571
[[1978  609]
 [  35 2944]]
              precision    recall  f1-score   support

         0.0       0.98      0.76      0.86      2587
         1.0       0.83      0.99      0.90      2979

   micro avg       0.88      0.88      0.88      5566
   macro avg       0.91      0.88      0.88      5566
weighted avg       0.90      0.88      0.88      5566



In [9]:
# weight - optional / FtrlOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:10<00:00,  4.73epoch/s]


accuracy: 0.8512396694214877
[[2063  524]
 [ 304 2675]]
              precision    recall  f1-score   support

         0.0       0.87      0.80      0.83      2587
         1.0       0.84      0.90      0.87      2979

   micro avg       0.85      0.85      0.85      5566
   macro avg       0.85      0.85      0.85      5566
weighted avg       0.85      0.85      0.85      5566



100%|██████████| 50/50 [00:10<00:00,  5.01epoch/s]


accuracy: 0.8681279195113187
[[2043  544]
 [ 190 2789]]
              precision    recall  f1-score   support

         0.0       0.91      0.79      0.85      2587
         1.0       0.84      0.94      0.88      2979

   micro avg       0.87      0.87      0.87      5566
   macro avg       0.88      0.86      0.87      5566
weighted avg       0.87      0.87      0.87      5566



100%|██████████| 50/50 [00:10<00:00,  5.02epoch/s]


accuracy: 0.8607617678763924
[[1964  623]
 [ 152 2827]]
              precision    recall  f1-score   support

         0.0       0.93      0.76      0.84      2587
         1.0       0.82      0.95      0.88      2979

   micro avg       0.86      0.86      0.86      5566
   macro avg       0.87      0.85      0.86      5566
weighted avg       0.87      0.86      0.86      5566



100%|██████████| 50/50 [00:10<00:00,  4.99epoch/s]


accuracy: 0.8559108875314408
[[1894  693]
 [ 109 2870]]
              precision    recall  f1-score   support

         0.0       0.95      0.73      0.83      2587
         1.0       0.81      0.96      0.88      2979

   micro avg       0.86      0.86      0.86      5566
   macro avg       0.88      0.85      0.85      5566
weighted avg       0.87      0.86      0.85      5566



In [10]:
# KERAS
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.2920728946306103, 0.8812432620450827]


In [11]:
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled_smote_tomek, y_resampled_smote_tomek = smote_tomek.fit_resample(features_list_array, labels_list_array)
print(sorted(Counter(y_resampled_smote_tomek).items()))

[(0.0, 18502), (1.0, 18502)]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_smote_tomek, y_resampled_smote_tomek, random_state=35, test_size=0.2)

In [13]:
clfs = [
        ['RandomForestClassifier', RandomForestClassifier()],
        ['GradientBoostingClassifier', GradientBoostingClassifier()],
        ['ExtraTreesClassifier', ExtraTreesClassifier()],
        ['AdaBoostClassifier', AdaBoostClassifier()],
        ['BaggingClassifier', BaggingClassifier()],
        ['DecisionTreeClassifier', DecisionTreeClassifier()],
        ['MLPClassifier', MLPClassifier()],
        ['XGBClassifier', XGBClassifier()],
        ['CatBoostClassifier', CatBoostClassifier()],
        ['LogisticRegression', LogisticRegression()],
        ['SGDClassifier', SGDClassifier()],
        ['TFFMClassifier', TFFMClassifier()]
       ]
t = time()
for name, clf in clfs:
    t0 = time()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print('Accuracy is ', accuracy_score(y_test, prediction)*100)
    print ("Time on model's work:", round(time()-t0, 3), "s")
    print('='*100)
print ("TOTAL TIME SPENT: ", round(time()-t, 3), "s")



RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94      3781
         1.0       0.95      0.92      0.94      3620

   micro avg       0.94      0.94      0.94      7401
   macro avg       0.94      0.94      0.94      7401
weighted avg       0.94      0.94      0.94      7401

[[3609  172]
 [ 282 3338]]
Accuracy is  93.86569382515876
Time on model's work: 3.074 s
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.91      0.78      0.84      3781
         1.0       0.80      0.92      0.86      3620

   micro avg       0.85      0.85      0.85      7401
   macro avg       0.86      0.85      0.85      7401
weighted avg       0.86      0.85      0.85      7401

[[2960  821]
 [ 286 3334]]
Accuracy is  85.04256181597081
Time on model's work: 214.916 s




ExtraTreesClassifier
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.94      3781
         1.0       0.95      0.93      0.94      3620

   micro avg       0.94      0.94      0.94      7401
   macro avg       0.94      0.94      0.94      7401
weighted avg       0.94      0.94      0.94      7401

[[3595  186]
 [ 244 3376]]
Accuracy is  94.18997432779355
Time on model's work: 6.802 s
AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.88      0.73      0.80      3781
         1.0       0.76      0.89      0.82      3620

   micro avg       0.81      0.81      0.81      7401
   macro avg       0.82      0.81      0.81      7401
weighted avg       0.82      0.81      0.81      7401

[[2753 1028]
 [ 384 3236]]
Accuracy is  80.92149709498716
Time on model's work: 47.91 s
BaggingClassifier
              precision    recall  f1-score   support

         0.0       0.92      0.95      0.94      3781
   

97:	learn: 0.3020851	total: 33.1s	remaining: 5m 4s
98:	learn: 0.3014498	total: 33.4s	remaining: 5m 3s
99:	learn: 0.3005859	total: 33.7s	remaining: 5m 3s
100:	learn: 0.2992419	total: 34s	remaining: 5m 2s
101:	learn: 0.2989841	total: 34.3s	remaining: 5m 1s
102:	learn: 0.2988311	total: 34.5s	remaining: 5m
103:	learn: 0.2983429	total: 34.8s	remaining: 4m 59s
104:	learn: 0.2981288	total: 35.1s	remaining: 4m 58s
105:	learn: 0.2976077	total: 35.4s	remaining: 4m 58s
106:	learn: 0.2970635	total: 35.7s	remaining: 4m 57s
107:	learn: 0.2963908	total: 35.9s	remaining: 4m 56s
108:	learn: 0.2948922	total: 36.3s	remaining: 4m 56s
109:	learn: 0.2944404	total: 36.6s	remaining: 4m 55s
110:	learn: 0.2941960	total: 36.8s	remaining: 4m 55s
111:	learn: 0.2934196	total: 37.1s	remaining: 4m 54s
112:	learn: 0.2928920	total: 37.5s	remaining: 4m 53s
113:	learn: 0.2921070	total: 37.7s	remaining: 4m 53s
114:	learn: 0.2916729	total: 38s	remaining: 4m 52s
115:	learn: 0.2914540	total: 38.3s	remaining: 4m 52s
116:	lear

253:	learn: 0.2304796	total: 1m 19s	remaining: 3m 54s
254:	learn: 0.2299709	total: 1m 20s	remaining: 3m 53s
255:	learn: 0.2296216	total: 1m 20s	remaining: 3m 53s
256:	learn: 0.2292042	total: 1m 20s	remaining: 3m 53s
257:	learn: 0.2286464	total: 1m 21s	remaining: 3m 52s
258:	learn: 0.2280058	total: 1m 21s	remaining: 3m 52s
259:	learn: 0.2276208	total: 1m 21s	remaining: 3m 52s
260:	learn: 0.2275388	total: 1m 21s	remaining: 3m 51s
261:	learn: 0.2269536	total: 1m 22s	remaining: 3m 51s
262:	learn: 0.2265725	total: 1m 22s	remaining: 3m 51s
263:	learn: 0.2264630	total: 1m 22s	remaining: 3m 50s
264:	learn: 0.2262428	total: 1m 23s	remaining: 3m 50s
265:	learn: 0.2253850	total: 1m 23s	remaining: 3m 50s
266:	learn: 0.2247655	total: 1m 23s	remaining: 3m 49s
267:	learn: 0.2244315	total: 1m 24s	remaining: 3m 49s
268:	learn: 0.2241323	total: 1m 24s	remaining: 3m 49s
269:	learn: 0.2240538	total: 1m 24s	remaining: 3m 48s
270:	learn: 0.2230588	total: 1m 24s	remaining: 3m 48s
271:	learn: 0.2229371	total:

406:	learn: 0.1906009	total: 2m 4s	remaining: 3m 1s
407:	learn: 0.1905221	total: 2m 4s	remaining: 3m
408:	learn: 0.1904706	total: 2m 4s	remaining: 3m
409:	learn: 0.1904235	total: 2m 5s	remaining: 3m
410:	learn: 0.1903907	total: 2m 5s	remaining: 2m 59s
411:	learn: 0.1901512	total: 2m 5s	remaining: 2m 59s
412:	learn: 0.1900960	total: 2m 5s	remaining: 2m 58s
413:	learn: 0.1900694	total: 2m 6s	remaining: 2m 58s
414:	learn: 0.1900191	total: 2m 6s	remaining: 2m 58s
415:	learn: 0.1899946	total: 2m 6s	remaining: 2m 57s
416:	learn: 0.1899498	total: 2m 6s	remaining: 2m 57s
417:	learn: 0.1898860	total: 2m 7s	remaining: 2m 57s
418:	learn: 0.1898479	total: 2m 7s	remaining: 2m 56s
419:	learn: 0.1896859	total: 2m 7s	remaining: 2m 56s
420:	learn: 0.1894315	total: 2m 7s	remaining: 2m 55s
421:	learn: 0.1894047	total: 2m 8s	remaining: 2m 55s
422:	learn: 0.1893838	total: 2m 8s	remaining: 2m 55s
423:	learn: 0.1893136	total: 2m 8s	remaining: 2m 54s
424:	learn: 0.1892546	total: 2m 8s	remaining: 2m 54s
425:	l

559:	learn: 0.1775201	total: 2m 45s	remaining: 2m 10s
560:	learn: 0.1774864	total: 2m 45s	remaining: 2m 9s
561:	learn: 0.1774565	total: 2m 46s	remaining: 2m 9s
562:	learn: 0.1774261	total: 2m 46s	remaining: 2m 9s
563:	learn: 0.1773424	total: 2m 46s	remaining: 2m 8s
564:	learn: 0.1770985	total: 2m 46s	remaining: 2m 8s
565:	learn: 0.1770137	total: 2m 47s	remaining: 2m 8s
566:	learn: 0.1769145	total: 2m 47s	remaining: 2m 7s
567:	learn: 0.1768945	total: 2m 47s	remaining: 2m 7s
568:	learn: 0.1768741	total: 2m 47s	remaining: 2m 7s
569:	learn: 0.1768409	total: 2m 48s	remaining: 2m 6s
570:	learn: 0.1768199	total: 2m 48s	remaining: 2m 6s
571:	learn: 0.1767891	total: 2m 48s	remaining: 2m 6s
572:	learn: 0.1767766	total: 2m 48s	remaining: 2m 5s
573:	learn: 0.1767512	total: 2m 49s	remaining: 2m 5s
574:	learn: 0.1765594	total: 2m 49s	remaining: 2m 5s
575:	learn: 0.1763091	total: 2m 49s	remaining: 2m 5s
576:	learn: 0.1762964	total: 2m 50s	remaining: 2m 4s
577:	learn: 0.1760920	total: 2m 50s	remaining

713:	learn: 0.1686387	total: 3m 27s	remaining: 1m 23s
714:	learn: 0.1686219	total: 3m 27s	remaining: 1m 22s
715:	learn: 0.1686153	total: 3m 27s	remaining: 1m 22s
716:	learn: 0.1684440	total: 3m 28s	remaining: 1m 22s
717:	learn: 0.1684384	total: 3m 28s	remaining: 1m 21s
718:	learn: 0.1682042	total: 3m 28s	remaining: 1m 21s
719:	learn: 0.1681984	total: 3m 28s	remaining: 1m 21s
720:	learn: 0.1681913	total: 3m 29s	remaining: 1m 20s
721:	learn: 0.1681733	total: 3m 29s	remaining: 1m 20s
722:	learn: 0.1681426	total: 3m 29s	remaining: 1m 20s
723:	learn: 0.1681003	total: 3m 29s	remaining: 1m 20s
724:	learn: 0.1680769	total: 3m 30s	remaining: 1m 19s
725:	learn: 0.1680682	total: 3m 30s	remaining: 1m 19s
726:	learn: 0.1680626	total: 3m 30s	remaining: 1m 19s
727:	learn: 0.1680541	total: 3m 30s	remaining: 1m 18s
728:	learn: 0.1680363	total: 3m 31s	remaining: 1m 18s
729:	learn: 0.1679363	total: 3m 31s	remaining: 1m 18s
730:	learn: 0.1679115	total: 3m 31s	remaining: 1m 17s
731:	learn: 0.1678854	total:

869:	learn: 0.1636642	total: 4m 8s	remaining: 37.1s
870:	learn: 0.1636578	total: 4m 8s	remaining: 36.8s
871:	learn: 0.1636540	total: 4m 8s	remaining: 36.5s
872:	learn: 0.1636495	total: 4m 9s	remaining: 36.2s
873:	learn: 0.1636438	total: 4m 9s	remaining: 35.9s
874:	learn: 0.1636221	total: 4m 9s	remaining: 35.6s
875:	learn: 0.1636195	total: 4m 9s	remaining: 35.4s
876:	learn: 0.1636097	total: 4m 10s	remaining: 35.1s
877:	learn: 0.1636077	total: 4m 10s	remaining: 34.8s
878:	learn: 0.1635692	total: 4m 10s	remaining: 34.5s
879:	learn: 0.1635662	total: 4m 10s	remaining: 34.2s
880:	learn: 0.1634992	total: 4m 11s	remaining: 33.9s
881:	learn: 0.1634632	total: 4m 11s	remaining: 33.6s
882:	learn: 0.1634389	total: 4m 11s	remaining: 33.4s
883:	learn: 0.1634309	total: 4m 11s	remaining: 33.1s
884:	learn: 0.1634199	total: 4m 12s	remaining: 32.8s
885:	learn: 0.1634072	total: 4m 12s	remaining: 32.5s
886:	learn: 0.1634004	total: 4m 12s	remaining: 32.2s
887:	learn: 0.1633964	total: 4m 12s	remaining: 31.9s




LogisticRegression
              precision    recall  f1-score   support

         0.0       0.90      0.70      0.79      3781
         1.0       0.75      0.92      0.82      3620

   micro avg       0.81      0.81      0.81      7401
   macro avg       0.82      0.81      0.81      7401
weighted avg       0.82      0.81      0.81      7401

[[2658 1123]
 [ 298 3322]]
Accuracy is  80.79989190649913
Time on model's work: 0.92 s




SGDClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.62      0.74      3781
         1.0       0.70      0.95      0.81      3620

   micro avg       0.78      0.78      0.78      7401
   macro avg       0.81      0.78      0.77      7401
weighted avg       0.82      0.78      0.77      7401

[[2337 1444]
 [ 189 3431]]
Accuracy is  77.93541413322525
Time on model's work: 0.494 s
TFFMClassifier
              precision    recall  f1-score   support

         0.0       0.91      0.70      0.79      3781
         1.0       0.75      0.92      0.83      3620

   micro avg       0.81      0.81      0.81      7401
   macro avg       0.83      0.81      0.81      7401
weighted avg       0.83      0.81      0.81      7401

[[2662 1119]
 [ 274 3346]]
Accuracy is  81.17821915957303
Time on model's work: 63.489 s
TOTAL TIME SPENT:  1247.359 s


In [14]:
# TFFM sparse - works worse with sparse
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)
# weight - optional / AdamOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:13<00:00,  3.92epoch/s]


accuracy: 0.8197540872855019
[[2698 1083]
 [ 251 3369]]
              precision    recall  f1-score   support

         0.0       0.91      0.71      0.80      3781
         1.0       0.76      0.93      0.83      3620

   micro avg       0.82      0.82      0.82      7401
   macro avg       0.84      0.82      0.82      7401
weighted avg       0.84      0.82      0.82      7401



100%|██████████| 50/50 [00:13<00:00,  3.84epoch/s]


accuracy: 0.8079989190649912
[[2497 1284]
 [ 137 3483]]
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78      3781
         1.0       0.73      0.96      0.83      3620

   micro avg       0.81      0.81      0.81      7401
   macro avg       0.84      0.81      0.80      7401
weighted avg       0.84      0.81      0.80      7401



100%|██████████| 50/50 [00:14<00:00,  3.60epoch/s]


accuracy: 0.8048912309147412
[[2420 1361]
 [  83 3537]]
              precision    recall  f1-score   support

         0.0       0.97      0.64      0.77      3781
         1.0       0.72      0.98      0.83      3620

   micro avg       0.80      0.80      0.80      7401
   macro avg       0.84      0.81      0.80      7401
weighted avg       0.85      0.80      0.80      7401



100%|██████████| 50/50 [00:13<00:00,  3.83epoch/s]


accuracy: 0.7966491014727739
[[2347 1434]
 [  71 3549]]
              precision    recall  f1-score   support

         0.0       0.97      0.62      0.76      3781
         1.0       0.71      0.98      0.83      3620

   micro avg       0.80      0.80      0.80      7401
   macro avg       0.84      0.80      0.79      7401
weighted avg       0.84      0.80      0.79      7401



In [15]:
# weight - optional / FtrlOptimizer
pos_class_weight = list(map(float,range(1, 5)))
for weight in pos_class_weight:
    model = TFFMClassifier(
        order=2,
        pos_class_weight=weight,
        rank=10, 
        optimizer=tf.train.FtrlOptimizer(0.01, l2_regularization_strength=0.001, l2_shrinkage_regularization_strength=0.001), 
        n_epochs=50, 
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        #log_dir='./tmp/logs',
        #verbose=1,
        seed=42
    )
    model.fit(X_train_sparse, y_train, show_progress=True)
    predictions = model.predict(X_test_sparse)
    print('accuracy: {}'.format(accuracy_score(y_test, predictions)))
    # this will close tf.Session and free resources
    print(confusion_matrix(y_test,predictions)) 
    print(classification_report(y_test, predictions))
    model.destroy()

100%|██████████| 50/50 [00:13<00:00,  3.77epoch/s]


accuracy: 0.7419267666531549
[[2305 1476]
 [ 434 3186]]
              precision    recall  f1-score   support

         0.0       0.84      0.61      0.71      3781
         1.0       0.68      0.88      0.77      3620

   micro avg       0.74      0.74      0.74      7401
   macro avg       0.76      0.74      0.74      7401
weighted avg       0.76      0.74      0.74      7401



100%|██████████| 50/50 [00:13<00:00,  3.83epoch/s]


accuracy: 0.7592217267936765
[[2288 1493]
 [ 289 3331]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72      3781
         1.0       0.69      0.92      0.79      3620

   micro avg       0.76      0.76      0.76      7401
   macro avg       0.79      0.76      0.75      7401
weighted avg       0.79      0.76      0.75      7401



100%|██████████| 50/50 [00:13<00:00,  3.81epoch/s]


accuracy: 0.7588163761653831
[[2218 1563]
 [ 222 3398]]
              precision    recall  f1-score   support

         0.0       0.91      0.59      0.71      3781
         1.0       0.68      0.94      0.79      3620

   micro avg       0.76      0.76      0.76      7401
   macro avg       0.80      0.76      0.75      7401
weighted avg       0.80      0.76      0.75      7401



100%|██████████| 50/50 [00:13<00:00,  3.85epoch/s]


accuracy: 0.7471963248209701
[[2071 1710]
 [ 161 3459]]
              precision    recall  f1-score   support

         0.0       0.93      0.55      0.69      3781
         1.0       0.67      0.96      0.79      3620

   micro avg       0.75      0.75      0.75      7401
   macro avg       0.80      0.75      0.74      7401
weighted avg       0.80      0.75      0.74      7401



In [16]:
# KERAS
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=256)
score = model.evaluate(X_test, y_test, batch_size=256)
print(score)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.4414152992390214, 0.7905688421208542]


## RESULTS BEST SAMPLERS AND MODELS
### 1. Sampler 'SMOTEEN'. Models - RandomForest, ExtraTrees, MLP
### 2. Sampler 'Nearmiss(version = 1)'. Models - GradientBoosting, XGB, TFFM