## PyFactorizationMachines
### https://github.com/dstein64/PyFactorizationMachines/blob/master/documentation.md

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
import matplotlib.pyplot as plt
import pyfms
import pyfms.regularizers
from time import time
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from random import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
%matplotlib inline



In [5]:
# preprocessing
data = pd.read_csv('...\data_net.csv', sep = ';')
data.head()

Unnamed: 0,Click,Browsname,Searcher,TypeCon,Country,deviceType,Model,ModelCompany,NewID,Reversed,Type,System,FirstSearcher,Version,Sex,Age,TimeSpent,InternalCode,id_transf
0,0.0,Chrome,Google,wi-fi,ita,SmartPhone,Nokia 2240,Nokia,yes,no,mobile,Android,Yahoo,5.0,male,21,1.02,fergie,25465885
1,0.0,Firefox,Yahoo,cable,fra,SmartPhone,Meizu 4 pro,Meizu,yes,no,TV,Android,Yahoo,4.2,female,34,0.123,krag,65458971
2,0.0,Chrome,Yandex,wi-fi,rus,SmartPhone,Iphone 6,Iphone,yes,yes,mobile,Ios,Yahoo,9.0,male,53,0.14,leslie,21547895
3,0.0,IE,Google,cable,usa,SmartPhone,Iphone X,Iphone,no,no,web,Ios,Yahoo,9.0,male,30,0.01,fergie,32541568
4,0.0,IE,Google,wi-fi,usa,SmartPhone,Galaxy J4,SAMSUNG,no,no,TV,Android,Yahoo,5.5.1,female,15,1.1,frent,12023515


In [6]:
# features & labels
labels = data['Click']
features = data.drop('Click', axis = 1)

In [7]:
# create the dictionary of unique attributes' lists
unique_attr = {}
for attr in features.columns:
    unique_attr[attr] = features[attr].unique().tolist()
unique_attr
# features encoding
encoder = preprocessing.OneHotEncoder(categories=[unique_attr[i] for i in unique_attr], sparse = False, handle_unknown='ignore')
encoder

OneHotEncoder(categorical_features=None,
       categories=[['Chrome', 'Firefox', 'IE'], ['Google', 'Yahoo', 'Yandex'], ['wi-fi', 'cable'], ['ita', 'fra', 'rus', 'usa'], ['SmartPhone'], ['Nokia 2240', 'Meizu 4 pro', 'Iphone 6', 'Iphone X', 'Galaxy J4'], ['Nokia', 'Meizu', 'Iphone', 'SAMSUNG'], ['yes', 'no'], ['no', 'yes'], ['mobile', 'TV', 'web']...01, 1.1], ['fergie', 'krag', 'leslie', 'frent'], [25465885, 65458971, 21547895, 32541568, 12023515]],
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

In [8]:
features['CONCAT'] = features.values.tolist()
features['CONCAT'].head()

0    [Chrome, Google, wi-fi, ita, SmartPhone, Nokia...
1    [Firefox, Yahoo, cable, fra, SmartPhone, Meizu...
2    [Chrome, Yandex, wi-fi, rus, SmartPhone, Iphon...
3    [IE, Google, cable, usa, SmartPhone, Iphone X,...
4    [IE, Google, wi-fi, usa, SmartPhone, Galaxy J4...
Name: CONCAT, dtype: object

In [36]:
t0 = time()
features['ENCODED'] = [encoder.fit_transform([i]).flatten() for i in features['CONCAT']]
print ("time on encoding:", round(time()-t0, 3), "s")

time on encoding: 14.23 s


In [37]:
# create the array with feature vectors
features_list = [list(i) for i in features['ENCODED']]
features_list_array = np.array(features_list)

In [38]:
# create the array with label vector 
labels_list_array = np.array(labels.tolist())

In [39]:
X_train, X_test, y_train, y_test = train_test_split(features_list_array, labels_list_array, random_state=42, test_size=0.3)

In [40]:
# error score function
def error_score(y_true, y_pred):
    return 1.0 - accuracy_score(y_true, y_pred)

In [41]:
print('* Binary Classification Example')
print('* (with sample weighting and sparse data)')

* Binary Classification Example
* (with sample weighting and sparse data)


In [42]:
# only CSR format supported
X_train_sparse = sp.csr_matrix(X_train)
X_test_sparse = sp.csr_matrix(X_test)

In [43]:
# define dimensions
classifier_dims = features_list_array.shape[1]

In [44]:
# classifier initializing
fm_classifier = pyfms.Classifier(classifier_dims, k=2, X_format="csr")

In [45]:
# define number of items within 2 classes and weights
class_count_lookup = dict(zip(*np.unique(y_train, return_counts=True)))
sample_weight = np.array([1.0 / class_count_lookup[_y] for _y in y_train])

In [46]:
# training, prediction, testing model
t0 = time()
fm_classifier.fit(X_train_sparse, y_train)
predictions = fm_classifier.predict(X_test_sparse)
print('Factorization Machine Error: {}'.format(error_score(y_test, predictions)))
print('Accuracy is ', accuracy_score(y_test, predictions)*100)
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test,predictions))
print('Sensivity score is ', sensitivity_score(y_test, predictions))
print('Specificity_score is ', specificity_score(y_test, predictions))
print ("time on encoding:", round(time()-t0, 3), "s")

Factorization Machine Error: 0.07633333333333336
Accuracy is  92.36666666666666
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      5530
         1.0       0.63      0.06      0.11       470

   micro avg       0.92      0.92      0.92      6000
   macro avg       0.78      0.53      0.54      6000
weighted avg       0.90      0.92      0.89      6000

[[5513   17]
 [ 441   29]]
Sensivity score is  0.06170212765957447
Specificity_score is  0.9969258589511754
time on encoding: 1742.691 s
