In [1]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

from torch import nn, optim
import torch.nn.functional as F

In [1]:
!pip install scikit-multilearn --user
!pip install xgboost --user


Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K    100% |████████████████████████████████| 92kB 4.1MB/s ta 0:00:011
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/c1/24/5fe7237b2eca13ee0cfb100bec8c23f4e69ce9df852a64b0493d49dae4e0/xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl (142.8MB)
[K    100% |████████████████████████████████| 142.8MB 246kB/s eta 0:00:01 6% |██                              | 8.8MB 34.0MB/s eta 0:00:04    8% |██▋                             | 11.6MB 33.8MB/s eta 0:00:04    9% |███                             | 13.0MB 29.7MB/s eta 0:00:05    11% |███▌                            | 15.8MB 30.8MB/s eta 0:00:05    12% |███▉                            | 17.3MB 25.3MB/s eta 0:00:05 

## 1)

In [2]:
# read dataset

x_dataset = pd.read_pickle("./x_dataset.pkl")
y_dataset = pd.read_pickle("./y_dataset.pkl")


In [3]:
# del unused columns

del x_dataset["id"]
del y_dataset["user_id"]

In [4]:
y_dataset = y_dataset.astype('float64')

In [19]:
y_dataset.dtypes

index                               float64
ae264e3637204a6fb9bb56bc8210ddfd    float64
4d5c57ea9a6940dd891ad53e9dbe8da0    float64
3f207df678b143eea3cee63160fa8bed    float64
9b98b8c7a33c4b65b9aebfe6a799e6d9    float64
0b1e1539f2cc45b7b9fa7c272da2e1d7    float64
2298d6c36e964ae4a3e7e9706d1fb8c2    float64
fafdcd668e3743c1bb461111dcafc2a4    float64
5a8bc65990b245e5a138643cd4eb9837    float64
f19421c1d4aa40978ebb69ca19b0e20d    float64
2906b810c7d4411798c6938adc9daaa5    float64
dtype: object

In [20]:
x_dataset.head()

Unnamed: 0,age,income,user_day,gender_F,gender_M,gender_O
1,0.445783,0.911111,0.206253,1,0,0
3,0.686747,0.777778,0.243006,1,0,0
5,0.60241,0.444444,0.049918,0,1,0
8,0.566265,0.255556,0.091607,0,1,0
12,0.481928,0.233333,0.140976,0,1,0


In [21]:
xTrain, xTest, yTrain, yTest = train_test_split(x_dataset, y_dataset, test_size = 0.2, random_state = 0)

In [22]:
# if GPUs are available they are used

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [23]:
# Clasifier with pytorch with n input features, 2 hidden layers and an output features

class Classifier(nn.Module):
    def __init__(self, input_features, hidden_dim_1, hidden_dim_2, output_dim):
        """
        Initialize the model by setting up linear layers.
        Use the input parameters to help define the layers of your model.
        :param input_features: the number of input features in your training/test data
        :param hidden_dim: helps define the number of nodes in the hidden layer(s)
        :param output_dim: the number of outputs you want to produce
        """
        super().__init__()
        
        #Fully conected
        self.fc1 = nn.Linear(input_features, hidden_dim_1)
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.fc3 = nn.Linear(hidden_dim_2, output_dim)
       
         # Droput 
        self.drop = nn.Dropout(0.3)
        
        # Softmax Layer
        self.sig = nn.Sigmoid()
        

# The feed forward is with relu activation on layers, 0.3 of droput and a sigmoid activation

    def forward(self, x):
        """
        Perform a forward pass of our model on input features, x.
        :param x: A batch of input features of size (batch_size, input_features)
        :return: A single, sigmoid-activated value as output
        """
        
        # feedforward behavior
        
        out = F.relu(self.fc1(x))
        out = self.drop(out)
        out = F.relu(self.fc2(out))
        out = self.drop(out)
        out = self.fc3(out)
        
        return self.sig(out)
        

In [24]:
from torchsummary import summary


ModuleNotFoundError: No module named 'torchsummary'

In [25]:
# This model has 6 inputs 2 hidden layers and 10 oputputs as there is 10 different offers

model = Classifier(xTrain.shape[1], 16, 32, yTrain.shape[1]).to(device)


In [26]:
xTrain.shape[1], 16, 32, yTrain.shape[1]

(6, 16, 32, 11)

In [27]:

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [28]:
%%time

epochs = 5
model = model.train()
model = model.double()
for e in range(epochs):
    running_loss = 0
    for x, y in zip(xTrain.values, yTrain.values):
        x = torch.from_numpy(x).to(device)
        y = torch.from_numpy(y).to(device)
        log_ps = model(x)

        loss = criterion(log_ps, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(xTrain)}")

Training loss: 0.30337950904152905
Training loss: 0.29797648211548655
Training loss: 0.297147302403637
Training loss: 0.2966598218896223
Training loss: 0.2969094497153587
CPU times: user 35.9 s, sys: 3.28 s, total: 39.2 s
Wall time: 39.6 s


In [29]:
model.eval()
running_loss = 0
for x, y in zip(xTest.values, yTest.values):
    x = torch.from_numpy(x).to(device)
    y = torch.from_numpy(y).to(device)
    log_ps = model(x)

    loss = criterion(log_ps, y)

    optimizer.zero_grad()
    running_loss += loss.item()
else:
    print(f"Test loss: {running_loss/len(xTest)}")

Test loss: 0.29830405286008266


## 1.1)

In [5]:

y_dataset = pd.read_pickle("./y_dataset1.pkl")

In [6]:
del y_dataset["user_id"]

In [7]:
y_dataset = y_dataset.astype('float64')

In [8]:
x_dataset.head()

Unnamed: 0,age,income,user_day,gender_F,gender_M,gender_O
1,0.445783,0.911111,0.206253,1,0,0
3,0.686747,0.777778,0.243006,1,0,0
5,0.60241,0.444444,0.049918,0,1,0
8,0.566265,0.255556,0.091607,0,1,0
12,0.481928,0.233333,0.140976,0,1,0


In [9]:
xTrain, xTest, yTrain, yTest = train_test_split(x_dataset, y_dataset, test_size = 0.2, random_state = 0)

Different aprroaches using a combination of gaussian naive bayes and different clasifiers 

In [10]:

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier

classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(xTrain, yTrain)

# predict
predictions = classifier.predict(xTest)

In [11]:
from sklearn.metrics import accuracy_score
print(classification_report(yTest, predictions))

accuracy_score(yTest,predictions)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.17      0.02      0.03       401
          2       0.00      0.00      0.00       430
          3       0.00      0.00      0.00         0
          4       0.24      0.03      0.06       313
          5       0.14      0.03      0.04       236
          6       0.17      0.01      0.02       629
          7       0.00      0.00      0.00       676
          8       0.00      0.00      0.00         0
          9       0.21      0.02      0.03       536
         10       0.07      0.01      0.01       364

avg / total       0.12      0.01      0.02      3585



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.31062394603709947

In [12]:
# using classifier chains

# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(xTrain.values, yTrain.values)

# predict
predictions = classifier.predict(xTest)
print(classification_report(yTest, predictions))

accuracy_score(yTest,predictions)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.17      0.02      0.03       401
          2       0.38      0.02      0.04       430
          3       0.00      0.00      0.00         0
          4       0.24      0.03      0.06       313
          5       0.14      0.03      0.04       236
          6       0.17      0.01      0.02       629
          7       0.00      0.00      0.00       676
          8       0.00      0.00      0.00         0
          9       0.21      0.02      0.03       536
         10       0.07      0.01      0.01       364

avg / total       0.16      0.01      0.03      3585



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.31062394603709947

In [13]:
# using Label Powerset

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(xTrain.values, yTrain.values)

# predict
predictions = classifier.predict(xTest)
print(classification_report(yTest, predictions))

accuracy_score(yTest.values,predictions)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.13      0.58      0.22       401
          2       0.14      0.61      0.23       430
          3       0.00      0.00      0.00         0
          4       0.11      0.29      0.16       313
          5       0.09      0.29      0.14       236
          6       0.20      0.57      0.30       629
          7       0.25      0.53      0.34       676
          8       0.00      0.00      0.00         0
          9       0.18      0.39      0.25       536
         10       0.12      0.68      0.20       364

avg / total       0.17      0.51      0.25      3585



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.00033726812816188871

In [14]:
%%time

classifier = MLkNN(k=20)

# train
classifier.fit(xTrain.values, yTrain.values)

# predict
predictions = classifier.predict(xTest)
print(classification_report(yTest, predictions))

print(accuracy_score(yTest,predictions))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00       401
          2       0.00      0.00      0.00       430
          3       0.00      0.00      0.00         0
          4       0.33      0.00      0.01       313
          5       0.00      0.00      0.00       236
          6       0.40      0.00      0.01       629
          7       0.11      0.00      0.01       676
          8       0.00      0.00      0.00         0
          9       0.12      0.00      0.01       536
         10       0.00      0.00      0.00       364

avg / total       0.14      0.00      0.00      3585

0.31197301855
CPU times: user 30.2 s, sys: 17.5 ms, total: 30.2 s
Wall time: 30.5 s


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## 2) 

In [15]:
x_dataset2 = pd.read_pickle("./x_dataset.pkl")
y_dataset2 = pd.read_pickle("./y_dataset2.pkl")


In [16]:
del x_dataset2["id"]
del y_dataset2["user_id"]

In [17]:
y_dataset2 = y_dataset2.astype('category')
y_dataset2 = y_dataset2.astype('float32')

In [18]:
xTrain2, xTest2, yTrain2, yTest2 = train_test_split(x_dataset2, y_dataset2, test_size = 0.2, random_state = 0)

This model is similar to the one before but in this case there is only 1 class on the output

In [42]:
(xTrain2.shape[1], 16, 32, yTrain2.shape[1])

(6, 16, 32, 1)

In [43]:
model2 = Classifier(xTrain2.shape[1], 16, 32, yTrain2.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [70]:
%%time

epochs = 15
model2 = model2.double()
for e in range(epochs):
    running_loss = 0
    for x, y in zip(xTrain2.values, yTrain2.values):
        x = torch.from_numpy(x).to(device)
        y = torch.from_numpy(y).to(device)
        y = y.double()
        log_ps = model2(x)

        loss = criterion(log_ps, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(xTrain2)}")

Training loss: 0.46055288286122276
Training loss: 0.4597547679409771
Training loss: 0.46333511777290276
Training loss: 0.46155119092803887
Training loss: 0.46172660176029084
Training loss: 0.45398309065244313
Training loss: 0.45834524047632014
Training loss: 0.4638425834917475
Training loss: 0.459261414452881
Training loss: 0.4643605820263096
Training loss: 0.46147775661121954
Training loss: 0.4558530888989802
Training loss: 0.4630111590192187
Training loss: 0.45953423477297345
Training loss: 0.46160491654638014
CPU times: user 1min 47s, sys: 8.77 s, total: 1min 56s
Wall time: 1min 57s


Trying an xgboos clasification to clasify only 1 label (the top rewarded)

In [71]:
model.eval()
running_loss = 0
for x, y in zip(xTest2.values, yTest2.values):
    x = torch.from_numpy(x).to(device)
    y = torch.from_numpy(y).to(device)
    y = y.double()

    log_ps = model2(x)

    loss = criterion(log_ps, y)

    optimizer.zero_grad()
    running_loss += loss.item()
else:
    print(f"Test loss: {running_loss/len(xTest2)}")

Test loss: 0.45583873448677675


In [33]:

dtrain = xgb.DMatrix(data=xTrain2, label=yTrain2.values)
dtest = xgb.DMatrix(data=xTest2)

In [34]:
params = {
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 10,
    'n_gpus': 0
}

In [35]:
bst = xgb.train(params, dtrain)
pred = bst.predict(dtest)

from sklearn.metrics import classification_report

print(classification_report(yTest2, pred))

             precision    recall  f1-score   support

          0       0.41      0.97      0.58      1209
          1       0.20      0.02      0.04       423
          3       0.17      0.01      0.01       198
          4       0.00      0.00      0.00       109
          5       0.12      0.01      0.02       313
          6       0.26      0.02      0.03       275
          8       0.38      0.01      0.02       349
          9       0.00      0.00      0.00        89

avg / total       0.29      0.40      0.25      2965



  'precision', 'predicted', average, warn_for)


## 4)

In [25]:
x_dataset4 = pd.read_pickle("./y_dataset4.pkl")
x_dataset4 = x_dataset4.reset_index()
y_dataset4 = x_dataset4["class"]

del x_dataset4["user_id"]
del x_dataset4["class"]

In [26]:
x_dataset4 = x_dataset4.astype('float64')
y_dataset4 = y_dataset4.astype('float64')

In [27]:
x_dataset4.head()

Unnamed: 0,index,age,income,user_day,ae264e3637204a6fb9bb56bc8210ddfd,4d5c57ea9a6940dd891ad53e9dbe8da0,3f207df678b143eea3cee63160fa8bed,9b98b8c7a33c4b65b9aebfe6a799e6d9,0b1e1539f2cc45b7b9fa7c272da2e1d7,2298d6c36e964ae4a3e7e9706d1fb8c2,fafdcd668e3743c1bb461111dcafc2a4,5a8bc65990b245e5a138643cd4eb9837,f19421c1d4aa40978ebb69ca19b0e20d,2906b810c7d4411798c6938adc9daaa5,gender_F,gender_M,gender_O
0,0.0,0.445783,0.911111,0.206253,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.445783,0.911111,0.206253,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.445783,0.911111,0.206253,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.445783,0.911111,0.206253,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.445783,0.911111,0.206253,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [29]:
xTrain4, xTest4, yTrain4, yTest4 = train_test_split(x_dataset4, y_dataset4, test_size = 0.2, random_state = 0)

In [30]:
model = xgb.XGBClassifier()
train_model = model.fit(xTrain4, yTrain4)


In [32]:
pred = train_model.predict(xTest4)


In [34]:
print(classification_report(yTest4, pred))


             precision    recall  f1-score   support

        0.0       0.88      1.00      0.94     26068
        1.0       0.00      0.00      0.00      3582

avg / total       0.77      0.88      0.82     29650



  'precision', 'predicted', average, warn_for)
