In [3]:
import pandas as pd
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

In [59]:
pd.read_csv("../data/complete_dataset.csv").dropna().astype(int).drop(columns=["customer_id"]).drop_duplicates()

Unnamed: 0,car_id,user_id,used_label,price,exteriorColor,interiorColor,drivetrain,minMPG,maxMPG,fuelType,...,engine,mileage,brand,cluster_id,gender,married,age,graduated,profession,familySize
0,84,0,1,62,11,80,1,17,27,3,...,75,35652,22,25,0,1,79,1,0,1
1,84,0,1,62,11,80,1,17,27,3,...,75,35652,22,25,1,1,49,1,0,3
2,84,0,1,62,11,80,1,17,27,3,...,75,35652,22,25,1,1,76,0,8,1
3,84,0,1,62,11,80,1,17,27,3,...,75,35652,22,25,0,0,36,1,1,2
4,84,0,1,62,11,80,1,17,27,3,...,75,35652,22,25,1,0,30,0,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20580,5,74,1,49,53,9,4,20,27,3,...,60,28137,19,29,1,0,31,1,0,2
20581,5,74,1,49,53,9,4,20,27,3,...,60,28137,19,29,1,1,67,1,4,2
20582,5,74,1,49,53,9,4,20,27,3,...,60,28137,19,29,0,0,26,1,5,3
20583,5,74,1,49,53,9,4,20,27,3,...,60,28137,19,29,1,1,36,1,3,6


In [60]:
data = pd.read_csv("../data/dataset.csv").dropna().astype(int).drop_duplicates()

In [61]:
data

Unnamed: 0,user_id,car_id,interaction
0,0,0,1
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
8170,70,402,0
8171,71,402,0
8172,72,402,0
8173,73,402,0


In [24]:
class Encoder(nn.Module):
    """
    Class for Symmetrical AutoEncoder Network
    :param L: List of int, contains sizes of encoding layers and starts with feature size
    For example: [500, 20, 10] will result in:
      - encoder 2 layers: 500x20 and 20x10. Representation layer (z) will be 10
      - decoder 2 layers: 10x20 and 20x500, output size is 500, reconstructed.
    :param activation_fn: (default 'sigmoid') Type of activation function
    :param drop_prob: (default: 0.0) Dropout probability
    """
    def __init__(self, L, activation_fn='sigmoid', drop_prob=0.0):
        super(Encoder, self).__init__()
        layers = self.create_nn_structure(L)
        self.num_layers = len(L)
        # create controller for activation function
        self.activation_fn_nm = activation_fn
        # create dropout module
        self._drop_prob = drop_prob
        if drop_prob > 0.0:
            self.dropout = nn.Dropout(drop_prob)
        #initialize with empty list to store layers
        self.linears = nn.ModuleList([])
        self.linears.extend([nn.Linear(i[0], i[1]) for i in layers])
        
    def get_activation_fn(self):
        # user selected activation function at layers except for last layer
        if self.activation_fn_nm == 'relu':
            return nn.ReLU()
        elif self.activation_fn_nm == 'lrelu':
            return nn.LeakyReLU()
        elif self.activation_fn_nm == 'sigmoid':
            return nn.Sigmoid()
        else:
            raise ValueError('Activation function type not defined')
    
    def forward(self, x):
        for i,layer in enumerate(self.linears):
            if i <= self.num_layers-1:
                # create instance of activation function
                act_fn = self.get_activation_fn()
                # pass in the input
                x = act_fn(self.linears[i](x))
                if self._drop_prob > 0.0 and i <= int(self.num_layers/2): 
                    # apply dropout only on encode layer by control of i
                    x = self.dropout(x)
        # No activation on the last decoding layer
        x = self.linears[-1](x)
        return x

    def create_nn_structure(self, L):
        max_ind = len(L)-1
        layers = []
        for i,v in enumerate(L):
            if i < max_ind:
                #still have i+1 available, create layer tuple
                layer = [v,L[i+1]]
                layers.append(layer)
        #then inverse the layers for decoder size
        encoder_layers = layers[:]
        for l in encoder_layers[::-1]:
            decoder_layer = l[::-1]
            layers.append(decoder_layer)
        return layers

In [26]:
def create_index_mapping(L):
    '''
    return reindexed dict on user and items
    encoded indices starts from 1
    input: 
    * L: list of str
    outputs:
    * ind_2_item,item_2_ind: tuple of dictionary
    '''
    L = set(L)
    ind_2_item = {}
    
    for i,v in enumerate(L):
        #index start from 1
        ind_2_item[i+1] = v
    #invert the map
    item_2_ind = {v: k for k, v in ind_2_item.items()}
    return ind_2_item,item_2_ind
    
def reindexer(ratings_df,user_col,item_col,rating_col):
    '''
    inputs:
    * ratings_df: pandas df containing ratings/affinity for user-item pairs
    * user_col: actual col name for users
    * item_col: actual col name for items
    * rating_col: actual col name for ratings
    output:
    * ratings_df: reindexed user and item column, pandas df
    '''
    users_list = ratings_df[user_col].tolist()
    item_list = ratings_df[item_col].tolist()
    
    ind_2_user,user_2_ind = create_index_mapping(users_list)
    ind_2_item,item_2_ind = create_index_mapping(item_list)
    
    #rename ratings df
    ratings_df = ratings_df.rename(columns={user_col:'user_col',
                                            item_col:'item_col',
                                            rating_col:'rating_col'})

    #encode df using the 2 mappings
    ratings_df['encoded_users'] = ratings_df['user_col'].apply(lambda x:user_2_ind[x])
    ratings_df['encoded_items'] = ratings_df['item_col'].apply(lambda x:item_2_ind[x])
    
    return ratings_df[['encoded_users','encoded_items','rating_col']]
  
ratings = pd.read_csv('../data/dataset.csv')
ratings.columns = ['user_id','car_id','interaction']
ratings_reindex = reindexer(ratings,'user_id','car_id','interaction')


from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings_reindex,
                               stratify=ratings_reindex['encoded_users'],
                               test_size=0.1,
                               random_state=42)

training_set = np.array(train, dtype = 'int')
test_set = np.array(test, dtype = 'int')

nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

def convert(data):
    new_data = []
    for id_users in range(nb_users+1):
        # each user's watched movies
        # data[:,0], first column, all rows column users
        id_items = data[:,1][data[:,0] == id_users]
        # each user's rating for that item
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        # the positions of these items are filled with ratings, creating the matrix
        ratings[id_items-1] = id_ratings
        new_data.append(list(ratings))
    return new_data
  
training_set = convert(training_set)
test_set = convert(test_set)

training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [122]:
autoencoder_network = Encoder([nb_movies,20,10],'sigmoid',0.1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.RMSprop(autoencoder_network.parameters(), lr = 0.01, weight_decay = 0.5)

nb_epoch = 10
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    # s is the number of users who rated at least 1 movies
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = autoencoder_network(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) #making this anyway not equal to 0, as this will be a denominator
            #mean_corrector is the avg of the error, only considering the movies having ratings (non-zero ratings) for computing mean of error
            loss.backward() # decide the direction the increment of weights
            #this call will just computing all the gradients required
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            optimizer.step() # decide the amount to update the weights
            
    print('epoch: '+str(epoch)+' loss: '+ str(train_loss.item()/s))
    
    
test_loss = 0
s = 0.

res = []
targets = []

# averaged difference between real rating and predicted rating
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0) # should keep the training set
    target = Variable(test_set[id_user]).unsqueeze(0) # to predict the other movies user not seen yet
    
    if torch.sum(target.data > 0) > 0:
        # make predictions
        output = autoencoder_network(input)
        targets.append(target.detach().numpy())
        res.append(output.detach().numpy()) 
        target.require_grad = False
        output[target == 0] = 0 # dont want to measure the loss on the movies didnt get the actual rating from user 
        # force to 0 and difference / loss will be 0 for those entries
        loss = criterion(torch.sigmoid(output), target)
        
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) 
        # only consider the movies that are rated in the test set, to be included in the loss
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss.item()/s))


#making top k recommendation
def make_top_k_recommendations(encoder,evidence,k,filter_seen=True):
    '''
    :param encoder: autoencoder instance
    :param evidence: full set of seen ratings from all users
    :param k: top k items (by output score)
    :param filter_seen: (default True) filter controller to remove seen items from top k list
    '''     
    res = []
    nb_users = evidence.shape[0]
    # to find top scored items for each user
    for id_user in range(nb_users):
        encoder_input = Variable(evidence[id_user]).unsqueeze(0) # should keep the training set 
        encoder_output = encoder(encoder_input)
        
        target = Variable(evidence[id_user]).unsqueeze(0) # mask to find items not seen yet
        if filter_seen:
            encoder_output[target != 0] = 0 # force seen items scores to 0, will never get recommended
        res.append(encoder_output.detach().numpy())
        
    res = [a[0] for a in res]
    final_itemsets = []    
    for each in res:
        full_ratings_predicted = list(each)
        full_ratings_indexed = list(enumerate(full_ratings_predicted))
        final_itemsets.append(sorted(full_ratings_indexed,key=lambda x:x[1],reverse =True)[:k])
        
    return final_itemsets

epoch: 1 loss: 3.655687074403505
epoch: 2 loss: 3.630061793971706
epoch: 3 loss: 3.615192310230152
epoch: 4 loss: 3.6053796716638513
epoch: 5 loss: 3.598804370777027
epoch: 6 loss: 3.5941380681218327
epoch: 7 loss: 3.590579058672931
epoch: 8 loss: 3.5880184689083614
epoch: 9 loss: 3.5857247017525338
epoch: 10 loss: 3.5838796254750847
test loss: 8.813988318810097


In [123]:
targets[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [124]:
input = Variable(training_set[id_user]).unsqueeze(0) # should keep the training set
target = Variable(test_set[id_user]).unsqueeze(0) # to predict the other movies user not seen yet
output = autoencoder_network(input)

In [125]:
torch.sigmoid(output)

tensor([[0.9640, 0.8805, 0.8471, 0.9191, 0.9419, 0.9023, 0.9763, 0.9674, 0.9768,
         0.9396, 0.9284, 0.9862, 0.7074, 0.6876, 0.8019, 0.7829, 0.8626, 0.8418,
         0.6900, 0.6817, 0.8929, 0.7195, 0.7103, 0.6697, 0.6800, 0.9380, 0.7088,
         0.9692, 0.6768, 0.9829, 0.7055, 0.9678, 0.7919, 0.7053, 0.7052, 0.6729,
         0.7098, 0.6822, 0.7097, 0.7137, 0.9812, 0.6880, 0.7898, 0.7884, 0.5014,
         0.7100, 0.7156, 0.9779, 0.6868, 0.6727, 0.7080, 0.4978, 0.8400, 0.9563,
         0.9617, 0.6880, 0.6826, 0.6814, 0.9754, 0.6818, 0.7129, 0.6819, 0.9313,
         0.6748, 0.7865, 0.7073, 0.6814, 0.6825, 0.6831, 0.9525, 0.6720, 0.9767,
         0.9766, 0.9611, 0.7928, 0.6816, 0.6768, 0.7993, 0.6812, 0.6826, 0.9579,
         0.5033, 0.7089, 0.9662, 0.8956, 0.6913, 0.9643, 0.6909, 0.7129, 0.7070,
         0.4983, 0.9664, 0.7050, 0.9653, 0.7055, 0.6831, 0.9449, 0.7973, 0.6797,
         0.6747, 0.6883, 0.7062, 0.9474, 0.5006, 0.9485, 0.6698, 0.9645, 0.7078,
         0.8805]], grad_fn=<

In [127]:
from sklearn import metrics
import plotly.express as px

y_pred_proba = torch.sigmoid(output[0]).detach().numpy()
fpr, tpr, thresholds = metrics.roc_curve(targets[0][0], y_pred_proba)

# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=y_pred_proba, color=targets[0][0], nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()


# Evaluating model performance at various thresholds
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [129]:
from sklearn.metrics import auc
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f}) ::: AutoEncoder',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()