<h2>Content-Based Filtering Algorithm</h2>

In [64]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn.preprocessing
import sklearn.model_selection
import matplotlib.pyplot as plt
import csv
from collections import defaultdict

<h4>Content-Based Filtering Algorithm with TensorFlow</h4>

In [55]:
def content_based_filtering(output_units, num_user_features=14, num_item_features=17):
    """Build the two-tower content-based filtering model and print its summary.

    A user network and an item network each map their feature vector to an
    ``output_units``-dimensional vector. Both vectors are L2-normalised and the
    model output is their dot product.

    Parameters
    ----------
    output_units : int
        Size of the vector produced by each of the two networks.
    num_user_features : int, default 14
        Width of the user input layer.
    num_item_features : int, default 17
        Width of the item input layer.

    Returns
    -------
    The return value of ``model.summary()``. Keras prints the summary table as
    a side effect; the call itself is documented to return ``None``.
    """
    tf.random.set_seed(42)

    user_neural_network = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=256, activation="relu"),
        tf.keras.layers.Dense(units=124, activation="relu"),
        tf.keras.layers.Dense(units=output_units, activation="linear"),
    ])

    item_neural_network = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=10, activation="relu"),
        tf.keras.layers.Dense(units=10, activation="relu"),
        tf.keras.layers.Dense(units=output_units, activation="linear"),
    ])

    # `shape` must be a tuple: `(n)` is just the int `n`, `(n,)` is a 1-tuple.
    input_user = tf.keras.layers.Input(shape=(num_user_features,))
    vu = user_neural_network(input_user)
    vu = tf.linalg.l2_normalize(vu, axis=1)

    input_item = tf.keras.layers.Input(shape=(num_item_features,))
    vm = item_neural_network(input_item)
    vm = tf.linalg.l2_normalize(vm, axis=1)

    # Dot product of two unit-length vectors, one scalar per row.
    output = tf.keras.layers.Dot(axes=1)([vu, vm])
    model = tf.keras.Model([input_user, input_item], output)

    return model.summary()

<h4>Analyze Data</h4>

In [6]:
# Read the two pre-computed summary tables in one pass; the order of the paths
# matches the order of the names being assigned.
top_10_data, by_genre_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/sam/projects/machine-learning/data/content_based/content_top10_df.csv",
        "/home/sam/projects/machine-learning/data/content_based/content_bygenre_df.csv",
    )
)
# Last expression of the cell: rendered as a rich table.
top_10_data

Unnamed: 0,movie id,num ratings,ave rating,title,genres
0,4993,198,4.106061,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,188,4.021277,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy
2,7153,185,4.118919,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy
3,4306,170,3.867647,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,58559,149,4.238255,"Dark Knight, The",Action|Crime|Drama
5,6539,149,3.778523,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
6,79132,143,4.066434,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller
7,6377,141,3.960993,Finding Nemo,Adventure|Animation|Children|Comedy
8,4886,132,3.871212,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
9,7361,131,4.160305,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi


In [7]:
# Display the per-genre table read from content_bygenre_df.csv in the previous cell.
by_genre_data

Unnamed: 0,genre,num movies,ave rating/genre,ratings per genre
0,Action,321,3.37,10377
1,Adventure,234,3.42,8785
2,Animation,76,3.63,2588
3,Children,69,3.44,2472
4,Comedy,326,3.36,8911
5,Crime,139,3.54,4671
6,Documentary,13,3.81,280
7,Drama,342,3.61,10201
8,Fantasy,124,3.37,4468
9,Horror,56,3.2,1345


In [16]:
def load_data(data_dir="/home/sam/projects/machine-learning/data/content_based"):
    """Load the content-based filtering data set from ``data_dir``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the ``content_*.csv`` / ``content_*_header.txt``
        files. Defaults to the location originally hard-coded in this notebook,
        so ``load_data()`` reads exactly the same files as before.

    Returns
    -------
    tuple
        ``(item_train, user_train, y_train, item_features, user_features,
        item_vecs, movie_dict)`` where

        * ``item_train``, ``user_train``, ``y_train``, ``item_vecs`` are NumPy
          arrays parsed from the comma-separated files,
        * ``item_features`` and ``user_features`` are lists of strings taken
          from the first row of the corresponding header file,
        * ``movie_dict`` is a ``defaultdict(dict)`` keyed by integer movie id
          with ``"title"`` and ``"genres"`` entries.
    """
    base = str(data_dir)
    if base and not base.endswith("/"):
        base += "/"

    def _path(filename):
        # Build the full path of one data file inside ``data_dir``.
        return base + filename

    item_train = np.genfromtxt(_path('content_item_train.csv'), delimiter=',')
    user_train = np.genfromtxt(_path('content_user_train.csv'), delimiter=',')
    y_train = np.genfromtxt(_path('content_y_train.csv'), delimiter=',')

    # Each header file holds the column names on its first CSV row.
    with open(_path('content_item_train_header.txt'), newline='') as f:
        item_features = next(csv.reader(f))

    with open(_path('content_user_train_header.txt'), newline='') as f:
        user_features = next(csv.reader(f))

    item_vecs = np.genfromtxt(_path('content_item_vecs.csv'), delimiter=',')

    movie_dict = defaultdict(dict)
    with open(_path('content_movie_list.csv'), newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        for line in reader:
            if not line:
                # csv.reader yields [] for blank lines; nothing to record.
                continue
            movie_id = int(line[0])
            movie_dict[movie_id]["title"] = line[1]
            movie_dict[movie_id]["genres"] = line[2]

    return (item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict)


In [26]:
# Load everything, then print one size per line (arrays: shape; lists/dicts: length).
(
    item_train,
    user_train,
    y_train,
    item_features,
    user_features,
    item_vecs,
    movie_dict,
) = load_data()
print(
    item_train.shape,
    user_train.shape,
    y_train.shape,
    len(item_features),
    len(user_features),
    item_vecs.shape,
    len(movie_dict),
    sep="\n",
)

(50884, 17)
(50884, 17)
(50884,)
17
17
(847, 17)
847


In [19]:
# Inspect the item feature matrix as returned by load_data().
item_train

array([[6.87400000e+03, 2.00300000e+03, 3.96183206e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [8.79800000e+03, 2.00400000e+03, 3.76136364e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [4.69700000e+04, 2.00600000e+03, 3.25000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.68250000e+05, 2.01700000e+03, 3.63333333e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.68250000e+05, 2.01700000e+03, 3.63333333e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.68252000e+05, 2.01700000e+03, 4.28000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

<h4>Feature Scaling with scikit-learn</h4>

In [33]:
# Standardise the item columns and keep the fitted scaler for later reuse.
# NOTE(review): the scaler is fitted before the train/test split further down, so
# held-out rows contribute to its statistics — consider fitting on the training rows only.
scaler_item = sklearn.preprocessing.StandardScaler()
item_train = scaler_item.fit_transform(item_train)

In [35]:
# Standardise the user columns and keep the fitted scaler for later reuse.
# NOTE(review): fitted before the train/test split further down — consider fitting
# on the training rows only.
scaler_user = sklearn.preprocessing.StandardScaler()
user_train = scaler_user.fit_transform(user_train)

In [37]:
# Rescale the targets with a default MinMaxScaler. The scaler needs a 2-D input,
# so y_train becomes a single-column array.
# NOTE(review): fitted before the train/test split further down — consider fitting
# on the training rows only.
scaler_target = sklearn.preprocessing.MinMaxScaler()
y_train = scaler_target.fit_transform(y_train.reshape(-1, 1))

In [40]:
# Shapes after scaling, one per line.
print(item_train.shape, user_train.shape, y_train.shape, sep="\n")

(50884, 17)
(50884, 17)
(50884, 1)


<h4>Split data into train and test sets</h4>

In [42]:
# Split all three arrays in ONE call so the same shuffled row permutation is applied
# to items, users and targets by construction. Previously this relied on three
# separate calls sharing a random_state. A single call also rejects arrays whose
# lengths differ.
(
    item_train, item_test,
    user_train, user_test,
    y_train, y_test,
) = sklearn.model_selection.train_test_split(
    item_train, user_train, y_train,
    train_size=0.8, shuffle=True, random_state=1,
)

In [44]:
# Train/test shapes for items, users and targets, one per line.
print(
    item_train.shape, item_test.shape,
    user_train.shape, user_test.shape,
    y_train.shape, y_test.shape,
    sep="\n",
)

(40707, 17)
(10177, 17)
(40707, 17)
(10177, 17)
(40707, 1)
(10177, 1)


In [56]:
# Build the model with 32-dimensional output vectors. The table shown below is
# printed by model.summary() inside content_based_filtering.
summary = content_based_filtering(output_units=32)
# NOTE(review): content_based_filtering returns model.summary(), which Keras documents
# as returning None, so `summary` presumably holds None and this line shows nothing — confirm.
summary

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 14)]                 0         []                            
                                                                                                  
 input_5 (InputLayer)        [(None, 17)]                 0         []                            
                                                                                                  
 sequential_5 (Sequential)   (None, 32)                   39708     ['input_4[0][0]']             
                                                                                                  
 sequential_6 (Sequential)   (None, 32)                   642       ['input_5[0][0]']             
                                                                                            

In [68]:
def content_based_filtering(user_train, item_train, y_train, user_test,
                            item_test, y_test, output_units, epochs=100, learning_rate=1e-2):
    """Build, train and evaluate the two-tower content-based filtering model.

    A user network and an item network each map their feature vector to an
    ``output_units``-dimensional vector. Both vectors are L2-normalised and the
    model output is their dot product, fitted to ``y_train`` with mean squared
    error and the Adam optimizer.

    Parameters
    ----------
    user_train, user_test : 2-D arrays
        User rows; the first 3 columns are excluded from the model input.
    item_train, item_test : 2-D arrays
        Item rows; the first column is excluded from the model input.
    y_train, y_test : arrays
        Targets aligned row-by-row with the user/item arrays.
    output_units : int
        Size of the vector produced by each of the two networks.
    epochs : int, default 100
        Number of training epochs.
    learning_rate : float, default 1e-2
        Learning rate passed to Adam.

    Returns
    -------
    tf.keras.Model
        The trained model. Earlier versions returned nothing, which discarded
        the model after training; callers that ignore the return value are
        unaffected.
    """
    tf.random.set_seed(42)

    # First column index fed to each network; the leading columns are skipped.
    # NOTE(review): presumably identifier/aggregate columns — confirm against the header files.
    u_s = 3  # start of columns to use in training, user
    i_s = 1  # start of columns to use in training, items
    num_user_features = user_train.shape[1] - u_s
    num_item_features = item_train.shape[1] - i_s

    user_neural_network = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=256, activation="relu"),
        tf.keras.layers.Dense(units=124, activation="relu"),
        tf.keras.layers.Dense(units=output_units, activation="linear"),
    ])

    item_neural_network = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=10, activation="relu"),
        tf.keras.layers.Dense(units=10, activation="relu"),
        tf.keras.layers.Dense(units=output_units, activation="linear"),
    ])

    # `shape` must be a tuple: `(n)` is just the int `n`, `(n,)` is a 1-tuple.
    input_user = tf.keras.layers.Input(shape=(num_user_features,))
    vu = user_neural_network(input_user)
    vu = tf.linalg.l2_normalize(vu, axis=1)

    input_item = tf.keras.layers.Input(shape=(num_item_features,))
    vm = item_neural_network(input_item)
    vm = tf.linalg.l2_normalize(vm, axis=1)

    # Dot product of two unit-length vectors, one scalar per row.
    output = tf.keras.layers.Dot(axes=1)([vu, vm])
    model = tf.keras.Model([input_user, input_item], output)

    cost_function = tf.keras.losses.MeanSquaredError()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=cost_function)

    # One flat list with one array per model input, in the same form used for evaluate().
    model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=epochs)
    model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)

    return model

<h4>Train and Evaluate the Model</h4>

In [69]:
# Train for 30 epochs with 32-dimensional user/item vectors, then evaluate on the test split.
content_based_filtering(
    user_train=user_train, item_train=item_train, y_train=y_train,
    user_test=user_test, item_test=item_test, y_test=y_test,
    output_units=32, epochs=30, learning_rate=1e-2,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
