In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

## Data Manipulation

In [2]:
df = pd.read_csv('Trimmed Jester Data.csv')

# Creating an ID column
ID = [i for i in range(1,4095)]
ID

df['ID'] = ID

In [3]:
df

Unnamed: 0.1,Unnamed: 0,0,6,7,12,14,15,16,17,18,...,149,150,151,152,153,154,155,156,157,ID
0,3,47.0,,,,,,,,-5.41,...,,5.61,-4.51,0.00,0.00,,0.00,5.93,4.19,1
1,4,13.0,,,,,,,,-7.72,...,,,,,0.00,,,,0.00,2
2,5,33.0,,,,,,,,4.39,...,3.19,,0.00,3.41,,-2.32,,0.00,2.93,3
3,6,112.0,-4.45,7.54,-9.65,-7.26,7.83,-8.19,0.00,0.00,...,1.89,0.00,0.00,7.38,3.19,-9.33,-7.26,-9.13,-8.19,4
4,7,34.0,,,,1.71,,,,6.63,...,,2.22,6.08,10.00,,,6.30,4.11,8.25,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,7690,50.0,-2.24,-4.59,-4.51,-9.07,-5.37,,-9.63,-4.84,...,,-2.76,0.45,-2.64,-5.57,-1.59,-3.29,-3.58,0.41,4090
4090,7693,5.0,,,,,,,,-1.22,...,,,,,,,,,,4091
4091,7694,27.0,,,,,,,,0.00,...,,0.00,0.00,0.00,,0.00,0.00,0.00,,4092
4092,7696,26.0,,,,,,,,-7.93,...,8.63,,6.76,0.00,3.50,6.26,,-2.66,0.65,4093


In [4]:
# Renaming columns
df.columns = ['joke{}'.format(i) for i in df.columns]
df = df.rename(columns = {'joke0':'jokes_rated', 'jokeUnnamed: 0':'joke1', 'jokeID':'ID'})

df = df.drop('jokes_rated', axis = 1)
df

Unnamed: 0,joke1,joke6,joke7,joke12,joke14,joke15,joke16,joke17,joke18,joke20,...,joke149,joke150,joke151,joke152,joke153,joke154,joke155,joke156,joke157,ID
0,3,,,,,,,,-5.41,-4.59,...,,5.61,-4.51,0.00,0.00,,0.00,5.93,4.19,1
1,4,,,,,,,,-7.72,,...,,,,,0.00,,,,0.00,2
2,5,,,,,,,,4.39,,...,3.19,,0.00,3.41,,-2.32,,0.00,2.93,3
3,6,-4.45,7.54,-9.65,-7.26,7.83,-8.19,0.00,0.00,9.90,...,1.89,0.00,0.00,7.38,3.19,-9.33,-7.26,-9.13,-8.19,4
4,7,,,,1.71,,,,6.63,5.61,...,,2.22,6.08,10.00,,,6.30,4.11,8.25,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,7690,-2.24,-4.59,-4.51,-9.07,-5.37,,-9.63,-4.84,,...,,-2.76,0.45,-2.64,-5.57,-1.59,-3.29,-3.58,0.41,4090
4090,7693,,,,,,,,-1.22,,...,,,,,,,,,,4091
4091,7694,,,,,,,,0.00,0.00,...,,0.00,0.00,0.00,,0.00,0.00,0.00,,4092
4092,7696,,,,,,,,-7.93,,...,8.63,,6.76,0.00,3.50,6.26,,-2.66,0.65,4093


In [5]:
# Transforming the data from wide to long
df_long = pd.melt(df, id_vars = ['ID'], var_name = 'Joke', value_name = 'Rating')
df_long

Unnamed: 0,ID,Joke,Rating
0,1,joke1,3.00
1,2,joke1,4.00
2,3,joke1,5.00
3,4,joke1,6.00
4,5,joke1,7.00
...,...,...,...
560873,4090,joke157,0.41
560874,4091,joke157,
560875,4092,joke157,
560876,4093,joke157,0.65


In [111]:
# Use this if you need to re-append counts

# rating_count = (df_long.
#      groupby(by = ['ID'])['Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Rating': 'NumRatings'})
#      [['ID', 'NumRatings']]
#     )

# rating_count

Unnamed: 0,ID,NumRatings
0,1,48
1,2,14
2,3,34
3,4,111
4,5,35
...,...,...
4089,4090,51
4090,4091,6
4091,4092,28
4092,4093,27


## Building a collaborative filtering model

In [6]:
# Normalizing the ratings
scaler = MinMaxScaler()
df_long['Rating'] = df_long['Rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(df_long['Rating'].values.reshape(-1,1)))
df_long['Rating'] = rating_scaled

In [7]:
# Creating a user, joke matrix
df_long = df_long.drop_duplicates(['ID', 'Joke'])
user_joke_matrix = df_long.pivot(index = 'ID', columns = 'Joke', values = 'Rating')
user_joke_matrix.fillna(0, inplace=True)
users = user_joke_matrix.index.tolist()
jokes = user_joke_matrix.columns.tolist()
user_joke_matrix = user_joke_matrix.values

In [8]:
# Importing a previous version of tensorflow so we can use the placeholder function
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [9]:
# Setting up neural network parameters (e.g. dimensions of each hidden layer)
num_input = df_long['Joke'].nunique()
num_hidden_1 = 10
num_hidden_2 = 5

# Initializing tensorflow placeholder
X = tf.placeholder(tf.float64, [None, num_input])

# Randomly initializing weights and biases
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype = tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype = tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype = tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype = tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype = tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype = tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype = tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype = tf.float64)),
}

In [10]:
# Building encoder and decoder models
def encoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

def decoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

In [11]:
# Generating predictions
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
y_pred = decoder_op
y_true = X

In [12]:
# Loss function
loss = tf.losses.mean_squared_error(y_true, y_pred)
# Optimizing to minimize squared error
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
# Defining evaluation metrics
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [13]:
# Initializing variables and creating a df that will store the top 10 joke recommendations for each user
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()
pred_data = pd.DataFrame()

In [14]:
with tf.Session() as session:
    # Splitting training data into batches of size 35 and feeding the neural network 100 times
    epochs = 100
    batch_size = 35

    session.run(init)
    session.run(local_init)

    num_batches = int(user_joke_matrix.shape[0] / batch_size)
    user_joke_matrix = np.array_split(user_joke_matrix, num_batches)
    
    for i in range(epochs):

        avg_cost = 0
        for batch in user_joke_matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    user_joke_matrix = np.concatenate(user_joke_matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: user_joke_matrix})

    pred_data = pred_data.append(pd.DataFrame(preds))

    pred_data = pred_data.stack().reset_index(name='Rating')
    pred_data.columns = ['ID', 'Joke', 'Rating']
    pred_data['ID'] = pred_data['ID'].map(lambda value: users[value])
    pred_data['Joke'] = pred_data['Joke'].map(lambda value: jokes[value])
    
    keys = ['ID', 'Joke']
    index_1 = pred_data.set_index(keys).index
    index_2 = df_long.set_index(keys).index

    top_ten_ranked = pred_data[~index_1.isin(index_2)]
    top_ten_ranked = top_ten_ranked.sort_values(['ID', 'Rating'], ascending=[True, False])
    top_ten_ranked = top_ten_ranked.groupby('ID').head(10)

epoch: 1 Loss: 0.29818954447220114
epoch: 2 Loss: 0.029385855204336935
epoch: 3 Loss: 0.0002533467038966623
epoch: 4 Loss: 0.00023538376680970282
epoch: 5 Loss: 0.00021954271116932244
epoch: 6 Loss: 0.00020105500737074557
epoch: 7 Loss: 0.00018357266486744046
epoch: 8 Loss: 0.00016781929076712694
epoch: 9 Loss: 0.0001546021127923864
epoch: 10 Loss: 0.00014381267667745148
epoch: 11 Loss: 0.00013560431369404765
epoch: 12 Loss: 0.00013080567539506757
epoch: 13 Loss: 0.00012874784913502276
epoch: 14 Loss: 0.00012741821665562431
epoch: 15 Loss: 0.00012559310085019456
epoch: 16 Loss: 0.0001242569182110252
epoch: 17 Loss: 0.00012495620821574715
epoch: 18 Loss: 0.000127575152490067
epoch: 19 Loss: 0.0001308194076649482
epoch: 20 Loss: 0.00013216165074950446
epoch: 21 Loss: 0.00012997841403030805
epoch: 22 Loss: 0.00012502704249010125
epoch: 23 Loss: 0.00011875459000962031
epoch: 24 Loss: 0.0001123252940917139
epoch: 25 Loss: 0.0001062254157322341
epoch: 26 Loss: 0.00010043625465429833
epoch: 2

In [18]:
top_ten_ranked.loc[top_ten_ranked['ID'] == 3000]

Unnamed: 0,ID,Joke,Rating


In [26]:
type(users[1])

int