In [151]:
import sys
import os
import math
import pandas as pd
import numpy as np
import json

# Python visualisation library
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Another much simpler viz library
import seaborn as sns

# Adding root of git repo to path so we can import pyfireanalytics
sys.path.append('..')


# Load Data

In [152]:
data = pd.read_csv('~/desktop/movielens/UDATA.csv', encoding="latin-1")
item = pd.read_csv('~/desktop/movielens/UITEM.csv', encoding="latin-1")
genre = pd.read_csv('~/desktop/movielens/UGENRE.csv', encoding="latin-1")
user = pd.read_csv('~/desktop/movielens/UUSER.csv', encoding="latin-1")
#"ISO-8859-1"

# Data Cleaning

In [153]:
# rename itemid to movieid to join data and item together
data.rename(columns={'itemid':'movieid'}, inplace=True)

#join
df = pd.merge(data, item, on='movieid', how='outer')
df = pd.merge(df, user, on='userid', how='outer')

#change unix timestamp to readable date
df['reviewtimestamp'] = pd.to_datetime(df['timestamp'],unit='s')

#Split reviewtimestamp into year month day
df['reviewyear'] = df['reviewtimestamp'].dt.year
df['reviewmonth'] = df['reviewtimestamp'].dt.month
df['reviewday'] = df['reviewtimestamp'].dt.day


# Collaborative Filtering Model

In [154]:
#Step 1: Find the top similar users using Centred Cosine Similarity

#centred cosine
ccs = data.groupby(['userid'], as_index=False, sort=False).mean()
ccs = ccs.rename(columns={"rating": "mean_rating"})
data = pd.merge(data, ccs, on='userid', how='left', sort="False")

data['rating'] = data['rating'].astype(float)
data = data.copy()
data['new_rating'] = data["rating"]-data['mean_rating']

In [155]:
data2=data.copy()
umr = pd.DataFrame({"user": data2['userid'], "movie":data2['movieid_x'], "rating":data2['new_rating']})
umr2 = umr.pivot_table(index='user', columns='movie', values='rating').fillna(0)

In [157]:
allusers = umr2.values
user1 = allusers[0]
denominator1 = np.sqrt(sum([np.square(x) for x in user1]))

cosinesimilarity = [(1,1)]
i=1
for user in allusers[1:]:
        numerator = [x*y for x,y in zip(user1, user)]
        denominator2 = np.sqrt(sum([np.square(x) for x in user]))
        costheta = sum(numerator) / (denominator1 * denominator2)
        cosinesimilarity.append((umr2.index[i], costheta))
        i += 1

cosinesimilarity.sort(key=lambda x: x[1], reverse=True)

similar10users = cosinesimilarity[0:10]

top10users = pd.DataFrame()
for user in similar10users:
    top10users = top10users.append(umr2.loc[user[0]])
top10users['costheta'] = [user[1] for user in similar10users]

all_values = top10users.values


In [158]:
#Step 2: Predict the user ratings on an item based on other users

denominator = sum([x[1] for x in similar10users])

inx = 0
values= []
for x in top10users.loc[1]:
    totalsum=0
    if x ==0.0:
        for v in range(1,10):
            totalsum += all_values[v-1][inx]*all_values[v-1][1682]
        top10users.loc[1][inx+1] = totalsum/denominator
    inx += 1


In [159]:
#Step 3: Recommend the items which have higher predicted value

top10rec = []

for inx,x in enumerate(top10users.loc[1].values):
    if x > 0.0:
        top10rec.append((inx,x))
        
top10rec.sort(key=lambda x: x[1], reverse=True)

top10rec[:10]

[(0, 1.3897058823529411),
 (5, 1.3897058823529411),
 (8, 1.3897058823529411),
 (11, 1.3897058823529411),
 (12, 1.3897058823529411),
 (13, 1.3897058823529411),
 (14, 1.3897058823529411),
 (15, 1.3897058823529411),
 (18, 1.3897058823529411),
 (31, 1.3897058823529411)]