##### Import the required packages

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
import seaborn as sns
import pickle
from collections import Counter
from datetime import datetime
# We can override the default matplotlib styles with those of Seaborn
sns.set()## Importing the relevant libraries


In [4]:
# Load the data from a .csv
ratings_data = pd.read_csv(os.path.join(os.path.pardir,'data','raw','ratings.csv'))
movies_data = pd.read_csv(os.path.join(os.path.pardir,'data','raw','movies.csv'))

In [5]:
ratings_df = ratings_data.copy()
movies_df = movies_data.copy()

In [6]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
ratings_df.describe(include='all')

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [8]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
# Make the user IDs go from 0 to N-1
ratings_df['userId'] = ratings_df['userId'] - 1

In [7]:
# Create a mapper for movie IDs
unique_movie_ids = list(ratings_df['movieId'].unique())
movie_id_mapper = {}
count = 0
for movie_id in unique_movie_ids:
    movie_id_mapper[movie_id] = count
    count +=1

# Add them to the databrame
ratings_df['movieId'] = ratings_df['movieId'].apply(lambda x: movie_id_mapper[x])
ratings_df = ratings_df.drop(columns='timestamp', axis=1)

In [35]:
N = ratings_df['userId'].max() + 1 # Number of users
M = ratings_df['movieId'].max() + 1 # number of movies


# user_id_count = dict(ratings_df['userId'].value_counts())
# movie_id_count = dict(ratings_df['movieId'].value_counts())

user_id_count = Counter(ratings_df['userId'])
movie_id_count = Counter(ratings_df['movieId'])

# Number of users and moviies we would like to keep
n = 10000
m = 2000

user_ids = [u for u, c in user_id_count.most_common(n)]
movie_ids = [m for m, c in user_id_count.most_common(m)]

# Make a copy, otherwise the original df won't be overwritten
ratings_df_small = ratings_df[ratings_df['userId'].isin(user_ids) & ratings_df['movieId'].isin(movie_ids)].copy()

# Need to remake user_ids and movie_ids since they are no longer sequential
new_user_id_mapper = {}
i = 0
for old in user_ids:
    new_user_id_mapper[old] = i
    i +=1

new_movie_id_mapper = {}
j = 0
for old in movie_ids:
    new_movie_id_mapper[old] = i
    
    
ratings_df_small['userId'] = ratings_df_small['userId'].apply(lambda x: new_user_id_mapper[x])
ratings_df_small['movieId'] = ratings_df_small['movieId'].apply(lambda x: new_movie_id_mapper[x])

ratings_df_small.to_csv(os.path.join(os.path.pardir,'data','processed','small_ratings.csv'))

In [53]:
ratings_df_processed = ratings_df_small.copy()
N_ = ratings_df_processed['userId'].max() + 1 #Number of users
M_ = ratings_df_processed['movieId'].max() + 1 # Number of movies

# Split the df into train and test
ratings_df_train, ratings_df_test = train_test_split(ratings_df_processed, test_size=0.2)

# A dictionary to tell us, which users have rated which movie
user_movie = {}

# A dictionary to tell us, which movies have been rated by which users
movie_user = {}

# A dictionary to lookup ratings
user_movie_ratings = {}

cutoff = int(0.8 * len(ratings_df_processed))
count = 0
def update_user_movie_and_movie_user(row):
    global count
    count += 1
    if count % 100000 == 0:
        print("Processed: %.3f" % (float(count)/cutoff))
    i = int(row['userId'])
    j = int(row['movieId'])
    if i not in user_movie:
        user_movie[i] = [j]
    else:
        user_movie[i].append(j)
    
    if j not in movie_user:
        movie_user[j] = [i]
    else:
        movie_user[j].append(i)
    user_movie_ratings[(i,j)] = row['rating']

ratings_df_train.apply(update_user_movie_and_movie_user, axis=1)
    

46536    None
462      None
85686    None
53217    None
30050    None
5219     None
506      None
9945     None
29378    None
72526    None
55       None
80441    None
27635    None
48603    None
33205    None
91334    None
32788    None
1064     None
10048    None
11666    None
53228    None
67725    None
63186    None
73056    None
20658    None
8692     None
90012    None
85492    None
19738    None
66860    None
         ... 
97467    None
73416    None
59177    None
63285    None
23679    None
6436     None
91728    None
84247    None
97514    None
23274    None
8358     None
82523    None
7138     None
6126     None
82490    None
86034    None
42775    None
52039    None
55871    None
27703    None
87752    None
54130    None
55941    None
86001    None
8        None
89124    None
39883    None
75668    None
90335    None
85576    None
Length: 24972, dtype: object

In [58]:
user_movie_ratings_test = {}

def update_user_movie_and_movie_user_test(row):
    global count
    count += 1
    if count % 100000 == 0:
        print("Processed: %.3f" % (float(count)/cutoff))
    i = int(row['userId'])
    j = int(row['movieId'])
    user_movie_ratings_test[(i, j)] = row['rating']


ratings_df_test.apply(update_user_movie_and_movie_user_test, axis=1)

with open(os.path.join(os.path.pardir,'data','interim','user_movie.json'), 'wb') as f:
    pickle.dump(user_movie, f)

with open(os.path.join(os.path.pardir,'data','interim','movie_user.json'), 'wb') as f:
    pickle.dump(movie_user, f)

with open(os.path.join(os.path.pardir,'data','interim','user_movie_rating.json'), 'wb') as f:
    pickle.dump(user_movie_ratings_test, f)
    

In [59]:
with open(os.path.join(os.path.pardir,'data','interim','user_movie.json'), 'rb') as f:
    user_movie = pickle.load(f)

with open(os.path.join(os.path.pardir,'data','interim','movie_user.json'), 'rb') as f:
    movie_user = pickle.load(f)

with open(os.path.join(os.path.pardir,'data','interim','user_movie_rating.json'), 'rb') as f:
    user_movie_ratings_test = pickle.load(f)