In [1]:
!pip install kaggle



In [0]:
mkdir .kaggle

In [6]:
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!kaggle config set -n path -v{/content}

- path is now set to: {/content}


In [0]:
!chmod 600 /root/.kaggle/kaggle.json

In [8]:
!kaggle datasets list -s movielens

ref                                                    title                                             size  lastUpdated          downloadCount  
-----------------------------------------------------  -----------------------------------------------  -----  -------------------  -------------  
grouplens/movielens-20m-dataset                        MovieLens 20M Dataset                            195MB  2018-08-15 23:09:34          10548  
prajitdatta/movielens-100k-dataset                     MovieLens 100K Dataset                             5MB  2017-01-05 12:37:32           5539  
rounakbanik/the-movies-dataset                         The Movies Dataset                               228MB  2017-11-10 02:40:38          69008  
sherinclaudia/movielens                                Movielens                                          6MB  2019-01-21 13:20:43            477  
jneupane12/movielens                                   MovieLens                                         12MB  2

In [9]:
!kaggle datasets download -d grouplens/movielens-20m-dataset -p /content

Downloading movielens-20m-dataset.zip to /content
 92% 180M/195M [00:07<00:00, 23.5MB/s]
100% 195M/195M [00:07<00:00, 28.3MB/s]


In [10]:
!unzip \*.zip

Archive:  movielens-20m-dataset.zip
  inflating: genome_scores.csv       
  inflating: genome_tags.csv         
  inflating: link.csv                
  inflating: movie.csv               
  inflating: rating.csv              
  inflating: tag.csv                 


In [0]:
from __future__ import print_function, division
from builtins import range, input

In [0]:
import pandas as pd

In [0]:
df = pd.read_csv('rating.csv')

In [0]:
# make the user ids go from 0...N-1
df.userId = df.userId - 1

In [15]:
df.userId

0                0
1                0
2                0
3                0
4                0
             ...  
20000258    138492
20000259    138492
20000260    138492
20000261    138492
20000262    138492
Name: userId, Length: 20000263, dtype: int64

In [0]:
# create a mapping for movie ids
unique_movie_ids = set(df.movieId.values)
movie2idx = {}
count = 0
for movie_id in unique_movie_ids:
  movie2idx[movie_id] = count
  count += 1

# add them to the data frame
# takes awhile
df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1)

df = df.drop(columns=['timestamp'])

df.to_csv('edited_rating.csv', index=False)

In [0]:
import pickle
import numpy as np
import pandas as pd
from collections import Counter

In [18]:
df = pd.read_csv('edited_rating.csv')
print("original dataframe size:", len(df))

original dataframe size: 20000263


In [0]:
N = df.userId.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies

user_ids_count = Counter(df.userId)
movie_ids_count = Counter(df.movie_idx)

In [0]:
# number of users and movies we would like to keep
n = 10000
m = 2000

user_ids = [u for u, c in user_ids_count.most_common(n)]
movie_ids = [m for m, c in movie_ids_count.most_common(m)]

In [21]:
# make a copy, otherwise ids won't be overwritten
df_small = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy()

# need to remake user ids and movie ids since they are no longer sequential
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
print("i:", i)

new_movie_id_map = {}
j = 0
for old in movie_ids:
  new_movie_id_map[old] = j
  j += 1
print("j:", j)

i: 10000
j: 2000


In [22]:
print("Setting new ids")
df_small.loc[:, 'userId'] = df_small.apply(lambda row: new_user_id_map[row.userId], axis=1)
df_small.loc[:, 'movie_idx'] = df_small.apply(lambda row: new_movie_id_map[row.movie_idx], axis=1)
# df_small.drop(columns=['userId', 'movie_idx'])
# df_small.rename(index=str, columns={'new_userId': 'userId', 'new_movie_idx': 'movie_idx'})
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())

Setting new ids
max user id: 9999
max movie id: 1999


In [23]:
print("small dataframe size:", len(df_small))
df_small.to_csv('small_rating.csv', index=False)

small dataframe size: 5392025


In [0]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [0]:
df = pd.read_csv('small_rating.csv')

N = df.userId.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies

In [0]:
# split into train and test
df = shuffle(df)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [0]:
# a dictionary to tell us which users have rated which movies
user2movie = {}
# a dicationary to tell us which movies have been rated by which users
movie2user = {}
# a dictionary to look up ratings
usermovie2rating = {}

In [29]:
print("Calling: update_user2movie_and_movie2user")
count = 0
def update_user2movie_and_movie2user(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/cutoff))

  i = int(row.userId)
  j = int(row.movie_idx)
  if i not in user2movie:
    user2movie[i] = [j]
  else:
    user2movie[i].append(j)

  if j not in movie2user:
    movie2user[j] = [i]
  else:
    movie2user[j].append(i)

  usermovie2rating[(i,j)] = row.rating
df_train.apply(update_user2movie_and_movie2user, axis=1)

Calling: update_user2movie_and_movie2user
processed: 0.023
processed: 0.046
processed: 0.070
processed: 0.093
processed: 0.116
processed: 0.139
processed: 0.162
processed: 0.185
processed: 0.209
processed: 0.232
processed: 0.255
processed: 0.278
processed: 0.301
processed: 0.325
processed: 0.348
processed: 0.371
processed: 0.394
processed: 0.417
processed: 0.440
processed: 0.464
processed: 0.487
processed: 0.510
processed: 0.533
processed: 0.556
processed: 0.580
processed: 0.603
processed: 0.626
processed: 0.649
processed: 0.672
processed: 0.695
processed: 0.719
processed: 0.742
processed: 0.765
processed: 0.788
processed: 0.811
processed: 0.835
processed: 0.858
processed: 0.881
processed: 0.904
processed: 0.927
processed: 0.950
processed: 0.974
processed: 0.997


815435     None
1453086    None
3443241    None
4605686    None
1964943    None
           ... 
1028406    None
1220161    None
4557186    None
5217453    None
190377     None
Length: 4313620, dtype: object

In [30]:
# test ratings dictionary
usermovie2rating_test = {}
print("Calling: update_usermovie2rating_test")
count = 0
def update_usermovie2rating_test(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/len(df_test)))

  i = int(row.userId)
  j = int(row.movie_idx)
  usermovie2rating_test[(i,j)] = row.rating
df_test.apply(update_usermovie2rating_test, axis=1)

Calling: update_usermovie2rating_test
processed: 0.093
processed: 0.185
processed: 0.278
processed: 0.371
processed: 0.464
processed: 0.556
processed: 0.649
processed: 0.742
processed: 0.835
processed: 0.927


927849     None
2132141    None
4302001    None
1466343    None
4220148    None
           ... 
3615550    None
287480     None
5086912    None
5183025    None
1902566    None
Length: 1078405, dtype: object

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# note: these are not really JSONs
with open('/content/drive/My Drive/Colab Notebooks/Json_Files/user2movie.json', 'wb') as f:
  pickle.dump(user2movie, f)

with open('/content/drive/My Drive/Colab Notebooks/Json_Files/movie2user.json', 'wb') as f:
  pickle.dump(movie2user, f)

with open('/content/drive/My Drive/Colab Notebooks/Json_Files/usermovie2rating.json', 'wb') as f:
  pickle.dump(usermovie2rating, f)

with open('/content/drive/My Drive/Colab Notebooks/Json_Files/usermovie2rating_test.json', 'wb') as f:
  pickle.dump(usermovie2rating_test, f)
