# Download data

In [1]:
!apt install unzip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-21ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 62 not upgraded.


In [2]:
%%bash
# Fetch and unpack the MovieLens 25M archive into ./data; skipped once the extracted ratings file exists.
# Abort on the first failing command so a failed download is reported instead of being followed by unzip.
set -euo pipefail

mkdir -p data
cd data
# Test for the extracted file that the notebook reads later, not for the archive: an interrupted download
# leaves a partial ml-25m.zip behind, which would otherwise make every later run skip download AND unzip.
if [ ! -f "ml-25m/ratings.csv" ]; then
    echo "Downloading data"
    # -c resumes a partial archive; -nv replaces the per-50K progress dots with a one-line summary.
    wget -c -nv https://files.grouplens.org/datasets/movielens/ml-25m.zip
    # -o overwrites files from an earlier, incomplete extraction without prompting.
    unzip -o ml-25m.zip
fi


Downloading data
Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


--2020-10-16 06:30:16--  http://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’

     0K .......... .......... .......... .......... ..........  0%  380K 11m13s
    50K .......... .......... .......... .......... ..........  0%  741K 8m29s
   100K .......... .......... .......... .......... ..........  0%  174M 5m40s
   150K .......... .......... .......... .......... ..........  0%  857K 5m29s
   200K .......... .......... .......... .......... ..........  0% 14.2M 4m27s
   250K .......... .......... .......... .......... ..........  0% 15.4M 3m45s
   300K .......... .......... .......... .......... ..........  0% 14.8M 3m15s
   350K .......... .......... .......... .......... ..........  0%  846K 3m29s
   400K ...

In [3]:
!ls ./data

ml-25m	ml-25m.zip


In [4]:
from argparse import ArgumentParser
import pandas as pd
import numpy as np
import torch
import tqdm

In [30]:
# Column names identifying the user and the movie in the ratings table.
USER_COLUMN, ITEM_COLUMN = 'userId', 'movieId'

# A rating counts as a "like" only when it is strictly greater than this value.
LIKE_THRESHOLD = 3.0
# Users with fewer liked ratings than this are dropped from the analysis.
MIN_RATINGS = 2600

In [31]:
# Load all ratings, then keep only "likes" made by users with at least MIN_RATINGS likes.
df = pd.read_csv('./data/ml-25m/ratings.csv')

# A rating is a like only when it is strictly above LIKE_THRESHOLD.
df = df[df['rating'] > LIKE_THRESHOLD]
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
# Vectorised equivalent of grouped.filter(lambda x: len(x) >= MIN_RATINGS): look up each row's per-user
# like count and mask on it, instead of invoking a Python callback once per user. Row order and the
# original index are preserved, exactly as with filter().
likes_per_user = df[USER_COLUMN].map(grouped.size())
df = df[likes_per_user >= MIN_RATINGS]

Filtering out users with less than 2600 ratings


In [32]:
# Liked-rating count for each user that survived the filter.
df[USER_COLUMN].value_counts()

72315     12802
75309      5525
80974      4131
137293     4116
110971     4111
92046      3991
20055      3576
85757      3062
24610      2890
24869      2890
93424      2817
17783      2706
90691      2685
140862     2678
8619       2667
29104      2650
Name: userId, dtype: int64

In [33]:
# Peek at the first rows of the filtered ratings.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
1274794,8619,1,4.5,1148833449
1274799,8619,6,4.5,1111519330
1274800,8619,7,4.0,1111704317
1274803,8619,14,3.5,1149291323
1274804,8619,16,4.0,1111458149


In [34]:
# Movie metadata; the preview below shows the movieId, title and genres columns.
df_movies = pd.read_csv(filepath_or_buffer='./data/ml-25m/movies.csv')
df_movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
# Turn the pipe-separated `genres` string into one boolean column per genre.
# (Author's note: working later with 20M+ rows of genre strings proved very resource consuming.)

# Unique genre names in order of first appearance, kept as a DataFrame to store later.
# explode() flattens the per-movie lists directly, without first building a padded movies-by-genres frame.
genres_unique = pd.DataFrame(
    df_movies['genres'].str.split('|').explode().dropna().unique(),
    columns=['genre'],
)

# sep='|' is the get_dummies default; spelled out so the split above and the encoding visibly agree.
genre_flags = df_movies['genres'].str.get_dummies(sep='|').astype(bool)
# drop() returns a new frame rather than mutating in place; the resulting column order is unchanged:
# the remaining metadata columns first, then the genre flags.
df_movies = df_movies.drop(columns='genres').join(genre_flags)

In [36]:
# Check the encoded genre flag columns.
df_movies.head(n=5)

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),False,False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji (1995),False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men (1995),False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,4,Waiting to Exhale (1995),False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,5,Father of the Bride Part II (1995),False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [37]:
# Attach title and genre flags to every kept rating; the left join keeps each row of `df`.
data = pd.merge(df, df_movies, how='left', on=ITEM_COLUMN)
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,8619,1,4.5,1148833449,Toy Story (1995),False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,8619,6,4.5,1111519330,Heat (1995),False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,8619,7,4.0,1111704317,Sabrina (1995),False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,8619,14,3.5,1149291323,Nixon (1995),False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,8619,16,4.0,1111458149,Casino (1995),False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [38]:
# Show the column labels of the merged frame.
data.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title',
       '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [39]:
# Cast the boolean genre flags to 0/1 integers.
# The genre columns are every df_movies column except the id and the title; deriving them avoids
# repeating a hand-maintained 20-name list that must be kept in sync with the dataset.
genre_columns = [col for col in df_movies.columns if col not in (ITEM_COLUMN, 'title')]
data[genre_columns] = data[genre_columns].astype(int)
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,8619,1,4.5,1148833449,Toy Story (1995),0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,8619,6,4.5,1111519330,Heat (1995),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,8619,7,4.0,1111704317,Sabrina (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,8619,14,3.5,1149291323,Nixon (1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8619,16,4.0,1111458149,Casino (1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Produces Pandas DataFrame
grouped_data = data.groupby('userId',as_index=False)[['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western']].sum()
grouped_data.head()

Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,8619,0,435,323,75,98,707,546,26,1586,...,167,185,20,109,240,479,171,627,229,158
1,17783,8,288,267,66,81,569,491,31,1841,...,139,191,30,71,235,441,153,612,148,173
2,20055,0,367,361,127,161,1005,479,253,2232,...,94,202,45,197,234,680,204,588,245,111
3,24610,46,1063,293,0,16,504,798,9,1525,...,25,180,58,6,213,200,252,1263,286,35
4,24869,28,363,299,113,135,870,471,257,1558,...,122,137,57,117,240,413,247,546,139,116


In [41]:
# Move the user id into the index so the normalisation below only touches genre counts.
# Guarded so the cell can be re-run: after the first run the id is already the index and a second
# set_index() would raise KeyError.
if USER_COLUMN in grouped_data.columns:
    grouped_data = grouped_data.set_index(USER_COLUMN)

# Divide each row by its own total so every user's genre shares sum to 1.
user_profiles = grouped_data.div(grouped_data.sum(axis=1), axis=0)

# The earlier bare `user_profiles.reset_index()` discarded its result and was removed as a no-op;
# user_profiles stays indexed by user id, as before.
user_profiles

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
8619,0.0,0.068873,0.05114,0.011875,0.015516,0.111938,0.086447,0.004117,0.251108,0.021374,0.026441,0.029291,0.003167,0.017258,0.037999,0.075839,0.027074,0.099272,0.036257,0.025016
17783,0.001344,0.048371,0.044844,0.011085,0.013604,0.095566,0.082466,0.005207,0.309204,0.019987,0.023346,0.032079,0.005039,0.011925,0.039469,0.074068,0.025697,0.102788,0.024857,0.029056
20055,0.0,0.047124,0.046353,0.016307,0.020673,0.129045,0.061505,0.032486,0.286595,0.026066,0.01207,0.025937,0.005778,0.025295,0.030046,0.087314,0.026194,0.075501,0.031459,0.014253
24610,0.006732,0.155569,0.04288,0.0,0.002342,0.07376,0.116786,0.001317,0.223182,0.008927,0.003659,0.026343,0.008488,0.000878,0.031172,0.02927,0.03688,0.184838,0.041856,0.005122
24869,0.004394,0.056959,0.046917,0.017731,0.021183,0.136513,0.073906,0.040326,0.244469,0.022752,0.019143,0.021497,0.008944,0.018359,0.037659,0.064805,0.038757,0.085674,0.021811,0.018202
29104,0.00153,0.078006,0.050168,0.010554,0.016672,0.100948,0.041756,0.002141,0.173753,0.034567,0.001071,0.118232,0.016366,0.005812,0.049862,0.067758,0.054298,0.166718,0.008259,0.00153
72315,0.033765,0.040554,0.032967,0.027553,0.017482,0.120153,0.04783,0.090159,0.316931,0.023294,0.006922,0.02041,0.00315,0.014953,0.022983,0.072722,0.020055,0.052622,0.026977,0.008519
75309,0.024103,0.080891,0.035241,0.008947,0.015156,0.12225,0.053775,0.047841,0.20588,0.021547,0.002556,0.09203,0.003287,0.007213,0.033872,0.057336,0.04145,0.122067,0.013969,0.010591
80974,0.0,0.054728,0.047506,0.011961,0.018055,0.143421,0.066689,0.020424,0.291808,0.020199,0.016475,0.011623,0.005304,0.024035,0.027533,0.09851,0.015459,0.073685,0.031483,0.021101
85757,0.010041,0.055227,0.052126,0.054341,0.028204,0.147667,0.051536,0.032634,0.244684,0.033816,0.00443,0.021855,0.008269,0.022298,0.030715,0.071618,0.034997,0.073538,0.015505,0.006497
