In [2]:
import numpy as np
import pandas as pd 
import os

dfs = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        df = pd.read_csv(path)
        dfs.append(df)

In [3]:
# import data
rating = dfs[0]
rating.head()
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB


In [4]:
# make the user ids go from 0...(n-1)

rating['userId'] = rating['userId'] - 1

In [5]:
# create a mapping for movie ids
# {'movie_id':0, 'movie_id':1}

len(rating['movieId'].values) # 20000263

unique_movie_ids = set(rating['movieId'].values)

movie_idx = {}
start = 0
for unique_movie_id in unique_movie_ids:
    movie_idx[unique_movie_id] = start
    start += 1

In [6]:
rating['movie_idx'] = rating.apply(lambda x: movie_idx[x['movieId']], axis=1)

In [8]:
rating = rating.drop(columns = ['timestamp'])
rating.to_csv('/kaggle/input/movielens-20m-dataset')

KeyError: "['timestamp'] not found in axis"

In [11]:
rating.to_csv('edited_rating.csv')

In [12]:
# Shrinking the data

import pickle
from collections import Counter

In [67]:
df = pd.read_csv('/kaggle/working/edited_rating.csv')

In [74]:
df.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,movie_idx
0,0,0,2,3.5,2
1,1,0,29,3.5,29
2,2,0,32,3.5,32
3,3,0,47,3.5,47
4,4,0,50,3.5,50


In [68]:
# original data row value
print('original df size', len(df))

original df size 20000263


In [69]:
# number of users
N = df['userId'].max() + 1
print(N)
# number of movies
M = df['movie_idx'].max() + 1
print(M)

138493
26744


In [70]:
# {userId : count number or userId}
user_ids_count = Counter(df['userId'])
movie_idx_count = Counter(df['movie_idx'])

In [71]:
# number of users and movies we'd like to keep

n = 10000
m = 2000

In [72]:
# figure out which user_ids_count/movie_idx_count common

user_ids = [u for u, c in user_ids_count.most_common(n)]
movie_idx = [m for m, c in movie_idx_count.most_common(m)]

In [75]:
# make a copy in case ids won't be over written

df_small = df[df['userId'].isin(user_ids) & df['movie_idx'].isin(movie_idx)].copy()
df_small.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,movie_idx
960,960,10,1,4.5,1
961,961,10,10,2.5,10
962,962,10,19,3.5,19
963,963,10,32,5.0,32
964,964,10,39,4.5,39


In [76]:
# remake userId, movie_idx => sequential
# user_ids = [1, 2, 3] -> {1:0, 2:1, 3:2}
# movie_idx

new_user_ids = {}
i = 0
for user_id in user_ids:
    new_user_ids[user_id] = i
    i += 1
print("i:", i)

new_movie_idxs = {}
j = 0
for m_idx in movie_idx:
    new_movie_idxs[m_idx] = j
    j += 1
print("j:", j)

i: 10000
j: 2000


In [57]:
for i in df_small['userId']:
    if type(i) == float:
        print(i)

In [62]:
df_small

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,movie_idx
960,960,10,1,4.5,1
961,961,10,10,2.5,10
962,962,10,19,3.5,19
963,963,10,32,5.0,32
964,964,10,39,4.5,39
...,...,...,...,...,...
19998246,19998246,138473,3608,5.0,3519
19998247,19998247,138473,3615,3.0,3526
19998250,19998250,138473,3646,3.0,3557
19998265,19998265,138473,3949,5.0,3857


In [77]:
# convert userId, movieId -> new_user_ids, new_movie_idxs

df_small.loc[:,'userId'] = df_small.apply(lambda x: new_user_ids[x['userId']], axis = 1)

In [78]:
df_small.loc[:,'movieId'] = df_small.apply(lambda x: new_movie_idxs[x['movie_idx']], axis = 1)

In [79]:
print("max user id : ", df_small.userId.max())
print("max movie id :", df_small.movieId.max())

max user id :  9999
max movie id : 1999


5392025