In [22]:
# Library

import numpy as np
import pandas as pd
import os

In [2]:
# Read every file found under the Kaggle input directory into its own DataFrame.
# NOTE(review): every file is parsed as CSV regardless of extension, and the
# position of each frame in `dfs` follows os.walk's traversal order — confirm
# that this is stable for the attached dataset before indexing into `dfs`.

dfs = []
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    dfs.extend(
        pd.read_csv(os.path.join(root, file_name)) for file_name in file_names
    )

In [3]:
# Ratings table: taken as the first frame that the loading cell collected.
# NOTE(review): presumably the ratings CSV is the first file discovered — confirm,
# since the order of `dfs` depends on the directory traversal order.

rating = dfs[0]
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
# Shift every userId down by one (the preview above starts at userId 1, so the
# ids presumably become 0-based — confirm against the full column).
# NOTE: the column is overwritten in place, so re-running this cell subtracts
# one again; run it exactly once per kernel session.

rating['userId'] = rating['userId'] - 1

In [5]:
# definition
from typing import Dict, List, Optional, Tuple


def create_movie_idx_dict(ids:List)-> Dict:
    """Assign a consecutive integer index to every distinct movie id.

    Args:
        ids: Movie ids, possibly containing duplicates.

    Returns:
        Dict mapping each distinct id to an index in ``0 .. n_unique - 1``.
        Indices follow the iteration order of a ``set`` built from ``ids``,
        so they are neither sorted by id nor tied to the input order.
    """
    return {movie_id: position for position, movie_id in enumerate(set(ids))}

def mapping_df(movie_idx_dict:Dict, target:str, df:Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Add a ``movie_id_idx`` column by looking up ``df[target]`` in ``movie_idx_dict``.

    Args:
        movie_idx_dict: Lookup from raw id to index.
        target: Name of the column in ``df`` holding the raw ids.
        df: Frame to annotate. When omitted, the module/notebook-level variable
            named ``df`` is used, which is what this function always did before
            the parameter existed.

    Returns:
        The same frame object, modified in place with the new
        ``movie_id_idx`` column.

    Raises:
        KeyError: If a value of ``df[target]`` has no entry in ``movie_idx_dict``.
        NameError: If ``df`` is omitted and no global ``df`` exists.
    """
    if df is None:
        # Backward-compatible fallback to the global frame; prefer passing `df`
        # explicitly so the function does not depend on hidden notebook state.
        try:
            df = globals()['df']
        except KeyError:
            raise NameError("name 'df' is not defined") from None

    # Vectorized lookup instead of a Python-level row-by-row apply.
    mapped = df[target].map(movie_idx_dict)

    # Series.map yields NaN for ids missing from the dict; fail loudly instead,
    # as a plain dict lookup would. Assumes the lookup values themselves are
    # never null.
    unmapped = mapped.isna()
    if unmapped.any():
        raise KeyError(df.loc[unmapped, target].iloc[0])

    df['movie_id_idx'] = mapped
    return df

In [6]:
# Build the movieId -> index lookup from the movieId of every rating row.
ids = rating['movieId'].tolist()
movie_idx_dict = create_movie_idx_dict(ids)

# mapping_df is called without a frame here, so it works on the notebook-level
# name `df`; bind it to the ratings table first. `df`, `rating` and `new_df`
# all end up referring to the same object, which gains the new column in place.
df = rating
new_df = mapping_df(movie_idx_dict, 'movieId')
new_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie_id_idx
0,0,2,3.5,2005-04-02 23:53:47,2
1,0,29,3.5,2005-04-02 23:31:16,29
2,0,32,3.5,2005-04-02 23:33:39,32
3,0,47,3.5,2005-04-02 23:32:07,47
4,0,50,3.5,2005-04-02 23:29:40,50


In [7]:
# Persist the indexed ratings. index=False keeps the positional row index out
# of the file, so reading it back does not produce an extra "Unnamed: 0" column.
new_df.to_csv('new_df.csv', index=False)

In [8]:
# Definition

import pickle
from collections import Counter


def common_values(user_ids:pd.Series, movie_idxs:pd.Series, N:int, M:int) -> Tuple[List[int], List[int]]:
    """Return the most frequently occurring user ids and movie indices.

    Args:
        user_ids: User id of every rating row (any iterable of hashable values).
        movie_idxs: Movie index of every rating row (any iterable of hashable values).
        N: How many of the most frequent user ids to keep.
        M: How many of the most frequent movie indices to keep.

    Returns:
        A 2-tuple ``(most_common_user_ids, most_common_movie_idxs)``; each list
        is ordered from most to least frequent and holds at most ``N`` / ``M``
        entries.
    """
    # Counter.most_common yields (value, count) pairs; only the values are kept.
    most_common_user_ids = [user_id for user_id, _ in Counter(user_ids).most_common(N)]
    most_common_movie_idxs = [movie_idx for movie_idx, _ in Counter(movie_idxs).most_common(M)]

    return most_common_user_ids, most_common_movie_idxs

def create_new_idx_dict(target:List) -> Dict:
    """Map every distinct value of ``target`` to a dense index starting at 0.

    Args:
        target: Values to re-index; duplicates are collapsed.

    Returns:
        Dict from each distinct value to an index in ``0 .. n_unique - 1``,
        numbered in the iteration order of a ``set`` built from ``target``.
    """
    distinct_values = set(target)
    return dict(zip(distinct_values, range(len(distinct_values))))

In [9]:
# Reload the indexed ratings that an earlier cell wrote to the working directory.
df = pd.read_csv('/kaggle/working/new_df.csv')
print('original df size', df.shape[0])

original df size 20000263


In [10]:
# Find the 10000 most frequent userId values and the 2000 most frequent
# movie_id_idx values; `res` is the pair returned by common_values.
user_ids = df['userId']
movie_idxs = df['movie_id_idx']
res = common_values(user_ids, movie_idxs, 10000, 2000)

In [11]:
# Rebind the names to the selected ids: first element is users, second is movies.
user_ids, movie_idxs = res

In [12]:
# Keep only rows whose user AND movie are both among the selected ids;
# .copy() makes df_small an independent frame that later cells can modify.
df_small = df.loc[
    df['userId'].isin(user_ids) & df['movie_id_idx'].isin(movie_idxs)
].copy()
print('shrinked data length is', len(df_small))

shrinked data length is 5392025


In [13]:
# Despite the `_lst` suffix these are pandas Series (one entry per remaining
# rating row), not Python lists.
user_id_lst = df_small['userId']
movie_id_lst = df_small['movie_id_idx']

In [14]:
# Build fresh dense indices (starting at 0) over the ids that survived the filter.
new_user_id_dict = create_new_idx_dict(user_id_lst)
new_movie_id_dict = create_new_idx_dict(movie_id_lst)

In [15]:
# Replace the surviving user / movie ids with their new dense indices.
# Series.map performs the dictionary lookup column-wise, avoiding a Python-level
# pass over every row.
# NOTE: the columns are overwritten in place, so run this cell once per freshly
# built df_small.
new_user_ids = df_small['userId'].map(new_user_id_dict)
new_movie_idxs = df_small['movie_id_idx'].map(new_movie_id_dict)

# map() yields NaN for ids missing from the dictionaries; fail loudly (as a
# plain dict lookup would) before touching df_small.
if new_user_ids.isna().any() or new_movie_idxs.isna().any():
    raise KeyError('df_small contains ids that are missing from the re-index dictionaries')

df_small.loc[:, 'userId'] = new_user_ids
df_small.loc[:, 'movie_id_idx'] = new_movie_idxs

In [19]:
# Report the largest user id, the largest movie index and the row count of df_small.
print("max user id : ", df_small['userId'].max())
print("max movie id :", df_small['movie_id_idx'].max())
print("df_small size :", len(df_small))

max user id :  9999
max movie id : 1999
df_small size : 5392025
