In [1]:
import os
from collections import defaultdict
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Mount Google Drive into the Colab filesystem so files stored there become readable.
from google.colab import drive
drive.mount('/content/gdrive')
# Make the data folder the working directory; the relative './*.csv' glob in the next cell depends on it.
# NOTE(review): this absolute path matches one specific Drive layout — adjust before running elsewhere.
%cd /content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data

Mounted at /content/gdrive
/content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data


In [3]:
# Collect every CSV file in the current working directory; returned paths keep the './' prefix.
from glob import glob
files = glob('./*.csv')

In [4]:
# Load each discovered CSV into a DataFrame, keyed by the path it was read from.
dfs = {csv_path: pd.read_csv(csv_path) for csv_path in files}
dfs

{'./df_small.csv':          Unnamed: 0  Unnamed: 0.1  userId  movieId  rating  \
 0               960           960       5        1     4.5   
 1               961           961       5       10     2.5   
 2               962           962       5       19     3.5   
 3               963           963       5       32     5.0   
 4               964           964       5       39     4.5   
 ...             ...           ...     ...      ...     ...   
 5392020    19998291      19998291    2704     4993     5.0   
 5392021    19998292      19998292    2704     5349     3.0   
 5392022    19998293      19998293    2704     5378     4.0   
 5392023    19998295      19998295    2704     5449     4.0   
 5392024    19998296      19998296    2704     5459     4.0   
 
                    timestamp  movie_id_idx  
 0        2009-01-02 01:13:41             0  
 1        2009-01-02 01:15:59             9  
 2        2009-01-01 04:21:44            18  
 3        2009-01-01 04:11:35           

In [5]:
# Work with the ratings table that was loaded from './df_small.csv'.
df = dfs['./df_small.csv']

In [6]:
# Preview the frame without these four columns; the result is only displayed, `df` is not modified.
df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'movieId', 'timestamp'])

Unnamed: 0,userId,rating,movie_id_idx
0,5,4.5,0
1,5,2.5,9
2,5,3.5,18
3,5,5.0,29
4,5,4.5,33
...,...,...,...
5392020,2704,5.0,1730
5392021,2704,3.0,1769
5392022,2704,4.0,1772
5392023,2704,4.0,1780


In [7]:
# Keep only the three columns used from here on: user id, movie index and rating.
df = df[['userId', 'movie_id_idx', 'rating']]

In [8]:
# Dataset overview: number of distinct users and distinct movie indices

num_of_users = df.userId.nunique()
print('num_of_users:', num_of_users)

num_of_movies = df.movie_id_idx.nunique()
print('num_of_movies:', num_of_movies)

num_of_users: 10000
num_of_movies: 2000


In [9]:
# Split into train and test (80% / 20%).
# A fixed seed keeps the split — and everything derived from it — identical
# across kernel restarts; without it every run produces different train/test rows.
SPLIT_SEED = 42

train, test = train_test_split(df, test_size = 0.2, random_state = SPLIT_SEED)

In [10]:
# user_movie = {user_id : [movie1, movie2, ...]}
# movie_user = {movie_idx : [user1, user2, ...]}
# user_movie_rating = {(user, movie) : rating, (user2, movie2) : rating, ...}

def create_dict_1(target_1:List) -> Dict:
    """Group the second item of every pair under the pair's first item.

    Args:
        target_1: iterable of 2-item pairs ``(key, value)``.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of values seen
        with it, in input order. Looking up an unseen key yields (and stores)
        an empty list.
    """
    grouped = defaultdict(list)
    for key, member in target_1:
        bucket = grouped[key]
        bucket.append(member)
    return grouped

def create_dict_2(target_2: Iterable[Tuple[Any, Any, Any]]) -> Dict:
    """Build a ``(first, second) -> third`` lookup from an iterable of triples.

    Args:
        target_2: any iterable (a ``zip`` object, list, ...) yielding 3-item
            tuples ``(v1, v2, v3)``. It is consumed once.

    Returns:
        A ``defaultdict(int)`` keyed by ``(v1, v2)`` with value ``v3``. When
        the same pair occurs more than once the last value wins. Looking up a
        pair that was never seen yields 0 and stores that 0 under the pair.
    """
    res_dict_2 = defaultdict(int)
    for v1, v2, v3 in target_2:
        res_dict_2[(v1, v2)] = v3
    return res_dict_2

In [11]:
# Build the three lookup tables from the ratings frame `df`.
# NOTE(review): `df` is the frame that was split above, so these tables also
# contain the rows held out in `test` — confirm that this is intended.
user_movie = list(zip(df.userId, df.movie_id_idx))
user_movie_dict = create_dict_1(user_movie)

movie_user = list(zip(df.movie_id_idx, df.userId))
movie_user_dict = create_dict_1(movie_user)

# A bare zip is a one-shot iterator: it is exhausted once create_dict_2 has consumed it.
user_movie_rating = zip(df.userId, df.movie_id_idx, df.rating)
user_movie_rating_dict = create_dict_2(user_movie_rating)

In [12]:
# Attach to every training row the movie list of its user and the user list of its movie.
train['user_movie'] = train.userId.apply(lambda uid: user_movie_dict[uid])
train['movie_user'] = train.movie_id_idx.apply(lambda midx: movie_user_dict[midx])

In [13]:
# Display the training frame, now including the list-valued user_movie / movie_user columns.
train

Unnamed: 0,userId,movie_id_idx,rating,user_movie,movie_user
4540639,5701,379,2.0,"[0, 1, 5, 9, 15, 18, 21, 27, 29, 30, 37, 40, 4...","[15, 17, 27, 39, 56, 86, 88, 97, 100, 102, 119..."
1659519,3282,1893,1.0,"[0, 1, 2, 5, 11, 14, 15, 17, 21, 24, 27, 29, 3...","[30, 39, 92, 97, 100, 119, 131, 139, 144, 145,..."
2526403,9875,379,1.5,"[0, 5, 23, 27, 29, 40, 43, 45, 50, 55, 78, 84,...","[15, 17, 27, 39, 56, 86, 88, 97, 100, 102, 119..."
3289110,5966,1682,4.0,"[6, 16, 21, 29, 33, 40, 41, 43, 45, 84, 85, 96...","[27, 78, 116, 119, 133, 139, 200, 209, 230, 24..."
3313082,6169,1894,4.0,"[1, 6, 9, 14, 20, 27, 29, 40, 43, 56, 73, 74, ...","[78, 107, 116, 119, 131, 133, 139, 157, 200, 2..."
...,...,...,...,...,...
1631686,3083,792,3.0,"[1, 10, 13, 15, 16, 24, 26, 27, 29, 30, 34, 36...","[15, 44, 47, 56, 102, 122, 131, 139, 147, 157,..."
1834726,4633,1979,3.0,"[0, 5, 9, 15, 18, 20, 24, 29, 30, 33, 37, 40, ...","[5, 17, 27, 30, 39, 92, 95, 100, 107, 119, 122..."
286587,2548,723,3.0,"[0, 1, 5, 6, 8, 9, 15, 18, 19, 20, 22, 23, 27,...","[10, 15, 17, 27, 30, 39, 44, 47, 78, 88, 95, 9..."
1189833,9118,558,1.0,"[0, 15, 17, 18, 27, 29, 40, 43, 74, 80, 84, 85...","[27, 33, 39, 47, 56, 78, 86, 88, 100, 115, 122..."


In [14]:
# Peek at the first rows of the held-out test frame.
test.head()

Unnamed: 0,userId,movie_id_idx,rating
2317088,8309,1793,3.0
4036728,1749,336,3.5
5051714,9564,165,2.5
1548732,2409,748,3.0
5011424,9238,529,3.0


In [22]:
# Store each test row's (userId, movie_id_idx) pair as a tuple, used as the lookup key in the next step.
test['user_movie'] = list(zip(test['userId'], test['movie_id_idx']))

In [26]:
# Look up the stored rating for every (user, movie) pair of the test rows.
# The table is a defaultdict(int), so a pair missing from it would come back as 0 instead of raising.
test['user_movie_rating'] = test['user_movie'].apply(lambda pair: user_movie_rating_dict[pair])

In [27]:
# Display the test frame with the user_movie pair and the looked-up user_movie_rating columns.
test

Unnamed: 0,userId,movie_id_idx,rating,user_movie,user_movie_rating
2317088,8309,1793,3.0,"(8309, 1793)",3.0
4036728,1749,336,3.5,"(1749, 336)",3.5
5051714,9564,165,2.5,"(9564, 165)",2.5
1548732,2409,748,3.0,"(2409, 748)",3.0
5011424,9238,529,3.0,"(9238, 529)",3.0
...,...,...,...,...,...
4565740,5860,974,4.0,"(5860, 974)",4.0
4312236,4021,134,3.0,"(4021, 134)",3.0
604911,4863,31,4.0,"(4863, 31)",4.0
942561,7345,600,4.0,"(7345, 600)",4.0
