#### import block

In [1]:
import pandas as pd
import numpy as np
import os


In [17]:
from tqdm import tqdm

In [2]:
base_path = 'C:/Users/gmltj/Desktop/Experiments/ml-25m/'
file_list = os.listdir(base_path)
file_list

['genome-scores.csv',
 'genome-tags.csv',
 'links.csv',
 'movies.csv',
 'ratings.csv',
 'README.txt',
 'tags.csv']

In [3]:
ratings = pd.read_csv(os.path.join(base_path, "ratings.csv"))

In [4]:
movies = pd.read_csv(os.path.join(base_path, "movies.csv"))

#### make dataframes for nodes
- users_df
- movies_df
- genres_df

In [5]:
users_df = pd.DataFrame(ratings['userId'].unique(), columns=['userId'])
users_df.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


In [6]:
movies_df = movies.drop('genres', axis = 1)

#평균 평점으로 aggregating
agg_rating_avg = ratings.groupby(['movieId']).agg({'rating': np.mean}).reset_index()
agg_rating_avg.columns = ['movieId', 'rating_mean']

#merge
movies_df = movies_df.merge(agg_rating_avg, left_on='movieId', right_on='movieId', how='left')
movies_df.head()

Unnamed: 0,movieId,title,rating_mean
0,1,Toy Story (1995),3.893708
1,2,Jumanji (1995),3.251527
2,3,Grumpier Old Men (1995),3.142028
3,4,Waiting to Exhale (1995),2.853547
4,5,Father of the Bride Part II (1995),3.058434


In [7]:
genres = movies['genres']
genres = genres.str.split("|", expand=True)
genres = pd.Series(genres.values.ravel()).unique()
genres_df =pd.DataFrame(genres, columns=['genres'])
genres_df


Unnamed: 0,genres
0,Adventure
1,Animation
2,Children
3,Comedy
4,Fantasy
5,
6,Romance
7,Drama
8,Action
9,Crime


#### make dataframes for edges
- users_movies_df
- movies_genres_df
- users_genres_df

In [8]:
users_movies_df = ratings.drop('timestamp', axis=1)
users_movies_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [9]:
movies_genres_df = movies.drop('title', axis = 1)
movies_genres_df.head()

Unnamed: 0,movieId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy


영화 하나가 여러 장르에 속하므로 펼쳐야함
1. genres 컬럼에 있는 string을 단어(1 장르) 단위로 나누기
2. 한 장르당 하나의 새로운 컬럼에 저장(총 10개 컬럼 생성됨)
3. melt 함수와 stack()함수를 이용하여 한 컬럼으로 합치기




In [10]:
movies_genres_df.drop('genres', axis=1, inplace=True)

In [11]:
movies_genres_df = pd.merge(movies_genres_df, movies['genres'].str.split("|", expand=True),left_index=True,  right_index=True)
movies_genres_df= movies_genres_df.set_index('movieId').stack().reset_index().drop('level_1', 1)
movies_genres_df.rename(columns ={0: 'genres'}, inplace=True)
movies_genres_df

Unnamed: 0,movieId,genres
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy
...,...,...
112302,209163,Drama
112303,209169,(no genres listed)
112304,209171,Action
112305,209171,Adventure


In [12]:
users_genres_df = ratings.merge(movies, left_on='movieId', right_on='movieId', how='left')
users_genres_df.drop(['movieId', 'rating', 'timestamp', 'title'], axis=1, inplace=True)


In [13]:
users_genres_df.head(10)

Unnamed: 0,userId,genres
0,1,Comedy|Crime|Drama|Thriller
1,1,Drama
2,1,Drama
3,1,Comedy|Drama|War
4,1,Comedy|Musical|Romance
5,1,Drama|Musical|Romance
6,1,Comedy|Drama|Romance
7,1,Drama|War
8,1,Drama
9,1,Adventure|Drama|War


users_gernes_df에는 각 유저가 가장 선호하는 장르가 저장되어야 함.
1. movies_genres_df와 마찬가지로 genres 컬럼의 값을 펼쳐주는 작업이 필요하고
2. 각 userId당 genres의 word frequency를 계산해야 함.

In [14]:
users_genres_df.set_index('userId', inplace=True)

In [18]:
tmp = pd.DataFrame(columns = ['genre'])
for idx in tqdm(users_genres_df.index.unique()):
    top = users_genres_df.loc[idx].genres\
               .str.split("|", expand=True)\
               .stack().value_counts()[:1]
    tmp.loc[idx]=top.index[0]
users_genres_df = tmp
users_genres_df.head()

100%|██████████████████████████████████████████████████████████████████████████| 162541/162541 [31:20<00:00, 86.43it/s]


Unnamed: 0,genre
1,Drama
2,Drama
3,Action
4,Action
5,Comedy


#### save to files

In [21]:
users_df.to_csv('users.csv', sep='|', header=True, index=False)
movies_df.to_csv('movies.csv', sep='|', header=True, index=False)
genres_df.to_csv('genres.csv', sep='|', header=True, index=False)
users_movies_df.to_csv('users_movies.csv', sep='|', header=True, index=False)


In [20]:
movies_genres_df.to_csv('movies_genres.csv', sep='|', header=True, index=False)
users_genres_df.to_csv('users_genres.csv', sep='|', header=True, index=False)