# EDA


In [3]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../src")

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from collections import Counter

## Data Preparation

In [5]:
ds_links = pd.read_csv("../ml-latest-small/links.csv")
ds_movies = pd.read_csv("../ml-latest-small/movies.csv")
ds_ratings = pd.read_csv("../ml-latest-small/ratings.csv")
ds_tags = pd.read_csv("../ml-latest-small/tags.csv")

## Ratings

In [32]:
ds_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
ratings_mean_count = ds_ratings.groupby("movieId").agg({"rating": ["mean", "count"]})
ratings_mean_count.columns = ["ratings_mean", "ratings_count"]
ratings_mean_count

Unnamed: 0_level_0,ratings_mean,ratings_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.920930,215
2,3.431818,110
3,3.259615,52
4,2.357143,7
5,3.071429,49
...,...,...
193581,4.000000,1
193583,3.500000,1
193585,3.500000,1
193587,3.500000,1


## User-Item Matrix

In [8]:
def create_user_item_matrix(ratings) -> pd.DataFrame:
  mat = ratings.pivot(index="userId", columns="movieId", values="rating")
  mat[~mat.isna()] = 1
  mat.fillna(0, inplace=True)
  return mat

def create_user_item_ranking_matrix(ratings) -> pd.DataFrame:
    return ratings.pivot(index="userId", columns="movieId", values="rating")

user_item_matrix = create_user_item_ranking_matrix(ds_ratings)
user_item_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


유저마다 시간별로 rating 매긴걸 정렬한 다음
일정 비율로 나눠서 이전에 매긴 rating을 바탕으로 이후에 매긴 rating을 예측할 수 있는
모델을 만들고자 함

In [40]:
def create_user_item_matrix_train(ds_ratings, train_size=0.5):
    users = ds_ratings.userId.unique()
    train_sets = []
    for user in users:
        ratings = ds_ratings[ds_ratings.userId == user].sort_values("timestamp")
        q_value = ratings.timestamp.quantile(q=train_size, interpolation='nearest')
        train_set = ratings[ratings.timestamp <= q_value]
        train_sets.append(train_set)

    train_set = pd.concat(train_sets, axis=0)
    return train_set

train_set = create_user_item_matrix_train(ds_ratings, 0.5)
train_set = create_user_item_ranking_matrix(train_set)
train_set

movieId,1,2,3,4,5,6,7,8,9,10,...,186587,187541,187593,187595,188301,189111,193565,193567,193571,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,2.5,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,,,,,,,,4.0,...,,,,,,,,,,
609,,,,,,,,,,4.0,...,,,,,,,,,,


In [39]:
train_set.to_csv("../data/user_item_matrix_X_0.5.csv")
create_user_item_matrix(ds_ratings).to_csv("../data/user_item_matrix_Y_0.5.csv")

## Tag 정보 EDA

In [23]:
tag_counts = ds_tags.tag.value_counts()
tag_counts[tag_counts >= 5]

In Netflix queue           131
atmospheric                 36
superhero                   24
thought-provoking           24
surreal                     23
                          ... 
death penalty                5
artificial intelligence      5
Civil War                    5
poignant                     5
dystopia                     5
Name: tag, Length: 168, dtype: int64

## Movie EDA

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
# ds_movies.genres.str.split("|")
cv = CountVectorizer()
genres = cv.fit_transform(ds_movies.genres)
genres = pd.DataFrame(
    genres.toarray(),
    columns=list(sorted(cv.vocabulary_.keys(), key=lambda x: cv.vocabulary_[x]))
)
genres

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,193585,Flint (2017),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


title 에서 연도를 지울 경우 중복되는게 있는지 확인해봤고, 282개 있음

In [70]:
titles = ds_movies.title.str[:-7]
titles.value_counts()

Hamlet                    5
Jane Eyre                 4
Three Musketeers, The     4
Christmas Carol, A        4
Misérables, Les           4
                         ..
Azumi                     1
Langoliers, The           1
Piper                     1
Sabotage                  1
Barefoot Contessa, The    1
Name: title, Length: 9460, dtype: int64

In [71]:
ds_movies[ds_movies.title.str.contains("Hamlet")]

Unnamed: 0,movieId,title,genres
1086,1411,Hamlet (1996),Crime|Drama|Romance
1419,1941,Hamlet (1948),Drama
2123,2820,Hamlet (1964),Drama
2687,3598,Hamlet (2000),Crime|Drama|Romance|Thriller
2782,3723,Hamlet (1990),Drama
6828,61246,Hamlet 2 (2008),Comedy


## 학습에 필요한 결과물 저장

In [141]:
movie_genre_matrix = pd.concat(
    [ds_movies.drop(columns="genres"), genres],
    axis=1
).drop(columns="movieId")
movie_genre_matrix.index = ds_movies.movieId
movie_genre_matrix = pd.concat([movie_genre_matrix, ratings_mean_count], axis=1)
movie_genre_matrix["years"] = movie_genre_matrix.title.str.strip().str[-5:-1]
movie_genre_matrix.years = movie_genre_matrix.years.apply(lambda x: int(x) if x.isdigit() else 0)

movie_genre_matrix.to_csv("../data/movies.csv")
movie_genre_matrix

Unnamed: 0_level_0,title,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,no,noir,romance,sci,thriller,war,western,ratings_mean,ratings_count,years
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,3.920930,215.0,1995
2,Jumanji (1995),0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,3.431818,110.0,1995
3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,3.259615,52.0,1995
4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,2.357143,7.0,1995
5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,3.071429,49.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,4.000000,1.0,2017
193583,No Game No Life: Zero (2017),0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,3.500000,1.0,2017
193585,Flint (2017),0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,3.500000,1.0,2017
193587,Bungo Stray Dogs: Dead Apple (2018),1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.500000,1.0,2018


In [146]:
movie_genre_matrix.columns[1:25]

Index(['action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'fi', 'film', 'genres', 'horror',
       'imax', 'listed', 'musical', 'mystery', 'no', 'noir', 'romance', 'sci',
       'thriller', 'war', 'western'],
      dtype='object')