In [27]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [28]:
df = pd.read_csv('./ml-latest-small/movies.csv')
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# 장르 TF-IDF
총 영화수

In [29]:
total_count = len(df)
total_count

9742

In [30]:
genres_tmp = [j for i in df.genres for j in i.split('|')]
genres_tmp

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Adventure',
 'Children',
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 'Children',
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Adventure',
 'Animation',
 'Children',
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Crime',
 'Drama',
 'Drama',
 'Romance',
 'Comedy',
 'Comedy',
 'Action',
 'Comedy',
 'Crime',
 'Drama',
 'Thriller',
 'Comedy',
 'Crime',
 'Thriller',
 'Crime',
 'Drama',
 'Horror',
 'Mystery',
 'Thriller',
 'Action',
 'Crime',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Children',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Drama',
 'Fantasy',
 'Mystery',
 'Sci-Fi',
 'Crime',
 'Drama',
 'Drama',
 'Mystery',
 'Sci-Fi',
 'Thriller',
 'Children',
 'Drama',
 'Crime',
 'Drama',
 'Children',
 'Comedy',
 'Comedy',
 'Romance'

In [31]:
# 중복된 장르를 제거하기
total_genres = list(set(genres_tmp))
total_genres

['Action',
 'Crime',
 'Western',
 'Drama',
 'Horror',
 '(no genres listed)',
 'Musical',
 'Fantasy',
 'Romance',
 'Adventure',
 'Comedy',
 'Mystery',
 'Documentary',
 'War',
 'Film-Noir',
 'Sci-Fi',
 'Thriller',
 'IMAX',
 'Animation',
 'Children']

In [32]:
genre_count = {i:genres_tmp.count(i) for i in total_genres}
genre_count

{'Action': 1828,
 'Crime': 1199,
 'Western': 167,
 'Drama': 4361,
 'Horror': 978,
 '(no genres listed)': 34,
 'Musical': 334,
 'Fantasy': 779,
 'Romance': 1596,
 'Adventure': 1263,
 'Comedy': 3756,
 'Mystery': 573,
 'Documentary': 440,
 'War': 382,
 'Film-Noir': 87,
 'Sci-Fi': 980,
 'Thriller': 1894,
 'IMAX': 158,
 'Animation': 611,
 'Children': 664}

# IDF

In [33]:
genre_idf={i: np.log10(total_count/genre_count[i])for i in genre_count}
genre_idf

{'Action': 0.7266719338379385,
 'Crime': 0.9098289421369025,
 'Western': 1.7659316540881678,
 'Drama': 0.3490620385623247,
 'Horror': 0.9983092704481497,
 '(no genres listed)': 2.457169208193496,
 'Musical': 1.4649016584241867,
 'Fantasy': 1.0971106675631865,
 'Romance': 0.7856152382210405,
 'Adventure': 0.8872447746804204,
 'Comedy': 0.4139225416416778,
 'Mystery': 1.2304935032683613,
 'Documentary': 1.3451954487495636,
 'War': 1.4065847623240424,
 'Film-Noir': 2.0491288726171324,
 'Sci-Fi': 0.9974220495432563,
 'Thriller': 0.7112681505684965,
 'IMAX': 1.7899910382813284,
 'Animation': 1.2026069149931968,
 'Children': 1.1664800458677336}

In [37]:
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [44]:
# 가중치구하기
genre_representation=[]
for idx, genre in enumerate(df['genres']):
    dict_tmp = {i:genre_idf[i] for i in genre.split('|')}
    dict_tmp['movieId'] = df['movieId'][idx]
    genre_representation.append(dict_tmp)
    
genre_representation = pd.DataFrame(genre_representation)
genre_representation


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,movieId,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.887245,1.202607,1.16648,0.413923,1.097111,1,,,,,...,,,,,,,,,,
1,0.887245,,1.16648,,1.097111,2,,,,,...,,,,,,,,,,
2,,,,0.413923,,3,0.785615,,,,...,,,,,,,,,,
3,,,,0.413923,,4,0.785615,0.349062,,,...,,,,,,,,,,
4,,,,0.413923,,5,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,,1.202607,,0.413923,1.097111,193581,,,0.726672,,...,,,,,,,,,,
9738,,1.202607,,0.413923,1.097111,193583,,,,,...,,,,,,,,,,
9739,,,,,,193585,,0.349062,,,...,,,,,,,,,,
9740,,1.202607,,,,193587,,,0.726672,,...,,,,,,,,,,


In [46]:
# NaN을 0으로 변경하기
genre_representation = genre_representation.fillna(0)
genre_representation

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,movieId,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.887245,1.202607,1.16648,0.413923,1.097111,1,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.887245,0.000000,1.16648,0.000000,1.097111,2,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.00000,0.413923,0.000000,3,0.785615,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.00000,0.413923,0.000000,4,0.785615,0.349062,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.00000,0.413923,0.000000,5,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.000000,1.202607,0.00000,0.413923,1.097111,193581,0.000000,0.000000,0.726672,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,0.000000,1.202607,0.00000,0.413923,1.097111,193583,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.00000,0.000000,0.000000,193585,0.000000,0.349062,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,0.000000,1.202607,0.00000,0.000000,0.000000,193587,0.000000,0.000000,0.726672,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# 컬럼스로 정열하기
genre_representation = genre_representation.sort_index(axis=1)
genre_representation

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movieId
0,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1
1,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2
2,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0,3
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0,4
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193581
9738,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193583
9739,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193585
9740,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193587


In [52]:
genre_representation = genre_representation.set_index('movieId')
genre_representation

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,0.0,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [53]:
genre_representation.loc[1]

(no genres listed)    0.000000
Action                0.000000
Adventure             0.887245
Animation             1.202607
Children              1.166480
Comedy                0.413923
Crime                 0.000000
Documentary           0.000000
Drama                 0.000000
Fantasy               1.097111
Film-Noir             0.000000
Horror                0.000000
IMAX                  0.000000
Musical               0.000000
Mystery               0.000000
Romance               0.000000
Sci-Fi                0.000000
Thriller              0.000000
War                   0.000000
Western               0.000000
Name: 1, dtype: float64

# tag

In [56]:
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [70]:
#tags.csv : 태그
tags_df=pd.read_csv('./ml-latest-small/tags.csv')
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [71]:
total_movie_count = len(set(tags_df['movieId']))
total_movie_count

1572

In [73]:
total_tags = [i.split(',') for i in tags_df['tag']]
total_tags

[['funny'],
 ['Highly quotable'],
 ['will ferrell'],
 ['Boxing story'],
 ['MMA'],
 ['Tom Hardy'],
 ['drugs'],
 ['Leonardo DiCaprio'],
 ['Martin Scorsese'],
 ['way too long'],
 ['Al Pacino'],
 ['gangster'],
 ['mafia'],
 ['Al Pacino'],
 ['Mafia'],
 ['holocaust'],
 ['true story'],
 ['twist ending'],
 ['Anthony Hopkins'],
 ['courtroom drama'],
 ['twist ending'],
 ['britpop'],
 ['indie record label'],
 ['music'],
 ['dumpster diving'],
 ['Sustainability'],
 ['romantic comedy'],
 ['wedding'],
 ['painter'],
 ['bloody'],
 ['black hole'],
 ['sci-fi'],
 ['time-travel'],
 ['fantasy'],
 ['magic board game'],
 ['Robin Williams'],
 ['beautiful scenery'],
 ['epic'],
 ['historical'],
 ['inspirational'],
 ['Medieval'],
 ['mel gibson'],
 ['Oscar (Best Cinematography)'],
 ['revenge'],
 ['sword fight'],
 ['black comedy'],
 ['Christina Ricci'],
 ['Christopher Lloyd'],
 ['dark comedy'],
 ['family'],
 ['gothic'],
 ['Al Pacino'],
 ['Andy Garcia'],
 ['Classic'],
 ['Francis Ford Coppola'],
 ['mafia'],
 ['black c

In [88]:
total_tags = [j.strip() for i in total_tags for j in i]
total_tags

['funny',
 'Highly quotable',
 'will ferrell',
 'Boxing story',
 'MMA',
 'Tom Hardy',
 'drugs',
 'Leonardo DiCaprio',
 'Martin Scorsese',
 'way too long',
 'Al Pacino',
 'gangster',
 'mafia',
 'Al Pacino',
 'Mafia',
 'holocaust',
 'true story',
 'twist ending',
 'Anthony Hopkins',
 'courtroom drama',
 'twist ending',
 'britpop',
 'indie record label',
 'music',
 'dumpster diving',
 'Sustainability',
 'romantic comedy',
 'wedding',
 'painter',
 'bloody',
 'black hole',
 'sci-fi',
 'time-travel',
 'fantasy',
 'magic board game',
 'Robin Williams',
 'beautiful scenery',
 'epic',
 'historical',
 'inspirational',
 'Medieval',
 'mel gibson',
 'Oscar (Best Cinematography)',
 'revenge',
 'sword fight',
 'black comedy',
 'Christina Ricci',
 'Christopher Lloyd',
 'dark comedy',
 'family',
 'gothic',
 'Al Pacino',
 'Andy Garcia',
 'Classic',
 'Francis Ford Coppola',
 'mafia',
 'black comedy',
 'Christina Ricci',
 'Christopher Lloyd',
 'Family',
 'gothic',
 'quirky',
 'family',
 'funny',
 'Macaula

In [89]:
len(total_tags)

3683

In [90]:
unique_tags = list(set(total_tags))
unique_tags

['superb soundtrack',
 'steve carell',
 'amazing dialogues',
 'poorly paced',
 'Star Wars',
 'los angeles',
 'ogres',
 'smart',
 'Heroic Bloodshed',
 'domestic violence',
 'stupid but funny',
 'ransom',
 'immigration',
 'Jason Biggs',
 'Queen Victoria',
 'mental hospital',
 'mythology',
 'Recap',
 'space',
 'Grace',
 'good and evil',
 'Bill Murray',
 'dancing',
 'Poor story',
 'joss whedon',
 'brutal',
 'bears',
 'inspirational',
 'nuns',
 'deaf',
 'Great movie',
 'Day and Hudson',
 'college',
 'Charlotte Bronte',
 'soundtrack',
 'philosopical',
 'Magneto',
 'building a family',
 'British',
 'cool',
 'societal criticism',
 'quick cuts',
 'lovely',
 'Steven Spielberg',
 'unpredictable',
 'drug abuse',
 'innovative',
 'rug',
 'interesting scenario',
 'Toto',
 'representation of children',
 'narnia',
 'Tom Clancy',
 'sweet',
 '2D animation',
 'children',
 'big name actors',
 'beautiful scenery',
 'special effects',
 'gun-fu',
 'artistic',
 'bad-ass',
 'muppets',
 '1900s',
 'fun family mov

In [91]:
len(unique_tags)

1589

In [92]:
tag_tf = {i:total_tags.count(i) for i in unique_tags }
tag_tf

{'superb soundtrack': 1,
 'steve carell': 1,
 'amazing dialogues': 1,
 'poorly paced': 1,
 'Star Wars': 2,
 'los angeles': 1,
 'ogres': 1,
 'smart': 3,
 'Heroic Bloodshed': 1,
 'domestic violence': 1,
 'stupid but funny': 1,
 'ransom': 1,
 'immigration': 1,
 'Jason Biggs': 1,
 'Queen Victoria': 1,
 'mental hospital': 1,
 'mythology': 1,
 'Recap': 1,
 'space': 14,
 'Grace': 1,
 'good and evil': 1,
 'Bill Murray': 1,
 'dancing': 1,
 'Poor story': 1,
 'joss whedon': 1,
 'brutal': 2,
 'bears': 1,
 'inspirational': 7,
 'nuns': 1,
 'deaf': 1,
 'Great movie': 1,
 'Day and Hudson': 1,
 'college': 2,
 'Charlotte Bronte': 1,
 'soundtrack': 4,
 'philosopical': 1,
 'Magneto': 1,
 'building a family': 1,
 'British': 1,
 'cool': 1,
 'societal criticism': 1,
 'quick cuts': 1,
 'lovely': 1,
 'Steven Spielberg': 2,
 'unpredictable': 1,
 'drug abuse': 2,
 'innovative': 1,
 'rug': 1,
 'interesting scenario': 1,
 'Toto': 1,
 'representation of children': 1,
 'narnia': 1,
 'Tom Clancy': 2,
 'sweet': 3,
 '2

In [93]:
tag_idf={i: np.log10(total_movie_count/tag_tf[i]) for i in unique_tags}
tag_idf

{'superb soundtrack': 3.196452541703389,
 'steve carell': 3.196452541703389,
 'amazing dialogues': 3.196452541703389,
 'poorly paced': 3.196452541703389,
 'Star Wars': 2.895422546039408,
 'los angeles': 3.196452541703389,
 'ogres': 3.196452541703389,
 'smart': 2.7193312869837265,
 'Heroic Bloodshed': 3.196452541703389,
 'domestic violence': 3.196452541703389,
 'stupid but funny': 3.196452541703389,
 'ransom': 3.196452541703389,
 'immigration': 3.196452541703389,
 'Jason Biggs': 3.196452541703389,
 'Queen Victoria': 3.196452541703389,
 'mental hospital': 3.196452541703389,
 'mythology': 3.196452541703389,
 'Recap': 3.196452541703389,
 'space': 2.050324506025151,
 'Grace': 3.196452541703389,
 'good and evil': 3.196452541703389,
 'Bill Murray': 3.196452541703389,
 'dancing': 3.196452541703389,
 'Poor story': 3.196452541703389,
 'joss whedon': 3.196452541703389,
 'brutal': 2.895422546039408,
 'bears': 3.196452541703389,
 'inspirational': 2.351354501689132,
 'nuns': 3.196452541703389,
 'dea

In [94]:
tag_tmp=tags_df.sort_values('movieId').reset_index(drop=True)
tag_tmp

Unnamed: 0,userId,movieId,tag,timestamp
0,567,1,fun,1525286013
1,474,1,pixar,1137206825
2,336,1,pixar,1139045764
3,62,2,Robin Williams,1528843907
4,62,2,magic board game,1528843932
...,...,...,...,...
3678,62,187595,star wars,1528934552
3679,184,193565,comedy,1537098587
3680,184,193565,anime,1537098582
3681,184,193565,remaster,1537098592


In [101]:
tag_dict = {}
tmp = 0
for idx,i in enumerate(tag_tmp['movieId']):
    if tmp != i:
        tmp = i
        tag_dict[i]=[]
    tag_dict[i].append(tag_tmp['tag'][idx].strip())
tag_dict

{1: ['fun', 'pixar', 'pixar'],
 2: ['Robin Williams', 'magic board game', 'fantasy', 'game'],
 3: ['old', 'moldy'],
 5: ['remake', 'pregnancy'],
 7: ['remake'],
 11: ['president', 'politics'],
 14: ['politics', 'president'],
 16: ['Mafia'],
 17: ['Jane Austen'],
 21: ['Hollywood'],
 22: ['serial killer'],
 25: ['alcoholism'],
 26: ['Shakespeare'],
 28: ['Jane Austen', 'In Netflix queue'],
 29: ['kidnapping'],
 31: ['teacher', 'high school'],
 32: ['Brad Pitt',
  'Bruce Willis',
  'mindfuck',
  'twist ending',
  'time travel',
  'Post apocalyptic',
  'remake',
  'post-apocalyptic',
  'time travel',
  'time travel'],
 34: ['Animal movie',
  'pigs',
  'villain nonexistent or not needed for good story'],
 36: ['death penalty', 'Nun'],
 38: ['twins'],
 39: ['Emma',
  'Jane Austen',
  'seen more than once',
  'quotable',
  'Paul Rudd',
  'funny',
  'chick flick'],
 40: ['South Africa', 'In Netflix queue'],
 41: ['Shakespeare'],
 43: ['England'],
 45: ['Journalism'],
 46: ['wedding'],
 47: ['

In [116]:
tag_representation = []
for movie_id, tags in tag_dict.items():
    dict_tmp = {i:tag_idf[i] for i in tags}
    dict_tmp['movieId']=movie_id
    tag_representation.append(dict_tmp)
tag_representation

[{'fun': 2.4974825373673704, 'pixar': 2.895422546039408, 'movieId': 1},
 {'Robin Williams': 2.7193312869837265,
  'magic board game': 3.196452541703389,
  'fantasy': 2.4183012913197452,
  'game': 3.196452541703389,
  'movieId': 2},
 {'old': 3.196452541703389, 'moldy': 3.196452541703389, 'movieId': 3},
 {'remake': 2.196452541703389, 'pregnancy': 2.4183012913197452, 'movieId': 5},
 {'remake': 2.196452541703389, 'movieId': 7},
 {'president': 2.5943925503754266,
  'politics': 1.941180036600083,
  'movieId': 11},
 {'politics': 1.941180036600083,
  'president': 2.5943925503754266,
  'movieId': 14},
 {'Mafia': 2.196452541703389, 'movieId': 16},
 {'Jane Austen': 2.5943925503754266, 'movieId': 17},
 {'Hollywood': 2.7193312869837265, 'movieId': 21},
 {'serial killer': 2.2933625547114453, 'movieId': 22},
 {'alcoholism': 2.4974825373673704, 'movieId': 25},
 {'Shakespeare': 2.1172712956557644, 'movieId': 26},
 {'Jane Austen': 2.5943925503754266,
  'In Netflix queue': 1.0791812460476249,
  'movieId'

In [117]:
tag_representation = pd.DataFrame(tag_representation)
tag_representation

Unnamed: 0,fun,pixar,movieId,Robin Williams,magic board game,fantasy,game,old,moldy,remake,...,Dwayne Johnson,bad music,Rachel McAdams,Alicia Vikander,video game adaptation,Josh Brolin,Emilia Clarke,star wars,remaster,gintama
0,2.497483,2.895423,1,,,,,,,,...,,,,,,,,,,
1,,,2,2.719331,3.196453,2.418301,3.196453,,,,...,,,,,,,,,,
2,,,3,,,,,3.196453,3.196453,,...,,,,,,,,,,
3,,,5,,,,,,,2.196453,...,,,,,,,,,,
4,,,7,,,,,,,2.196453,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,,,183611,,,,,,,,...,,,3.196453,,,,,,,
1568,,,184471,,,,,,,,...,,,,3.196453,3.196453,,,,,
1569,,,187593,,,,,,,,...,,,,,,3.196453,,,,
1570,,,187595,,,,,,,,...,,,,,,,3.196453,3.196453,,


In [118]:
tag_representation = tag_representation.set_index('movieId',drop=True)
tag_representation

Unnamed: 0_level_0,fun,pixar,Robin Williams,magic board game,fantasy,game,old,moldy,remake,pregnancy,...,Dwayne Johnson,bad music,Rachel McAdams,Alicia Vikander,video game adaptation,Josh Brolin,Emilia Clarke,star wars,remaster,gintama
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.497483,2.895423,,,,,,,,,...,,,,,,,,,,
2,,,2.719331,3.196453,2.418301,3.196453,,,,,...,,,,,,,,,,
3,,,,,,,3.196453,3.196453,,,...,,,,,,,,,,
5,,,,,,,,,2.196453,2.418301,...,,,,,,,,,,
7,,,,,,,,,2.196453,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,3.196453,,,,,,,
184471,,,,,,,,,,,...,,,,3.196453,3.196453,,,,,
187593,,,,,,,,,,,...,,,,,,3.196453,,,,
187595,,,,,,,,,,,...,,,,,,,3.196453,3.196453,,


In [119]:
tag_representation = tag_representation.sort_index(axis=1)
tag_representation

Unnamed: 0_level_0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [120]:
tag_representation = tag_representation.fillna(0)
tag_representation

Unnamed: 0_level_0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
184471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
movie_representation

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(movie_representation, movie_representation)

In [123]:
cos_sim

array([[1.        , 0.12443836, 0.00840264, ..., 0.        , 0.23255287,
        0.09351939],
       [0.12443836, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00840264, 0.        , 1.        , ..., 0.        , 0.        ,
        0.08984921],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.23255287, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.09351939, 0.        , 0.08984921, ..., 0.        , 0.        ,
        1.        ]])

In [124]:
result_df = pd.DataFrame(data=cos_sim, index=movie_representation.index, columns=movie_representation.index)
result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.124438,0.008403,0.040571,0.011755,0.000000,0.016339,0.331122,0.000000,0.131794,...,0.064466,0.260941,0.071492,0.271710,0.0,0.348295,0.379492,0.000000,0.232553,0.093519
2,0.124438,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.240843,0.000000,0.095861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.108082,0.117763,0.000000,0.000000,0.000000
3,0.008403,0.000000,1.000000,0.179391,0.011294,0.000000,0.072246,0.000000,0.000000,0.000000,...,0.006560,0.000000,0.068686,0.000000,0.0,0.020322,0.022142,0.000000,0.000000,0.089849
4,0.040571,0.000000,0.179391,1.000000,0.054530,0.000000,0.348828,0.000000,0.000000,0.000000,...,0.031674,0.101979,0.567487,0.000000,0.0,0.098119,0.106908,0.365843,0.000000,0.433821
5,0.011755,0.000000,0.011294,0.054530,1.000000,0.000000,0.640342,0.000000,0.000000,0.000000,...,0.009177,0.000000,0.096091,0.000000,0.0,0.028429,0.030976,0.000000,0.000000,0.125697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.348295,0.108082,0.020322,0.098119,0.028429,0.211466,0.039515,0.000000,0.397065,0.213809,...,0.206804,0.631077,0.172901,0.657123,0.0,1.000000,0.917791,0.000000,0.767770,0.226174
193583,0.379492,0.117763,0.022142,0.106908,0.030976,0.000000,0.043055,0.000000,0.000000,0.000000,...,0.169874,0.687605,0.188388,0.715984,0.0,0.917791,1.000000,0.000000,0.612800,0.246433
193585,0.000000,0.000000,0.000000,0.365843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278750,0.644671,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.232553,0.000000,0.000000,0.000000,0.000000,0.275428,0.000000,0.000000,0.517166,0.278480,...,0.247849,0.821961,0.000000,0.855885,0.0,0.767770,0.612800,0.000000,1.000000,0.000000


In [128]:
result_df[1].sort_values(ascending=False)

movieId
1         1.000000
122918    0.542857
136016    0.503643
65577     0.503643
2294      0.503643
            ...   
26195     0.000000
26176     0.000000
26172     0.000000
26169     0.000000
7301      0.000000
Name: 1, Length: 9742, dtype: float64

In [129]:
print(df.set_index('movieId').loc[1])
print('---------------------------------------------------------------')
print(df.set_index('movieId').loc[122918])
print('---------------------------------------------------------------')
print(df.set_index('movieId').loc[136016])
print('---------------------------------------------------------------')
print(df.set_index('movieId').loc[65577])
print('---------------------------------------------------------------')
print(df.set_index('movieId').loc[2294])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
---------------------------------------------------------------
title     Guardians of the Galaxy 2 (2017)
genres             Action|Adventure|Sci-Fi
Name: 122918, dtype: object
---------------------------------------------------------------
title                        The Good Dinosaur (2015)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 136016, dtype: object
---------------------------------------------------------------
title                  Tale of Despereaux, The (2008)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 65577, dtype: object
---------------------------------------------------------------
title                                     Antz (1998)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 2294, dtype: object
