In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load the data

In [2]:
# Read the teleplay catalogue and the user ratings from the working directory.
teleplay_data = pd.read_csv('Teleplay.csv', sep=',')
rating_data = pd.read_csv('Rating.csv', sep=',')

# Keep only rows whose rating is positive, restricted to the three columns used later.
has_rating = rating_data['rating'] > 0
rating_data = rating_data.loc[has_rating, ['user_id', 'teleplay_id', 'rating']]

# Drop teleplays with any missing field, then renumber the rows from 0.
teleplay_data = teleplay_data.dropna(how='any', axis=0).reset_index(drop=True)


## Merge genre_id to teleplay_id

In [3]:
# Give every distinct genre in Teleplay.csv a 1-based id, in order of first appearance.
# `genres` keeps that order (genres[i] has id i + 1); `genre_to_id` provides constant-time
# lookups so the list does not have to be scanned with `in` / `.index()` for every genre.
genres = []
genre_to_id = {}
for genre_field in teleplay_data['genre']:
    for raw_genre in genre_field.split(","):
        genre = raw_genre.strip()
        if genre not in genre_to_id:
            genres.append(genre)
            genre_to_id[genre] = str(len(genres))

# Not referenced by any cell visible here; kept so that code relying on the name still runs.
genreid_per_movie = set()

# teleplay_id -> space-separated genre ids, in the order the genres appear in the row.
# Rows sharing a teleplay_id have their ids appended to the same entry.
genres_dict = {}
for tid, genre_field in zip(teleplay_data['teleplay_id'], teleplay_data['genre']):
    row_genre_ids = " ".join(genre_to_id[g.strip()] for g in genre_field.split(","))
    if tid in genres_dict:
        genres_dict[tid] = genres_dict[tid] + " " + row_genre_ids
    else:
        genres_dict[tid] = row_genre_ids

# Attach the genre-id string of each teleplay to teleplay_data.
teleplay_data['genre_id'] = teleplay_data['teleplay_id'].map(genres_dict)
teleplay_data.head()

Unnamed: 0,teleplay_id,name,genre,type,episodes,rating,members,genre_id
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",long,1,9.37,200630,1 2 3 4
1,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",medium,51,9.25,114262,5 6 7 8 9 10 11
2,9253,Steins;Gate,"Sci-Fi, Thriller",medium,24,9.17,673572,10 12
3,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",medium,51,9.16,151266,5 6 7 8 9 10 11
4,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",medium,10,9.15,93351,6 1 3 11 13


## Genre ids per teleplay

In [4]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

# One genre-id string per teleplay_id, exposed as a Series named 'movie_genres'.
df_genres_per_movie = (
    teleplay_data
    .groupby('teleplay_id')['genre_id']
    .agg(_concatenate_tags_of_movie)
    .rename('movie_genres')
)

df_genres_per_movie.head()

teleplay_id
1    5 14 6 1 10 17
5      5 1 21 10 17
6            5 6 10
7    5 1 33 21 27 4
8        14 18 11 4
Name: movie_genres, dtype: object

## Aggregate ratings per teleplay (mean, median, count) and merge with the teleplay data

In [13]:
# Per-teleplay rating statistics: mean, median and number of ratings.
df_avg_ratings = (
    rating_data
    .groupby('teleplay_id')['rating']
    .agg(['mean', 'median', 'size'])
    .rename(columns={'mean': 'rating_mean', 'median': 'rating_median', 'size': 'num_ratings'})
    .reset_index()
)

# Attach the statistics to the catalogue; rows with any missing value
# (including teleplays that received no rating) are dropped.
df_teleplay_with_ratings = (
    teleplay_data
    .merge(df_avg_ratings, how='left', on='teleplay_id')
    .dropna(how='any', axis=0)
    .reset_index(drop=True)
)

# Add the per-teleplay genre-id string.
df_data = df_teleplay_with_ratings.merge(df_genres_per_movie, how='left', on='teleplay_id')

# Keep only teleplays that have genres and drop the columns not needed downstream.
has_genres = df_data['movie_genres'].notnull()
df_data_with_genres = (
    df_data[has_genres]
    .reset_index(drop=True)
    .drop(['type', 'episodes', 'rating', 'members', 'genre_id'], axis=1)
)
df_data_with_genres

Unnamed: 0,teleplay_id,name,genre,rating_mean,rating_median,num_ratings,movie_genres
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.426313,10.0,1961.0,1 2 3 4
1,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.449495,10.0,1188.0,5 6 7 8 9 10 11
2,9253,Steins;Gate,"Sci-Fi, Thriller",9.261326,10.0,17151.0,10 12
3,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.272552,10.0,3115.0,5 6 7 8 9 10 11
4,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",9.171484,9.0,1038.0,6 1 3 11 13
...,...,...,...,...,...,...,...
8195,5541,The Satisfaction,Restricted,1.000000,1.0,2.0,41
8196,9316,Toushindai My Lover: Minami tai Mecha-Minami,Restricted,4.000000,4.0,2.0,41
8197,5543,Under World,Restricted,2.500000,2.5,2.0,41
8198,5621,Violence Gekiga David no Hoshi,Restricted,6.000000,6.0,1.0,41


## TF-IDF vectors

In [6]:
# Describe every teleplay by the TF-IDF weights of the tokens in its genre text.
# NOTE(review): with the default TfidfVectorizer settings, punctuation and spaces act as
# token separators, so multi-word genres such as "Sci-Fi" are presumably split into
# several tokens - confirm this is intended.
tf_idf = TfidfVectorizer()
df_movies_tf_idf_described = tf_idf.fit_transform(df_data_with_genres.genre)

# Teleplay-to-teleplay cosine similarity. The matrix is computed once and reused for the
# DataFrame instead of running the same pairwise computation a second time.
# NOTE(review): the frame may share memory with `m2m`; copy if either is mutated later.
m2m = cosine_similarity(df_movies_tf_idf_described)
df_tfidf_m2m = pd.DataFrame(m2m)
df_tfidf_m2m

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8190,8191,8192,8193,8194,8195,8196,8197,8198,8199
0,1.000000,0.000000,0.000000,0.000000,0.425402,0.000000,0.164436,0.000000,0.000000,0.549804,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,1.000000,0.213644,1.000000,0.175547,0.187201,0.190782,1.000000,1.000000,0.000000,...,0.409109,0.104628,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.213644,1.000000,0.213644,0.000000,0.000000,0.269367,0.213644,0.213644,0.000000,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,1.000000,0.213644,1.000000,0.175547,0.187201,0.190782,1.000000,1.000000,0.000000,...,0.409109,0.104628,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.425402,0.175547,0.000000,0.175547,1.000000,0.154327,0.148082,0.175547,0.175547,0.138202,...,0.096492,0.147594,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8195,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8196,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8197,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8198,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Use teleplay id as index

In [7]:
# Relabel the similarity matrix with teleplay ids instead of row positions.
# Row/column i of df_tfidf_m2m corresponds to row i of df_data_with_genres, so the labels
# are assigned by position. This keeps the cell idempotent: looking labels up from the
# current column names would fail on a second run, once the columns already hold ids.
index_to_teleplay_id = df_data_with_genres['teleplay_id']
# Column labels are the ids as strings; row labels keep the ids' original dtype.
df_tfidf_m2m.columns = index_to_teleplay_id.astype(str).values
df_tfidf_m2m.index = index_to_teleplay_id.values
df_tfidf_m2m

Unnamed: 0,32281,28977,9253,9969,32935,11061,820,15335,15417,4181,...,18197,12397,17833,10368,9352,5541,9316,5543,5621,6133
32281,1.000000,0.000000,0.000000,0.000000,0.425402,0.000000,0.164436,0.000000,0.000000,0.549804,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28977,0.000000,1.000000,0.213644,1.000000,0.175547,0.187201,0.190782,1.000000,1.000000,0.000000,...,0.409109,0.104628,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9253,0.000000,0.213644,1.000000,0.213644,0.000000,0.000000,0.269367,0.213644,0.213644,0.000000,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9969,0.000000,1.000000,0.213644,1.000000,0.175547,0.187201,0.190782,1.000000,1.000000,0.000000,...,0.409109,0.104628,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32935,0.425402,0.175547,0.000000,0.175547,1.000000,0.154327,0.148082,0.175547,0.175547,0.138202,...,0.096492,0.147594,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5541,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5543,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5621,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.558980,0.855010,0.49101,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Teleplays most similar to the first teleplay

In [8]:
# Ten highest similarity scores in the first row of the matrix, best first.
first_teleplay_scores = df_tfidf_m2m.iloc[0]
first_teleplay_scores.sort_values(ascending=False).head(10)

32281    1.000000
547      1.000000
546      1.000000
14669    0.953518
10067    0.876060
20903    0.876060
355      0.866504
2787     0.866504
6572     0.866504
713      0.855518
Name: 32281, dtype: float64

## Find user 53698's favourite teleplays

In [9]:

# All ratings given by user 53698, highest rating first.
is_user_53698 = rating_data['user_id'] == 53698
favourite_teleplay_ids = (
    rating_data
    .loc[is_user_53698, ['user_id', 'teleplay_id', 'rating']]
    .sort_values(['rating'], ascending=False)
)

# Show the ten top-rated entries, followed by a blank line.
print(favourite_teleplay_ids.head(10), end='\n\n')

# `index` holds the teleplay_id of the first row after sorting; it is used as the
# row label when looking up that teleplay's similarity scores.
index = favourite_teleplay_ids.iloc[0]['teleplay_id']
print(index)

# Ten highest similarity scores for that teleplay.
df_tfidf_m2m.loc[index].sort_values(ascending=False).head(10)

         user_id  teleplay_id  rating
5728876    53698          587      10
5728932    53698          694      10
5728884    53698          610      10
5728804    53698          433      10
5729313    53698         2105      10
5729315    53698         2129      10
5728647    53698          101      10
5729329    53698         2167      10
5730397    53698        10380      10
5728631    53698           59      10

587


7578     1.0
1847     1.0
3484     1.0
1219     1.0
3483     1.0
2459     1.0
1552     1.0
587      1.0
10723    1.0
2460     1.0
Name: 587, dtype: float64

In [10]:
# Catalogue entry for teleplay 587, the id printed by the previous cell.
df_data_with_genres.loc[df_data_with_genres['teleplay_id'] == 587]

Unnamed: 0,teleplay_id,name,genre,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_genres
881,587,Hanbun no Tsuki ga Noboru Sora,"Comedy, Drama, Romance",7.821004,8.0,2609.0,6 1 2


In [11]:
# Catalogue entry for teleplay 7578, listed with similarity 1.0 to teleplay 587.
df_data_with_genres.loc[df_data_with_genres['teleplay_id'] == 7578]

Unnamed: 0,teleplay_id,name,genre,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_genres
4001,7578,Gokinjo Monogatari the Movie,"Comedy, Drama, Romance",6.689655,6.0,29.0,6 1 2


In [12]:
# Catalogue entry for teleplay 1847, listed with similarity 1.0 to teleplay 587.
df_data_with_genres.loc[df_data_with_genres['teleplay_id'] == 1847]

Unnamed: 0,teleplay_id,name,genre,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_genres
1886,1847,Rec: Yurusarezarumono,"Comedy, Drama, Romance",7.457399,7.0,892.0,6 1 2
