In [100]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

df_rating = pd.read_csv('../dataset/rating_final.csv')
df_movie = pd.read_csv('../dataset/movie.csv')

In [101]:
df_movie.head(10)

Unnamed: 0,id,name,url,poster,average_rating
0,tt0107692,Ninja Scroll,https://www.imdb.com/title/tt0107692,https://m.media-amazon.com/images/M/MV5BNmEwYm...,7.8
1,tt0106364,Batman: Mask of the Phantasm,https://www.imdb.com/title/tt0106364,https://m.media-amazon.com/images/M/MV5BYTRiMW...,7.8
2,tt0275277,Cowboy Bebop: The Movie,https://www.imdb.com/title/tt0275277,https://m.media-amazon.com/images/M/MV5BNTE5YT...,7.8
3,tt0291350,Millennium Actress,https://www.imdb.com/title/tt0291350,https://m.media-amazon.com/images/M/MV5BNmY4ND...,7.8
4,tt0388473,Tokyo Godfathers,https://www.imdb.com/title/tt0388473,https://m.media-amazon.com/images/S/sash/4Fyxw...,7.8
5,tt2321405,My Life as a Zucchini,https://www.imdb.com/title/tt2321405,https://m.media-amazon.com/images/S/sash/4Fyxw...,7.8
6,tt0327597,Coraline,https://www.imdb.com/title/tt0327597,https://m.media-amazon.com/images/S/sash/4Fyxw...,7.7
7,tt0398286,Tangled,https://www.imdb.com/title/tt0398286,https://m.media-amazon.com/images/S/sash/4Fyxw...,7.7
8,tt1490017,The Lego Movie,https://www.imdb.com/title/tt1490017,https://m.media-amazon.com/images/S/sash/4Fyxw...,7.7
9,tt1979376,Toy Story 4,https://www.imdb.com/title/tt1979376,https://m.media-amazon.com/images/S/sash/4Fyxw...,7.7


In [102]:
df_rating.head(10)

Unnamed: 0,movie_id,user_id,rate
0,tt0032455,ur156820301,5.0
1,tt0032455,ur89008813,3.0
2,tt0104652,ur0281280,6.0
3,tt0104652,ur104436023,2.0
4,tt0104652,ur94900494,10.0
5,tt3901826,ur58133658,10.0
6,tt1488589,ur57257775,6.0
7,tt0441773,ur66705434,9.0
8,tt0097757,ur1192901,10.0
9,tt7979580,ur0337549,2.0


**Remove Duplicate Rows**

In [103]:
print(f'Rating has {len(df_rating)} rows, has duplicated rows: {df_rating.duplicated().any()}')
print(f'Movie has {len(df_movie)} rows, has duplicated rows: {df_movie.duplicated().any()}')

df_rating = df_rating.drop_duplicates(keep = 'last')
df_movie = df_movie.drop_duplicates(subset = ['id'], keep = 'last')

print('After removing duplicates rows')
print(f'Rating has {len(df_rating)} rows, has duplicated rows: {df_rating.duplicated().any()}')
print(f'Movie has {len(df_movie)} rows, has duplicated rows: {df_movie.duplicated().any()}')

Rating has 59241 rows, has duplicated rows: False
Movie has 653 rows, has duplicated rows: False
After removing duplicates rows
Rating has 59241 rows, has duplicated rows: False
Movie has 653 rows, has duplicated rows: False


In [104]:
print(f"Sparse ratio: {len(df_rating) / (len(df_rating['movie_id'].unique()) * len(df_rating['user_id'].unique()))})")

Sparse ratio: 0.1209)


Choose best candidates movie, user which has appeared most in dataset

In [105]:
number_of_user = 1000
number_of_movie = 500

user_id_count = Counter(df_rating['user_id'])
movie_ids_count = Counter(df_rating['movie_id'])

user_ids = [u for u, c in user_id_count.most_common(number_of_user)]
movie_ids = [m for m, c in movie_ids_count.most_common(number_of_movie)]

df_small = df_rating[df_rating['user_id'].isin(user_ids) & df_rating['movie_id'].isin(movie_ids)].copy()
df_small = df_small.drop_duplicates(keep='first')

df_movie_small = df_movie[df_movie['id'].isin(movie_ids)].drop('url', axis = 1, inplace = False)
df_user_small = pd.DataFrame([[u_id, u_id] for u_id in user_ids], columns=['id', 'username'])

print(len(df_small))
print(f'Sparse ratio: {len(df_small) / (number_of_user * number_of_movie)}')

59241
Sparse ratio: 0.118482


In [106]:
df_small.to_csv('../dataset/rating_final.csv', index = False, sep=',')
df_movie_small.to_csv('../dataset/movie_final.csv', index = False, sep=',')
df_user_small.to_csv('../dataset/user_final.csv', index = False, sep=',')