In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

df_movie = pd.read_csv('../dataset/movie.csv')
df_rating = pd.read_csv('../dataset/rating_final.csv')

In [2]:
df_rating.head(10)

Unnamed: 0,movie_id,user_id,rate
0,tt0107692,ur3071695,8.0
1,tt0107692,ur80388571,10.0
2,tt0107692,ur20552756,10.0
3,tt0107692,ur0892646,10.0
4,tt0107692,ur0453068,10.0
5,tt0107692,ur2685454,9.0
6,tt0107692,ur0740195,10.0
7,tt0107692,ur2810112,10.0
8,tt0107692,ur18145747,10.0
9,tt0107692,ur14923871,9.0


**Remove Duplicate Rows**

In [22]:
print(f'Rating has {len(df_rating)} rows, has duplicated rows: {df_rating.duplicated().any()}')
print(f'Movie has {len(df_movie)} rows, has duplicated rows: {df_movie.duplicated().any()}')

df_rating = df_rating.drop_duplicates(keep = 'last')
df_movie = df_movie.drop_duplicates(subset = ['id'], keep = 'last')

print('After removing duplicates rows')
print(f'Rating has {len(df_rating)} rows, has duplicated rows: {df_rating.duplicated().any()}')
print(f'Movie has {len(df_movie)} rows, has duplicated rows: {df_movie.duplicated().any()}')

Rating has 59241 rows, has duplicated rows: False
Movie has 653 rows, has duplicated rows: False
After removing duplicates rows
Rating has 59241 rows, has duplicated rows: False
Movie has 653 rows, has duplicated rows: False


In [23]:
print(f"Sparse ratio: {len(df_rating) / (len(df_rating['movie_id'].unique()) * len(df_rating['user_id'].unique()))})")

Sparse ratio: 0.1209)


Choose best candidates movie, user which has appeared most in dataset

In [24]:
number_of_user = 1000
number_of_movie = 500

user_id_count = Counter(df_rating['user_id'])
movie_ids_count = Counter(df_rating['movie_id'])

user_ids = [u for u, c in user_id_count.most_common(number_of_user)]
movie_ids = [m for m, c in movie_ids_count.most_common(number_of_movie)]

df_small = df_rating[df_rating['user_id'].isin(user_ids) & df_rating['movie_id'].isin(movie_ids)].copy()
df_small = df_small.drop_duplicates(keep='first')

print(len(df_small))
print(f'Sparse ratio: {len(df_small) / (number_of_user * number_of_movie)}')

59241
Sparse ratio: 0.118482


In [20]:
df_small.to_csv('../dataset/rating_final.csv', index = False, sep=',')