In [1]:
# Import required packages
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [43]:
# Read one of the files for test purposes
df_file_1 = pd.read_csv('combined_data_1.txt', header=None, names=['customer_id', 'rating', 'date_given'])

In [44]:
df_file_1.head()

Unnamed: 0,customer_id,rating,date_given
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [45]:
# Drop the date_given column for now and convert rating column to float
df_file_1.drop(columns=['date_given'], inplace=True)

df_file_1['rating'] = df_file_1['rating'].astype('float')

In [46]:
# Find the indices of NaN values in the 'rating' column
nan_indices = df_file_1[df_file_1['rating'].isnull()].index

# Initialize movie_np array with zeros
movie_np = np.zeros(len(df_file_1), dtype=int)

# Assign movie ids to consecutive NaN values
movie_id = 1
prev_index = nan_indices[0]
for index in nan_indices[1:]:
    movie_np[prev_index+1:index] = movie_id
    movie_id += 1
    prev_index = index

# Assign movie id to the remaining NaN values after the last NaN index
movie_np[nan_indices[-1]+1:] = movie_id

# Remove leading zeros if any
movie_np = movie_np[movie_np != 0]

In [47]:
df_file_1.dropna(subset=['rating'], inplace=True)
df_file_1['movie_id'] = movie_np.astype(int)
df_file_1['customer_id'] = df_file_1['customer_id'].astype(int)

In [48]:
df_file_1.tail()

Unnamed: 0,customer_id,rating,movie_id
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499
24058262,1704416,3.0,4499


In [49]:
funcs = ['count','mean']

df_movie_group = df_file_1.groupby('movie_id')['rating'].agg(funcs)
movie_count_sliced = round(df_movie_group['count'].quantile(0.8), 0)
movie_sliced_idx = df_movie_group[df_movie_group['count'] < movie_count_sliced].index

df_cust_group = df_file_1.groupby('customer_id')['rating'].agg(funcs)
cust_count_sliced = round(df_cust_group['count'].quantile(0.8), 0)
cust_sliced_idx = df_cust_group[df_cust_group['count'] < cust_count_sliced].index

In [50]:
cust_count_sliced

79.0

In [51]:
sliced_mvis = df_file_1['movie_id'].isin(movie_sliced_idx)
df_file_1 = df_file_1[~sliced_mvis]
sliced_cust = df_file_1['customer_id'].isin(cust_sliced_idx)
df_file_1 = df_file_1[~sliced_cust]

In [53]:
df_file_1.shape

(13528427, 3)

In [54]:
pivot_df = pd.pivot_table(df_file_1, values='rating', index='customer_id', columns='movie_id')

In [60]:
df_movie_titles = pd.read_csv('movie_titles.csv', encoding='ISO-8859-1', header=None, names=['movie_id', 'released_year','movie_name', 'dum','dum2', 'dum3'])

In [66]:
df_movie_titles[~df_movie_titles['dum'].isna()]

Unnamed: 0,movie_id,released_year,movie_name,dum,dum2,dum3
71,72,1974.0,At Home Among Strangers,A Stranger Among His Own,,
263,264,2002.0,Angelina Ballerina: Lights,Camera,Action!,
349,350,1993.0,Dr. Quinn,Medicine Woman: Season 3,,
365,366,2004.0,Still,We Believe: The Boston Red Sox Movie,,
393,394,1916.0,20,000 Leagues Under the Sea,,
...,...,...,...,...,...,...
17346,17347,2002.0,Read-Along: Monsters,Inc.,,
17418,17419,1974.0,It's the Easter Beagle,Charlie Brown,,
17572,17573,1981.0,Fort Apache,the Bronx,,
17596,17597,1969.0,They Shoot Horses,Don't They?,,
