In [None]:
#Importing necessary libraries
from surprise import NMF
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
import pandas as pd
import numpy as np
# Load Data
ml1m_dir = 'ratings.dat'
ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'],  engine='python')


In [2]:
# Reindex and cleaning data to accurate format
user_id = ml1m_rating[['uid']].drop_duplicates().reindex()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]


In [3]:
# Convert the timestamp column to a datetime object
ml1m_rating['date'] = pd.to_datetime(ml1m_rating['timestamp'], unit='s')  # Assuming Unix timestamp in seconds
# Extract the date and create a new column
ml1m_rating['date'] = ml1m_rating['date'].dt.date
df = ml1m_rating


In [4]:
# What the current data looks like
df

Unnamed: 0,userId,itemId,rating,timestamp,date
0,0,0,5,978300760,2000-12-31
1,0,1,3,978302109,2000-12-31
2,0,2,3,978301968,2000-12-31
3,0,3,4,978300275,2000-12-31
4,0,4,5,978824291,2001-01-06
...,...,...,...,...,...
1000204,6039,772,1,956716541,2000-04-26
1000205,6039,1106,5,956704887,2000-04-25
1000206,6039,365,5,956704746,2000-04-25
1000207,6039,152,4,956715648,2000-04-26


In [5]:
# Create columns for different ypes of users
df['Weak_user'] = np.nan
df['Average_user'] = np.nan
df['Strong_user'] = np.nan
df['Weak_item'] = np.nan
df['Average_item'] = np.nan
df['Strong_item'] = np.nan


In [7]:
# Define the thresholds for defining weak/average/strong users, and weak/average/strong items for each rating
# Rmin/Rmax - Min/Max of all ratings
# T1_u/T2_u - Threshold for weak user(Below T1_u) and strong user(Above T2_u); Average user(In between)
# T1_i/T2_i - Threshold for weak item(Below T1_u) and strong item(Above T2_u); Average item(In between)
# T1/T2     -
Rmin = df['rating'].min()  #
Rmax = df['rating'].max()  #
T1_u = Rmin + round((1/3) * (Rmax - Rmin)) #
T2_u = Rmax - round((1/3) * (Rmax - Rmin))#
T1_i = Rmin + round((1/3) * (Rmax - Rmin))#
T2_i = Rmax - round((1/3) * (Rmax - Rmin))#
T1 =   Rmin + round((1/3) * (Rmax - Rmin))
T2 =  Rmax - round((1/3) * (Rmax - Rmin))


In [8]:
# Classify users and items into Weak/Average/Strong respectively  for each rating
for index, row in df.iterrows():
    user_rating = row['rating']
    if movie_rating < T1_u:
        df.loc[index,'Weak_movie'] = 1
    elif T1_u <= user_rating < T2_u:
        df.loc[index,'Average_user'] = 1
    else:
        df.loc[index,'Strong_user'] = 1

    if user_rating < T1_i:
        df.loc[index,'Weak_item'] = 1
    elif T1_i <= user_rating < T2_i:
        df.loc[index,'Average_item'] = 1
    else:
        df.loc[index,'Strong_item'] = 1

KeyboardInterrupt: 

In [None]:
df

In [None]:
# Rename the userId and itemId columns to user_id and movie_id respectively(To better fit the code)
df = df.rename(columns={'userId': 'user_id', 'itemId': 'movie_id'})

# Create 2 separate dataframes:
# user - Containd userid and information on whether the user is weak/average/strong
# movie - Containd movieid and information on whether the movie is weak/average/strong
user = df[["user_id","Weak_user","Average_user","Strong_user"]]
movie = df[["movie_id","Weak_item","Average_item","Strong_item"]]

# For each user, find how many ratings there are in which the user is considered a weak user, average user, and a strong user respectively
user = user.groupby("user_id").agg({
    'Weak_user': 'sum',
    'Average_user': 'sum',
    'Strong_user': 'sum'
})

In [None]:
# Function that classifies each user into weak/average/strong user 
# Eg If for a user has 10 ratings, and 6 of which considers the user weak, 2 of whicg strong, and the remaining 2 weak.
# Then 6 > 2+2, and hence the user is considered weak
def classify_user(row):
    weak_count = row['Weak_user']
    average_count = row['Average_user']
    strong_count = row['Strong_user']

    if weak_count >= strong_count + average_count:
        return 0
    elif average_count >= strong_count + weak_count:
        return 1
    elif strong_count >= average_count + weak_count:
        return 2
    else:
        return -1


In [None]:
# Classify user as defined in function above 
user['User_Classification'] = user.apply(classify_user, axis=1)
user.reset_index(inplace=True)
user = user[['user_id',"User_Classification"]]


In [None]:
# Merge the classification for each user into the original df dataframe 
df = df[["movie_id","user_id","rating","date"]].merge(user,how="left",on="user_id")
del user


In [None]:
# For each movie, find how many ratings there are in which the movie is considered a weak movie, average movie, and a strong movie respectively
movie = movie.groupby("movie_id").agg({
    'Weak_item': 'sum',
    'Average_item': 'sum',
    'Strong_item': 'sum'
})


In [None]:
# Function that classifies each movie into weak/average/strong movie
# Eg If for a movie has 10 ratings, and 6 of which considers the movie weak, 2 of which strong, and the remaining 2 weak.
# Then 6 > 2+2, and hence the movie is considered weak
def classify_item(row):
    weak_count = row['Weak_item']
    average_count = row['Average_item']
    strong_count = row['Strong_item']

    if weak_count >= strong_count + average_count:
        return 0
    elif average_count >= strong_count + weak_count:
        return 1
    elif strong_count >= average_count + weak_count:
        return 2
    else:
        return -1


In [None]:
# Classify each movie as defined in function above 
movie['Item_Classification'] = movie.apply(classify_item, axis=1)
movie.reset_index(inplace=True)
movie = movie[['movie_id',"Item_Classification"]]


In [None]:
# Merge the classification for each movie into the original df dataframe 
df = df[["movie_id","user_id","rating","date","User_Classification"]].merge(movie,how="left",on="movie_id")
del movie


In [None]:
# Classify each rating based on whether the "type" of user matches with the "type" of movie
# Example: A critical user(Weak_user) rates a movie that is generally dislike(Weak_Movie) as very good(rating >= T1) - This type of movie can be considered as possible noise
def classify_rating(row):
    user_class = row['User_Classification']
    item_rec = row['Item_Classification']
    rating = row['rating']

    if user_class == 0 and item_rec == 0 and rating >= T1:
        return 1
    elif user_class == 1 and item_rec == 1 and (rating < T1 or rating >= T2):
        return 1
    elif user_class == 2 and item_rec == 2 and rating < T2:
        return 1
    else:
      return 0


In [None]:
# Classify each rating to check if there is noise or not as defined in function above 
df['possible_noise'] = df.apply(classify_rating, axis=1)

In [None]:
# Create a reader using surprise library and load dataframe
reader = Reader(rating_scale=(1,5))
data1 = Dataset.load_from_df(df[["user_id","movie_id","rating"]],reader)

In [None]:
# Set grid for hyperparameter tuning
param_grid = {
   'n_epochs': np.arange(10, 51, 10),
   'n_factors' : np.arange(10, 51, 10),
   'lr_all': [0.02,0.05,0.1,0.3,0.5,0.7],
   'reg_all':[0.02,0.05,0.1,0.3,0.5,0.7]
}

In [None]:
#Hyperparameter tune using 5-fold cv
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data1)

In [None]:
#Print the best parameters
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
#Use best hyperparameter and create model
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']
best_lr_all= gs.best_params['rmse']['lr_all']
best_reg_all = gs.best_params['rmse']['reg_all']
# nmf_best_param uses SVD - No time to change the names
nmf_best_param = SVD(n_factors=best_factor, n_epochs=best_epoch,lr_all=best_lr_all,reg_all=best_reg_all)

In [None]:
# Split into 2 datasets, train_ and test_
# train_ are ratings that are found to have no noise
# test_ are ratings that are found to have noise
# (Not actually using them for any training/testing)
train_ = df[df['possible_noise'] == 0]
test_ = df[df['possible_noise'] == 1]

In [None]:
# Load the non-noisy and noisy ratings
non_noisy_ratings = Dataset.load_from_df(train_[["user_id","movie_id","rating"]],reader)
noisy_ratings = Dataset.load_from_df(test_[["user_id","movie_id","rating"]],reader)

In [None]:
#Predict the "cleaned" ratings for the possibly noisy ratings
nmf_non_noisy_ratings = nmf_best_param.fit(non_noisy_ratings.build_full_trainset())
prediction_on_noisy = nmf_best_param.test(noisy_ratings.build_full_trainset().build_testset())

In [None]:
# convert the predicted("cleaned") ratings into dataframe and retain the necessary columns only: user_id, movie_id, predicted_rating
prediction_df = pd.DataFrame(prediction_on_noisy, columns=["user_id", "movie_id", "actual_rating", "predicted_rating", "details"])
prediction_df = prediction_df[["user_id", "movie_id", "predicted_rating"]]
prediction_df['predicted_rating'] = prediction_df['predicted_rating'].round(2)
prediction_df

In [None]:
#Convert movie_id and user_id in prediction_df to integers in prediction_df
prediction_df['movie_id'] = prediction_df['movie_id'].astype(int)
prediction_df['user_id'] = prediction_df['user_id'].astype(int)


In [None]:
#Convert movie_id and user_id in df to integers in prediction_df
df['movie_id'] = df['movie_id'].astype(int)
df['user_id'] = df['user_id'].astype(int)

In [None]:
# Merge df with prediction_df
merged_df = df.merge(prediction_df, on=['user_id', 'movie_id'], how='left')
merged_df


In [None]:
# For each possibly noisy rating, if the difference between the predicted("cleaned") rating and the actual rating is more than 1,
# we deemed the rating to be actually noisy, and substitute the predicted rating with the actual rating.

#If the rating is not noisy, nothing happens and we keep the rating

for index, row in merged_df.iterrows():
    # Check if 'predicted_rating' is NaN
    if pd.isna(row['predicted_rating']):
        # Fill 'predicted_rating' with 'rating'
        merged_df.at[index, 'predicted_rating'] = row['rating']
    else:
        # Calculate the absolute difference between 'rating' and 'predicted_rating'
        abs_diff = abs(row['rating'] - row['predicted_rating'])

        # Check if the absolute difference is greater than 1
        if abs_diff < 1:
            # Substitute 'predicted_rating' with 'rating'
            merged_df.at[index, 'predicted_rating'] = row['rating']


In [None]:
# Saved the ratings into csv 
merged_df.to_csv('movielens1m_natural_noise_ratings.csv',index=False)