In [9]:
"""
Naive Bayes is a simple yet powerful probabilistic classifier based on Bayes' Theorem, with the assumption of 
independence between features. In the context of rating prediction, it treats the problem as a classification 
task. However, it's not commonly used for rating predictions due to its assumption of feature independence and 
its typical application in classification rather than regression tasks.
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to load and preprocess data
def load_and_preprocess_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Despite the name, no preprocessing is applied here: each file is parsed
    with ``pd.read_csv`` using its default options and returned as-is.

    Args:
        train_path: Path (or anything ``pd.read_csv`` accepts) of the training CSV.
        test_path: Path (or anything ``pd.read_csv`` accepts) of the test CSV.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames, in that order.
    """
    # Training file is read first, then the test file.
    train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_data, test_data

# Custom encoding for handling unseen items/users
def custom_encode(train_data, test_data, columns):
    """Mask test-set values that never occur in the training set.

    For every column in ``columns``, entries of ``test_data`` whose value is
    not present in the same column of ``train_data`` are replaced by a missing
    value, so the caller can handle unseen users/items explicitly (for example
    with ``fillna``).

    The input ``test_data`` is not modified: the masking is applied to a copy,
    which keeps the function idempotent when a notebook cell is re-run.

    Args:
        train_data: DataFrame providing the set of known values per column.
        test_data: DataFrame whose unseen values should be masked.
        columns: Iterable of column names present in both frames.

    Returns:
        ``(train_data, encoded_test)`` where ``train_data`` is the object that
        was passed in (untouched) and ``encoded_test`` is a new DataFrame.
        Masked entries are NaN, so a masked integer column becomes float.
    """
    encoded_test = test_data.copy()
    for col in columns:
        # Vectorised membership test against the values observed in training.
        seen_in_train = test_data[col].isin(train_data[col].unique())
        # Keep known values; everything else becomes missing.
        encoded_test[col] = test_data[col].where(seen_in_train)
    return train_data, encoded_test

# Columns fed to the classifier, and the column it has to predict
features = ['item_id', 'user_id']
target = 'rating'

# Read the raw training and test CSV files
train_data, test_data = load_and_preprocess_data('train_data.csv', 'test_data.csv')

# Blank out test-set item/user ids that never occur in the training data
train_data, test_data = custom_encode(train_data, test_data, features)

# Replace every missing entry (including the ids blanked out above) with the sentinel -1
train_data = train_data.fillna(-1)
test_data = test_data.fillna(-1)

# Feature matrix / target pairs: fit on the training file, score on the separate test file
X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

# Fit the Gaussian Naive Bayes classifier on the training rows
model = GaussianNB()
model.fit(X_train, y_train)

# Predicted labels for the test rows
predictions = model.predict(X_test)

# Mean squared error between the true test ratings and the predicted labels
mse = mean_squared_error(y_test, predictions)
print(f"MSE: {mse}")

MSE: 2.982056979867998


In [10]:
"""
SVD is a matrix factorization technique commonly used in collaborative filtering for recommendation systems. It 
decomposes the user-item rating matrix into lower-dimensional matrices, capturing latent factors representing 
user and item interactions. SVD is effective for capturing complex patterns but requires tuning parameters like 
the number of latent factors.
"""

from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, PredefinedKFold
import pandas as pd

# Load the raw training and test ratings
train_data, test_data = load_and_preprocess_data('train_data.csv', 'test_data.csv')

# Halve the training ratings to match the scale declared to Surprise below;
# the estimates are multiplied by 2 again before scoring against the test file.
# NOTE(review): Surprise clips estimates to the declared rating_scale, so any
# halved rating below 1 could never be predicted — confirm the raw rating range.
train_data['rating'] /= 2
reader = Reader(rating_scale=(1, 5))  # rating scaled to 1 to 5

# Load data into Surprise's format (all training rows, no internal hold-out)
trainset = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Train SVD model.
# SVD initialises its factors randomly; a fixed seed makes the reported MSE
# reproducible across "Restart Kernel -> Run All".
model = SVD(random_state=42)
model.fit(trainset)

# Prepare test set for prediction: (user, item, true rating) triples.
# The true rating slot is left as None because only the estimate is used below.
testset = list(zip(test_data['user_id'], test_data['item_id'], [None]*len(test_data)))
predictions = model.test(testset)

# Bring the estimates back to the scale of the test file's ratings
test_data['predicted_rating'] = [pred.est*2 for pred in predictions]

# Calculate MSE between actual and predicted ratings
mse = mean_squared_error(test_data['rating'], test_data['predicted_rating'])
print(f"MSE: {mse}")

MSE: 2.0224027639463267


In [11]:
"""
SVD++ is an enhancement of SVD that considers implicit feedback (the fact that a user rated an item, regardless 
of the rating value). It adds additional parameters to capture this implicit feedback, often leading to improved
accuracy over basic SVD, especially in datasets with a lot of implicit user feedback.
"""

from surprise import Dataset, Reader, SVDpp, accuracy
import pandas as pd

# Load the raw training and test ratings
train_data, test_data = load_and_preprocess_data('train_data.csv', 'test_data.csv')

# Halve the training ratings to match the scale declared to Surprise below;
# the estimates are multiplied by 2 again before scoring against the test file.
# NOTE(review): Surprise clips estimates to the declared rating_scale, so any
# halved rating below 1 could never be predicted — confirm the raw rating range.
train_data['rating'] /= 2
reader = Reader(rating_scale=(1, 5))  # rating scaled to 1 to 5

# Load data into Surprise's format (all training rows, no internal hold-out)
trainset = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Train SVD++ model.
# SVD++ initialises its factors randomly; a fixed seed makes the reported MSE
# reproducible across "Restart Kernel -> Run All".
model = SVDpp(random_state=42)
model.fit(trainset)

# Prepare test set for prediction: (user, item, true rating) triples.
# The true rating slot is left as None because only the estimate is used below.
testset = list(zip(test_data['user_id'], test_data['item_id'], [None]*len(test_data)))
predictions = model.test(testset)

# Bring the estimates back to the scale of the test file's ratings
test_data['predicted_rating'] = [pred.est*2 for pred in predictions]

# Calculate MSE between actual and predicted ratings
mse = mean_squared_error(test_data['rating'], test_data['predicted_rating'])
print(f"MSE: {mse}")

MSE: 2.0141989075159734


In [12]:
"""
K-Nearest Neighbors with cosine similarity is a collaborative filtering technique that calculates similarity 
between items or users based on their rating patterns. The cosine similarity measure is used to find 'neighbors',
either items or users that are similar. KNN with Means, a variant, adjusts for the mean rating of each user or 
item, potentially improving prediction accuracy.
"""

from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split
import pandas as pd

# Read the training file used by this model together with the shared test file
train_data, test_data = load_and_preprocess_data('train_data_2.csv', 'test_data.csv')

# Ratings are declared to Surprise as lying in [1, 5]
# (assumes train_data_2.csv already stores ratings on that scale — TODO confirm)
reader = Reader(rating_scale=(1, 5))

# Convert the training frame into a Surprise trainset built from every row
rating_columns = ['user_id', 'item_id', 'rating']
trainset = Dataset.load_from_df(train_data[rating_columns], reader).build_full_trainset()

# Item-item cosine similarity (user_based=False compares items, not users)
sim_options = dict(name='cosine', user_based=False)
model = KNNWithMeans(sim_options=sim_options)
model.fit(trainset)

# (user, item, true rating) triples for every test row
testset = list(zip(*(test_data[column] for column in rating_columns)))

# Estimate a rating for each test triple
predictions = model.test(testset)

# Double each estimate before comparing it with the test file's ratings
test_data['predicted_rating'] = [2 * prediction.est for prediction in predictions]

# Mean squared error between the test ratings and the doubled estimates
mse = mean_squared_error(test_data['rating'], test_data['predicted_rating'])
print(f"MSE: {mse}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
MSE: 2.31007537764338


In [13]:
"""
NMF is another matrix factorization technique that factorizes the user-item matrix into two non-negative matrices.
It's particularly effective in scenarios requiring non-negative data representations, such as rating predictions. 
NMF can uncover latent features in user-item interactions, but like SVD, it needs careful tuning of hyperparameters 
such as the number of factors.
"""

from surprise import Dataset, Reader, NMF, accuracy
import pandas as pd

# Read the training file used by this model together with the shared test file
train_data, test_data = load_and_preprocess_data('train_data_2.csv', 'test_data.csv')
# (assumes train_data_2.csv already stores ratings on the 1-5 scale — TODO confirm)
reader = Reader(rating_scale=(1, 5))  # rating scaled to 1 to 5

# Load data into Surprise's format (all training rows, no internal hold-out)
trainset = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Train NMF model.
# NMF initialises its factors randomly; a fixed seed makes the reported MSE
# reproducible across "Restart Kernel -> Run All".
model = NMF(random_state=42)
model.fit(trainset)

# Prepare the test set for prediction: (user, item, true rating) triples
testset = list(zip(test_data['user_id'], test_data['item_id'], test_data['rating']))

# Predict ratings
predictions = model.test(testset)

# Double each estimate before comparing it with the test file's ratings
test_data['predicted_rating'] = [pred.est*2 for pred in predictions]

# Calculate MSE between actual and predicted ratings
mse = mean_squared_error(test_data['rating'], test_data['predicted_rating'])
print(f"MSE: {mse}")

MSE: 3.3296373055775326
