In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, SVDpp
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import KFold

In [2]:
# Load the labelled ratings (training data) and the rows that still need a
# predicted rating. Both CSV files are expected in the working directory.
train_path, test_path = "train.csv", "test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Wrap the ratings frame in a Surprise Dataset; the reader declares that
# ratings lie on a 0.5-5.0 scale.
rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train_df[rating_columns], reader)

# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split repeatable
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [6]:
# Train SVD model.
# random_state fixes the random initialisation of the factors so that a
# Restart & Run All reproduces the same model and the same downstream RMSE
# (the seed matches the one used for the train/test split).
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x183a4296a50>

In [5]:
# Train SVD++ model.
# random_state fixes the random initialisation of the factors so that a
# Restart & Run All reproduces the same model and the same downstream RMSE
# (the seed matches the one used for the train/test split).
svdpp = SVDpp(random_state=42)
svdpp.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x22b11aefbc0>

In [None]:
# Score the held-out test split with the trained SVD++ model
predictions_svdpp = svdpp.test(testset)

In [None]:
# Score the held-out test split with the trained SVD model
# (the SVD++ predictions are computed in their own cell)
predictions_svd = svd.test(testset)


In [None]:
# Perform 5-fold cross-validation to estimate the RMSE of SVD++.
# cross_validate() calls fit() on the estimator it receives, so give it a fresh
# SVDpp instance: passing `svdpp` would overwrite the model trained on
# `trainset` with one fitted on the last CV fold, silently changing every
# prediction made with `svdpp` afterwards.
kf = KFold(n_splits=5, random_state=42, shuffle=True)
cv_results = cross_validate(SVDpp(random_state=42), data, measures=["RMSE"], cv=kf, verbose=True)

# The lowest fold score is kept for reference, but a single best fold is an
# optimistic figure; the mean across folds is the fairer estimate.
best_rmse = min(cv_results['test_rmse'])
mean_rmse = np.mean(cv_results['test_rmse'])
print(f"best rmse: {best_rmse}")
print(f"mean rmse: {mean_rmse}")

In [9]:
# Ensemble approach: blend the two models by averaging their estimates.
# Both prediction lists come from the same testset, so they are paired by position;
# user, item and true rating are taken from the SVD prediction.
ensemble_predictions = [
    (p_svd.uid, p_svd.iid, p_svd.r_ui, (p_svd.est + p_svdpp.est) / 2)
    for p_svd, p_svdpp in zip(predictions_svd, predictions_svdpp)
]

# Calculate RMSE of the blended estimates against the true ratings
squared_errors = [(actual - blended) ** 2 for _, _, actual, blended in ensemble_predictions]
rmse = np.sqrt(np.mean(squared_errors))
print(f"Ensemble RMSE: {rmse:.4f}")

Ensemble RMSE: 0.8141


In [10]:
# Predict ratings using the ensemble model.
# Iterate the two id columns directly rather than DataFrame.iterrows():
# iterrows builds a Series per row and upcasts it to a single dtype, which can
# turn integer ids into floats (producing Ids such as "1.0_2011.0") and is
# considerably slower on large frames.
predictions = []
for user, item in zip(test_df['userId'], test_df['movieId']):
    est_svd = svd.predict(user, item).est
    est_svdpp = svdpp.predict(user, item).est
    est_final = (est_svd + est_svdpp) / 2  # simple average of the two models
    predictions.append({'Id': f"{user}_{item}", 'rating': round(est_final, 1)})

# Convert to DataFrame and display results; the explicit columns keep the
# expected schema even when test_df has no rows
result_df = pd.DataFrame(predictions, columns=['Id', 'rating'])
result_df.head()

Unnamed: 0,Id,rating
0,1_2011,2.8
1,1_4144,3.9
2,1_5767,3.6
3,1_6711,4.4
4,1_7318,2.5


In [None]:

# Write the ensemble predictions to a CSV file, leaving out the DataFrame index

result_df.to_csv('submission9.csv', index=False)