# **Install and Import Modules**

In [None]:
!pip install scikit-surprise

In [None]:
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import time

# **Load and Preprocess the Data**

In [None]:
# Import dataset
from pathlib import Path

# Single place to edit when the project data lives somewhere else.
# Each CSV sits inside a folder that carries the same name as the file.
DATA_DIR = Path(r"C:\Users\Mukaidziwa M\Desktop\ALX-DS\Unsupervised Learning\Week 4\Recommender Systems\ALX Movie Recommendation Project 2024")

train = pd.read_csv(DATA_DIR / "train.csv" / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv" / "test.csv")
movies = pd.read_csv(DATA_DIR / "movies.csv" / "movies.csv")

# Merge the training data with the movies data to include genres information for each movie.
# A left join keeps every rating even when its movieId is absent from movies.csv.
df = pd.merge(train, movies[['movieId', 'genres']], on='movieId', how='left')

df

In [None]:
# Count the missing values in each column of the merged dataframe
df.isna().sum()

In [None]:
# Initialize LabelEncoders for user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

# Initialize MultiLabelBinarizer for encoding movie genres into binary format
mlb = MultiLabelBinarizer()

# Fit the encoders but keep the raw IDs in the dataframe. The model is later
# queried with the raw userId/movieId values of the test set, so training on
# re-labelled IDs would make those lookups hit the wrong (or unknown) users
# and movies. Surprise maps raw IDs to its own inner IDs internally, so no
# manual re-labelling is required for training.
user_encoder.fit(df['userId'])
movie_encoder.fit(df['movieId'])

# Split the genres column into separate genre tags. Rows whose genres are
# missing (possible after the left merge) are labelled "(no genres listed)",
# because MultiLabelBinarizer cannot iterate over NaN.
genre_lists = df.pop('genres').fillna('(no genres listed)').str.split('|')

# Encode the tags as binary indicator columns and join them back to the dataframe
genre_flags = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=df.index,
)
df = df.join(genre_flags)

In [None]:
# Display the dataframe to inspect the result of the preceding preprocessing cell
df

In [None]:
# Drop the column representing movies with no genres listed from the dataframe.
# errors="ignore" keeps the cell safe to re-run (the column is already gone the
# second time) and tolerant of data that never produced this column.
df.drop(columns="(no genres listed)", inplace=True, errors="ignore")


In [None]:
# Display the dataframe again to confirm the "(no genres listed)" column is gone
df

In [None]:
# Sample up to 10,000,000 rows from the dataframe for model training.
# The size is capped at len(df) because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling is without replacement).
sampled_df = df.sample(n=min(10_000_000, len(df)), random_state=42)

In [None]:
# Display the sampled dataframe
sampled_df

# **Build the Model with Collaborative Filtering**

In [None]:
%%time

# Tell Surprise the bounds of the rating scale used in the data (0.5 to 5)
reader = Reader(rating_scale=(0.5, 5))

# Hand Surprise only the user, item and rating columns, in that order
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(sampled_df[rating_columns], reader)

# Train on every sampled rating: no hold-out fold is carved out here
trainset = data.build_full_trainset()

In [None]:
%%time

# Initialize the SVD model with a fixed seed (the same one used for sampling)
# so the random initialisation of the factors, and therefore the fitted
# model, is reproducible between runs. Then fit it to the training set.
model_svd = SVD(random_state=42)
model_svd.fit(trainset)

# **Predictions on Test dataset**

In [None]:
%%time

import numpy as np

def predicted_rating(df, model_svd, chunk_size=100):
    """Add a ``predicted_rating`` column to ``df`` using ``model_svd``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``userId`` and ``movieId`` columns; one prediction is
        produced per row, in row order.
    model_svd : object
        Fitted model exposing ``test(testset)``, where ``testset`` is a list
        of ``(user, movie, rating)`` tuples, and returning one object with an
        ``est`` attribute per tuple.
    chunk_size : int, default 100
        Number of rows sent to ``model_svd.test`` per call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; the new column is added
        in place.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Collected estimates, in the same order as the rows of df
    preds = []

    # Stepping by chunk_size visits every row exactly once and never yields
    # an empty trailing chunk (also handles an empty df: zero iterations).
    for start_idx in range(0, len(df), chunk_size):
        chunk = df.iloc[start_idx:start_idx + chunk_size]

        # Zip the two ID columns instead of using iterrows(): it is far
        # faster and keeps the IDs' own dtype, whereas iterrows() up-casts
        # every value of a row to a common dtype (ints become floats as soon
        # as the frame holds a float column). The third element is a
        # placeholder for the unknown true rating.
        user_movie_pairs = [
            (user_id, movie_id, 0)
            for user_id, movie_id in zip(chunk['userId'], chunk['movieId'])
        ]

        # Get predictions for each user-movie pair in the chunk
        chunk_preds = model_svd.test(user_movie_pairs)

        # Keep only the estimated rating of each prediction
        preds.extend(pred.est for pred in chunk_preds)

    # Add the predicted ratings to the DataFrame (mutates the caller's frame)
    df['predicted_rating'] = np.array(preds)

    return df


In [None]:
%%time

# Predict a rating for every (userId, movieId) row of the test set.
# NOTE: predicted_rating adds the new column to the frame it receives and
# returns that same object, so `test_final` and `test` are one dataframe.
test_final = predicted_rating(test, model_svd, chunk_size=100)

In [None]:
# Display the final test dataframe returned by predicted_rating
test_final


In [None]:
%%time

# Ensure necessary imports
import pandas as pd

# Build the submission Id as "<userId>_<movieId>". The string forms are held
# in local variables so the ID columns of test_final keep their original
# dtype and can still be fed to the model if predictions are recomputed.
user_part = test_final['userId'].astype(str).str.replace('-', '', regex=False)
movie_part = test_final['movieId'].astype(str).str.replace('-', '', regex=False)

test_final['Id'] = user_part + '_' + movie_part

# Do not bind any data to the name `predicted_rating` here: it is the
# prediction function, and rebinding it would make that function unusable
# for the rest of the session.

# Create a DataFrame for submission
submission = pd.DataFrame({
    'Id': test_final['Id'],
    'rating': test_final['predicted_rating'],
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

# Display the submission to verify
submission