## Collaborative filtering recommendation system

In [11]:
import pandas as pd

# Read the test CSV into a DataFrame
test_data_path = '/content/test.csv'
test_data_df = pd.read_csv(test_data_path)

# Preview the leading rows of the test frame
test_preview = test_data_df.head()
print(test_preview)

# Count the missing entries in each column
test_null_counts = test_data_df.isnull().sum()
print(test_null_counts)

# Report how many rows the test frame holds
num_rows_test_data = len(test_data_df)
print(f"Number of rows in test data: {num_rows_test_data}")


   userId  movieId
0       1     2011
1       1     4144
2       1     5767
3       1     6711
4       1     7318
userId     0
movieId    0
dtype: int64
Number of rows in test data: 5000019


In [12]:
# Read the training CSV into a DataFrame
train_data_path = '/content/train.csv'
train_data_df = pd.read_csv(train_data_path)

# Preview the leading rows of the training frame
train_preview = train_data_df.head()
print(train_preview)

# Count the missing entries in each column
train_null_counts = train_data_df.isnull().sum()
print(train_null_counts)

# Report how many rows the training frame holds
num_rows_train_data = len(train_data_df)
print(f"Number of rows in training data: {num_rows_train_data}")

   userId  movieId  rating   timestamp
0    5163    57669     4.0  1518349992
1  106343        5     4.5  1206238739
2  146790     5459     5.0  1076215539
3  106362    32296     2.0  1423042565
4    9041      366     3.0   833375837
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Number of rows in training data: 10000038


In [14]:
# Remove, in place, every training row that lacks a movieId, rating, or timestamp
required_columns = ['movieId', 'rating', 'timestamp']
train_data_df.dropna(subset=required_columns, inplace=True)

# Confirm that no nulls remain and report how many rows survived the drop
remaining_nulls = train_data_df.isnull().sum()
print(remaining_nulls)
rows_after_drop = len(train_data_df)
print(f"Number of rows in training data after dropping missing values: {rows_after_drop}")


userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Number of rows in training data after dropping missing values: 10000038


In [16]:

import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split

# Pipeline for this cell:
#   1. load the train/test CSVs,
#   2. estimate SVD accuracy with 5-fold cross-validation on a sample,
#   3. fit the final model on that whole sample,
#   4. score every (userId, movieId) pair in the test file,
#   5. write the submission CSV and validate its shape.

SEED = 42              # shared seed for sampling and SVD initialisation
SAMPLE_FRACTION = 0.4  # share of the training rows used for CV and fitting; adjust as needed

# Load the train and test CSV files
train_data_path = '/content/train.csv'
test_data_path = '/content/test.csv'

train_data_df = pd.read_csv(train_data_path)
test_data_df = pd.read_csv(test_data_path)

# Drop rows with missing values in movieId, rating, or timestamp columns
# (rebinding the name rather than mutating the frame in place).
train_data_df = train_data_df.dropna(subset=['movieId', 'rating', 'timestamp'])

# Derive the rating scale from the data instead of hard-coding (1, 5).
# Surprise clips every estimate to this range, and the training data contains
# half-star ratings, so a hard-coded lower bound of 1 would make any rating
# below 1 impossible to predict.
rating_scale = (
    float(train_data_df['rating'].min()),
    float(train_data_df['rating'].max()),
)
reader = Reader(rating_scale=rating_scale)

# Sample a subset of the training data and convert it into a Surprise Dataset.
# Only the subset is converted: a Dataset over the full frame was never used
# and only cost memory.
train_data_subset = train_data_df.sample(frac=SAMPLE_FRACTION, random_state=SEED)
data_subset = Dataset.load_from_df(train_data_subset[['userId', 'movieId', 'rating']], reader)

# Build the model (SVD); seeding it makes the factor initialisation repeatable.
algo = SVD(random_state=SEED)

# Perform cross-validation on the subset to evaluate RMSE
cross_validate_results = cross_validate(algo, data_subset, measures=['RMSE'], cv=5, verbose=True)

# Print the RMSE scores from cross-validation
print(f"RMSE scores: {cross_validate_results['test_rmse']}")

# Cross-validation has already produced the accuracy estimate, so the final
# model is fitted on every sampled rating rather than on an 80% split whose
# held-out part was never evaluated.
full_trainset = data_subset.build_full_trainset()
algo.fit(full_trainset)

# Generate predictions for the test data. algo.predict scores one pair per
# call, so slicing the ids into batches added bookkeeping without any benefit.
user_ids = test_data_df['userId'].values
movie_ids = test_data_df['movieId'].values
predictions = [
    algo.predict(user_id, movie_id).est
    for user_id, movie_id in zip(user_ids, movie_ids)
]

# Create the submission DataFrame; Id is "<userId>_<movieId>"
submission_df = pd.DataFrame({
    'Id': test_data_df['userId'].astype(str) + '_' + test_data_df['movieId'].astype(str),
    'rating': predictions
})

# Save the submission file
submission_path = '/content/submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved to {submission_path}")

# Verify the number of rows in the submission file
num_rows_submission = len(submission_df)
print(f"Number of rows in submission file: {num_rows_submission}")

# Check the header
submission_header = list(submission_df.columns)
expected_header = ['Id', 'rating']
print(f"Submission header: {submission_header}")
print(f"Expected header: {expected_header}")

# Verify both the header and the number of rows. Previously the header was
# printed but not enforced, so a wrong header could still be reported as valid.
expected_rows = 5000019  # required submission length (matches the test file's row count)
header_ok = submission_header == expected_header
rows_ok = num_rows_submission == expected_rows
if header_ok and rows_ok:
    print("The submission file meets the requirements.")
else:
    print("The submission file does not meet the requirements.")


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8667  0.8667  0.8668  0.8673  0.8666  0.8668  0.0003  
Fit time          167.13  170.22  170.77  180.35  174.04  172.50  4.50    
Test time         31.40   30.86   32.49   30.00   26.03   30.16   2.22    
RMSE scores: [0.86666628 0.86668463 0.86678662 0.86734576 0.86656945]
Submission file saved to /content/submission.csv
Number of rows in submission file: 5000019
Submission header: ['Id', 'rating']
Expected header: ['Id', 'rating']
The submission file meets the requirements.
