In [16]:

import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the ratings data
ratings = pd.read_csv(r"C:\Users\nsrip\Downloads\ratings.csv")

# Sample a fraction (e.g., 10%) of the ratings DataFrame
ratings_sample = ratings.sample(frac=0.1, random_state=42)

# Load the movies data with explicit data types for problematic columns
movies = pd.read_csv(
    r"C:\Users\nsrip\Downloads\archive (2)\movies_metadata.csv",
    dtype={'id': 'str', 'popularity': 'str'},  # Specify data types for problematic columns
    low_memory=False
)

# Drop the 'poster_path' column for now
movies = movies.drop('poster_path', axis=1)

# Convert necessary columns to appropriate data types
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce')
movies['popularity'] = pd.to_numeric(movies['popularity'], errors='coerce')
movies['runtime'] = pd.to_numeric(movies['runtime'], errors='coerce')
movies['vote_average'] = pd.to_numeric(movies['vote_average'], errors='coerce')
movies['vote_count'] = pd.to_numeric(movies['vote_count'], errors='coerce')

# Create a Surprise Reader object
reader = Reader(rating_scale=(1, 5))

# Load data into Surprise Dataset using the sampled ratings
data = Dataset.load_from_df(ratings_sample[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# Use SVD algorithm (Singular Value Decomposition)
model = SVD(n_factors=100, n_epochs=20, verbose=True)

try:
    # Train the model
    model.fit(trainset)

    # Make predictions on the test set
    predictions = model.test(testset)

    # Evaluate the model
    rmse = accuracy.rmse(predictions)
    print(f"RMSE: {rmse}")

    # Function to get movie recommendations for a given user
    def get_recommendations(user_id, n=5):
        # Get a list of all movie ids
        all_movie_ids = ratings_sample['movieId'].unique()

        if len(all_movie_ids) == 0:
            raise ValueError("No movie IDs found in the ratings data.")

        # Remove movies the user has already seen
        movies_seen_by_user = ratings_sample[ratings_sample['userId'] == user_id]['movieId'].tolist()
        movies_to_predict = list(set(all_movie_ids) - set(movies_seen_by_user))

        if len(movies_to_predict) == 0:
            raise ValueError(f"No movies to predict for user {user_id}.")

        # Make predictions for the unseen movies
        predictions = [model.predict(user_id, movie_id) for movie_id in movies_to_predict]

        # Filter out predictions with missing movie titles
        predictions = [pred for pred in predictions if str(pred.iid) in movies['id'].values]

        if len(predictions) == 0:
            raise ValueError(f"No valid predictions found for user {user_id}.")

        # Sort predictions by estimated rating in descending order
        predictions.sort(key=lambda x: x.est, reverse=True)

        # Get the top N recommendations
        top_n = predictions[:n]

        # Get movie titles for the recommendations
        movie_titles = [movies[movies['id'] == str(pred.iid)]['title'].values[0] for pred in top_n]

        return movie_titles

    # Example: top 5 recommendations for user with userId = 2 . you can change user id to get next set of recommendations.
    user_id = 2
    recommendations = get_recommendations(user_id, n=5)
    print(f"Top 5 recommendations for user {user_id}: {recommendations}")

except Exception as e:
    print(f"An error occurred: {e}")


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 0.9144
RMSE: 0.914415357622642
Top 5 recommendations for user 2: ['Dread', 'The Tunnel', 'The Million Dollar Hotel', "On Her Majesty's Secret Service", 'TMNT']


In [20]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import zipfile
import os

# Specify the path to the zip file
zip_file_path = r"C:\Users\nsrip\OneDrive\Documents\movie_recomm_zip"

# Specify the directory where you want to extract the contents
extracted_folder_path = r"C:\Users\nsrip\OneDrive\Documents\extracted_folder" #create a new file and paste its path here

# Create the output directory if it does not exist
os.makedirs(extracted_folder_path, exist_ok=True)

# Create a ZipFile object
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the contents to the specified directory
    zip_ref.extractall(extracted_folder_path)

# Load the movies data from the extracted CSV file
csv_file_path = os.path.join(extracted_folder_path, r"C:\Users\nsrip\Downloads\movies_metadata.csv\movies_metadata.csv")
movies = pd.read_csv(
    csv_file_path,
    dtype={'id': 'str', 'popularity': 'str'},  # Specify data types for problematic columns
    low_memory=False
)

# Drop the 'poster_path' column for now
movies = movies.drop('poster_path', axis=1)

# Convert necessary columns to appropriate data types
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce')
movies['popularity'] = pd.to_numeric(movies['popularity'], errors='coerce')
movies['runtime'] = pd.to_numeric(movies['runtime'], errors='coerce')
movies['vote_average'] = pd.to_numeric(movies['vote_average'], errors='coerce')
movies['vote_count'] = pd.to_numeric(movies['vote_count'], errors='coerce')

# Load the ratings data
ratings = pd.read_csv(r"C:\Users\nsrip\Downloads\ratings.csv")

# Sample a fraction (e.g., 10%) of the ratings DataFrame
ratings_sample = ratings.sample(frac=0.1, random_state=42)

# Create a Surprise Reader object
reader = Reader(rating_scale=(1, 5))

# Load data into Surprise Dataset using the sampled ratings
data = Dataset.load_from_df(ratings_sample[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# Use SVD algorithm (Singular Value Decomposition)
model = SVD(n_factors=100, n_epochs=20, verbose=True)

try:
    # Train the model
    model.fit(trainset)

    # Make predictions on the test set
    predictions = model.test(testset)

    # Evaluate the model
    rmse = accuracy.rmse(predictions)
    print(f"RMSE: {rmse}")

    # Function to get movie recommendations for a given user
    def get_recommendations(user_id, n=5):
        # Get a list of all movie ids
        all_movie_ids = ratings_sample['movieId'].unique()

        if len(all_movie_ids) == 0:
            raise ValueError("No movie IDs found in the ratings data.")

        # Remove movies the user has already seen
        movies_seen_by_user = ratings_sample[ratings_sample['userId'] == user_id]['movieId'].tolist()
        movies_to_predict = list(set(all_movie_ids) - set(movies_seen_by_user))

        if len(movies_to_predict) == 0:
            raise ValueError(f"No movies to predict for user {user_id}.")

        # Make predictions for the unseen movies
        predictions = [model.predict(user_id, movie_id) for movie_id in movies_to_predict]

        # Filter out predictions with missing movie titles
        predictions = [pred for pred in predictions if str(pred.iid) in movies['id'].values]

        if len(predictions) == 0:
            raise ValueError(f"No valid predictions found for user {user_id}.")

        # Sort predictions by estimated rating in descending order
        predictions.sort(key=lambda x: x.est, reverse=True)

        # Get the top N recommendations
        top_n = predictions[:n]

        # Get movie titles for the recommendations
        movie_titles = [movies[movies['id'] == str(pred.iid)]['title'].values[0] for pred in top_n]

        return movie_titles

    # Example: top 5 recommendations for user with userId = 2. You can change the user id to get the next set of recommendations.
    user_id = 2
    recommendations = get_recommendations(user_id, n=5)
    print(f"Top 5 recommendations for user {user_id}: {recommendations}")

except Exception as e:
    print(f"An error occurred: {e}")


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 0.9146
RMSE: 0.9146143741959447
Top 5 recommendations for user 2: ['Dread', "We're No Angels", "On Her Majesty's Secret Service", 'Red River', 'The Kite Runner']
