In [None]:
import pandas as pd
import numpy as np

# Load your dataset (modify the file path as necessary)
df = pd.read_csv('netflix_titles1.csv')

# Filtering for movies only
movies_df = df[df['type'] == 'Movie'][['title']].head(500)  # Limit to 1000 movies

# Create a list of user IDs (e.g., User_1, User_2, ..., User_200)
user_ids = [f'User_{i+1}' for i in range(100)]

# Initialize an empty DataFrame for the user-item matrix
user_item_matrix = pd.DataFrame(columns=['title'] + user_ids)

# Copy movie titles to the matrix
user_item_matrix['title'] = movies_df['title'].values

# Generate random ratings between 1 and 5, with some NaNs to simulate missing ratings
np.random.seed(42)  # For reproducibility
for user in user_ids:
    user_item_matrix[user] = np.random.choice([1, 2, 3, 4, 5, np.nan], size=len(user_item_matrix), p=[0.15, 0.2, 0.25, 0.2, 0.15, 0.05])

# Save the user-item matrix to a new CSV file
user_item_matrix.to_csv('user_item_matrix.csv', index=False)


In [None]:
import pandas as pd
import numpy as np


In [None]:
# Check the index (user IDs) in the DataFrame
print("Available User IDs:")
print(ratings_df.index.tolist())


Available User IDs:
['Dick Johnson Is Dead', 'My Little Pony: A New Generation', 'Sankofa', 'The Starling', 'Je Suis Karl', 'Confessions of an Invisible Girl', "Europe's Most Dangerous Man: Otto Skorzeny in Spain", 'Intrusion', 'Avvai Shanmughi', 'Go! Go! Cory Carson: Chrissy Takes the Wheel', 'Jeans', 'Minsara Kanavu', 'Grown Ups', 'Dark Skies', 'Paranoia', 'Ankahi Kahaniya', 'The Father Who Moves Mountains', 'The Stronghold', 'Birth of the Dragon', 'Jaws', 'Jaws 2', 'Jaws 3', 'Jaws: The Revenge', 'My Heroes Were Cowboys', 'Safe House', 'Training Day', 'InuYasha the Movie 2: The Castle Beyond the Looking Glass', 'InuYasha the Movie 3: Swords of an Honorable Ruler', 'InuYasha the Movie 4: Fire on the Mystic Island', 'InuYasha the Movie: Affections Touching Across Time', 'Naruto Shippuden the Movie: Blood Prison', 'Naruto Shippûden the Movie: Bonds', 'Naruto Shippûden the Movie: The Will of Fire', 'Naruto Shippuden: The Movie', 'Naruto Shippuden: The Movie: The Lost Tower', 'Naruto the 

In [None]:
import pandas as pd
import numpy as np

class CoreRecommender:
    def __init__(self, csv_path: str, n_neighbors=2):
        """
        Recommender focusing on similarity and prediction based on user-item ratings.

        Args:
            csv_path (str): Path to the CSV file containing user-item ratings with movie titles as the first column.
            n_neighbors (int): Number of similar users to consider for predictions.
        """
        self.n_neighbors = n_neighbors
        # Load ratings data from CSV
        self.ratings = pd.read_csv(csv_path)
        self.ratings.set_index(self.ratings.columns[0], inplace=True)  # Set the first column (movie titles) as the index

    def calculate_similarity(self, user1: str, user2: str) -> float:
        """
        Calculate similarity between two users using Pearson correlation.

        Args:
            user1: First user ID
            user2: Second user ID

        Returns:
            Similarity score between -1 and 1
        """
        user1_ratings = self.ratings[user1]
        user2_ratings = self.ratings[user2]

        # Find common rated movies
        common_items = user1_ratings.dropna().index.intersection(user2_ratings.dropna().index)

        if len(common_items) < 2:
            return 0.0

        # Calculate means for common items
        user1_mean = user1_ratings[common_items].mean()
        user2_mean = user2_ratings[common_items].mean()

        # Calculate numerator and denominators for Pearson correlation
        numerator = sum((user1_ratings[common_items] - user1_mean) *
                        (user2_ratings[common_items] - user2_mean))
        denom1 = np.sqrt(sum((user1_ratings[common_items] - user1_mean) ** 2))
        denom2 = np.sqrt(sum((user2_ratings[common_items] - user2_mean) ** 2))

        if denom1 * denom2 == 0:
            return 0.0

        return numerator / (denom1 * denom2)

    def get_similar_users(self, target_user: str) -> dict:
        """
        Find similar users for a target user.

        Args:
            target_user: User to find similarities for

        Returns:
            Dictionary of {user: similarity_score}
        """
        similarities = {}

        for user in self.ratings.columns:
            if user != target_user:
                similarity = self.calculate_similarity(target_user, user)
                similarities[user] = round(similarity, 3)

        return dict(sorted(similarities.items(), key=lambda x: x[1], reverse=True))

    def predict_rating(self, target_user: str, movie_title: str) -> float:
        """
        Predict rating for a movie using similar users.

        Args:
            target_user: User to predict for
            movie_title: Movie title to predict rating for

        Returns:
            Predicted rating
        """
        similarities = self.get_similar_users(target_user)

        # Take top N neighbors
        neighbors = dict(list(similarities.items())[:self.n_neighbors])

        weighted_sum = 0
        similarity_sum = 0

        # Calculate weighted average
        for user, similarity in neighbors.items():
            rating = self.ratings.at[movie_title, user]

            if not pd.isna(rating):
                weighted_sum += rating * similarity
                similarity_sum += abs(similarity)

        if similarity_sum == 0:
            return None

        return round(weighted_sum / similarity_sum, 2)

# Sample usage
if __name__ == "__main__":
    # Load your dataset by specifying the CSV file path
    csv_path = "/content/user_item_matrix.csv"  # Replace with the actual file path
    recommender = CoreRecommender(csv_path, n_neighbors=2)

    # Choose a target user for examples
    target_user = "User_1"

    # Get similar users
    print("\nSimilar Users:")
    print("-" * 30)
    similar_users = recommender.get_similar_users(target_user)
    for user, similarity in similar_users.items():
        print(f"{user}: {similarity}")

    # Predict ratings for some movies
    print("\nPredicted Ratings:")
    print("-" * 30)
    for movie in ["My Little Pony: A New Generation", "The Starling"]:
        prediction = recommender.predict_rating(target_user, movie)
        if prediction:
            print(f"{movie}: {prediction}")
        else:
            print(f"{movie}: Unable to predict")



Similar Users:
------------------------------
User_99: 0.124
User_67: 0.113
User_31: 0.099
User_97: 0.099
User_60: 0.085
User_63: 0.08
User_16: 0.074
User_33: 0.07
User_30: 0.065
User_20: 0.063
User_46: 0.055
User_85: 0.054
User_28: 0.049
User_23: 0.047
User_56: 0.046
User_40: 0.044
User_42: 0.039
User_45: 0.038
User_59: 0.037
User_66: 0.036
User_74: 0.035
User_9: 0.033
User_2: 0.032
User_83: 0.032
User_8: 0.03
User_87: 0.03
User_19: 0.029
User_71: 0.029
User_93: 0.028
User_52: 0.021
User_77: 0.021
User_88: 0.018
User_3: 0.017
User_48: 0.016
User_12: 0.015
User_38: 0.015
User_27: 0.014
User_47: 0.012
User_69: 0.012
User_49: 0.011
User_89: 0.011
User_95: 0.011
User_62: 0.009
User_80: 0.009
User_36: 0.006
User_76: 0.004
User_100: 0.004
User_94: -0.001
User_72: -0.002
User_29: -0.006
User_57: -0.007
User_50: -0.01
User_70: -0.01
User_91: -0.011
User_51: -0.012
User_82: -0.012
User_86: -0.013
User_37: -0.014
User_13: -0.017
User_32: -0.017
User_65: -0.017
User_41: -0.018
User_64: -0.02
Us

In [2]:
import pandas as pd

# Load the dataset (update the path if necessary)
file_path = 'netflix_titles.csv'  # Replace with your actual file path
netflix_data = pd.read_csv(file_path)

# Inspect the first few rows of the dataset
print("First few rows of the dataset:")
print(netflix_data.head())

# Check for any discrepancies in the column names
print("Column names in the dataset:")
print(netflix_data.columns)

# 1. Fill missing values in `director`, `cast`, and `country` with "Unknown"
netflix_data['director'].fillna('Unknown', inplace=True)
netflix_data['cast'].fillna('Unknown', inplace=True)
netflix_data['country'].fillna('Unknown', inplace=True)

# 2. Convert `date_added` to a standardized date format
netflix_data['date_added'] = pd.to_datetime(netflix_data['date_added'], errors='coerce')

# 3. Define the rating mapping and apply it directly to each row in `rating`
rating_map = {
    'TV-MA': 5, 'R': 4, 'PG-13': 3, 'TV-14': 3, 'PG': 2, 'TV-PG': 2,
    'TV-Y7': 1, 'TV-Y7-FV': 1, 'TV-Y': 1, 'G': 1, 'NR': 0, 'UR': 0, 'TV-G': 1
}

# Ensure the `rating` column is correctly read and processed
netflix_data['rating'] = netflix_data['rating'].astype(str).str.strip()

# Print unique values in the `rating` column before mapping
unique_ratings = netflix_data['rating'].unique()
print("Unique values in 'rating' column before mapping:")
print(unique_ratings)

# Apply the mapping function, setting unmapped ratings to a default of 0
netflix_data['rating'] = netflix_data['rating'].map(rating_map).fillna(0).astype(int)

# Print a sample of the `rating` column after mapping
print("Sample of 'rating' column after mapping:")
print(netflix_data['rating'].head(10))

# 4. Standardize the `duration` column to represent integer values (minutes for movies, seasons for shows)
def clean_duration(row):
    if pd.isna(row['duration']):  # Handle missing values
        return 0
    if 'Season' in row['duration']:
        return int(row['duration'].split(' ')[0])  # Extract number of seasons for TV shows
    elif 'min' in row['duration']:
        return int(row['duration'].split(' ')[0])  # Extract duration in minutes for movies
    return 0  # Default case if format doesn't match

netflix_data['duration'] = netflix_data.apply(clean_duration, axis=1)

# Save the cleaned dataset to a new CSV file
output_file_path = 'cleaned_netflix_data.csv'  # Replace with your desired file path
netflix_data.to_csv(output_file_path, index=False)

print(f"Cleaned data saved to {output_file_path}")


First few rows of the dataset:
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  netflix_data['director'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  netflix_data['cast'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

Cleaned data saved to cleaned_netflix_data.csv


In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the user-item matrix (update the file path if necessary)
file_path = 'user_item_matrix.csv'  # Replace with your file path
user_item_matrix = pd.read_csv(file_path)

# Extract only the columns with user ratings (excluding the first column with movie titles)
user_ratings = user_item_matrix.iloc[:, 1:]  # Skip the first column which is the movie titles

# Transpose the matrix so that each row represents a user and each column represents a movie
# This is required for user-based collaborative filtering
user_ratings = user_ratings.T

# Calculate cosine similarity between users
user_cosine_similarity = cosine_similarity(user_ratings.fillna(0))  # Fill NaN values with 0 for similarity calculation

# Create a DataFrame for easy interpretation of results
user_cosine_similarity_df = pd.DataFrame(user_cosine_similarity, index=user_ratings.index, columns=user_ratings.index)

print("User-based Cosine Similarity Matrix:\n", user_cosine_similarity_df)

user_cosine_similarity_df.to_csv('user_cosine_similarity_matrix.csv')
print("Cosine similarity matrix saved as user_cosine_similarity_matrix.csv")



User-based Cosine Similarity Matrix:
             User_1    User_2    User_3    User_4    User_5    User_6  \
User_1    1.000000  0.805234  0.781814  0.776186  0.779039  0.767955   
User_2    0.805234  1.000000  0.797956  0.801752  0.791229  0.802401   
User_3    0.781814  0.797956  1.000000  0.786419  0.792039  0.801190   
User_4    0.776186  0.801752  0.786419  1.000000  0.796871  0.775981   
User_5    0.779039  0.791229  0.792039  0.796871  1.000000  0.794711   
...            ...       ...       ...       ...       ...       ...   
User_96   0.760835  0.781477  0.781346  0.796656  0.785779  0.784460   
User_97   0.791298  0.796440  0.800416  0.792939  0.782037  0.775025   
User_98   0.780928  0.814700  0.801847  0.809014  0.795191  0.797330   
User_99   0.832494  0.825103  0.807923  0.799378  0.795134  0.808588   
User_100  0.804317  0.819978  0.797646  0.805237  0.803414  0.802681   

            User_7    User_8    User_9   User_10  ...   User_91   User_92  \
User_1    0.798084  

In [4]:
import pandas as pd

# Load the user-item matrix from the CSV file
user_item_matrix = pd.read_csv('user_item_matrix.csv')

# Exclude the first column (movie names) and calculate the Pearson correlation
ratings = user_item_matrix.iloc[:, 1:]  # Assuming the first column is movie names
correlation_matrix = ratings.corr(method='pearson')

# Save the correlation matrix to a new CSV file (optional)
correlation_matrix.to_csv('user_correlation_matrix.csv')

# Display the correlation matrix
print(correlation_matrix)


            User_1    User_2    User_3    User_4    User_5    User_6  \
User_1    1.000000  0.032039  0.016568 -0.040390 -0.040063 -0.037961   
User_2    0.032039  1.000000 -0.014953  0.004427 -0.058437  0.039900   
User_3    0.016568 -0.014953  1.000000 -0.014012  0.057337  0.094094   
User_4   -0.040390  0.004427 -0.014012  1.000000 -0.012065 -0.071080   
User_5   -0.040063 -0.058437  0.057337 -0.012065  1.000000 -0.053962   
...            ...       ...       ...       ...       ...       ...   
User_96  -0.106534 -0.063835 -0.012027  0.035565 -0.052734  0.070577   
User_97   0.099374 -0.043164  0.049011  0.029625 -0.038632 -0.098980   
User_98  -0.062727  0.033071 -0.006070  0.082784 -0.042045 -0.024743   
User_99   0.123776  0.046728 -0.029887 -0.017415 -0.081196  0.013338   
User_100  0.003535  0.028041 -0.006117  0.003491 -0.034266 -0.005437   

            User_7    User_8    User_9   User_10  ...   User_91   User_92  \
User_1   -0.032731  0.029740  0.032714 -0.038757  ... -0.0

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the user-item matrix (update the file path if necessary)
file_path = 'user_item_matrix.csv'  # Replace with your file path
user_item_matrix = pd.read_csv(file_path)

# Extract only the item ratings by excluding the first row (user names) and the first column (item names)
item_ratings = user_item_matrix.iloc[1:, 1:]  # Skip the first row and first column

# Fill NaN values with 0 for similarity calculation
item_ratings = item_ratings.fillna(0).astype(float)

# Calculate cosine similarity between items (rows)
item_cosine_similarity = cosine_similarity(item_ratings)

# Create a DataFrame for easy interpretation of results
item_cosine_similarity_df = pd.DataFrame(
    item_cosine_similarity,
    index=user_item_matrix.iloc[1:, 0],  # Use item names from the first column (excluding header)
    columns=user_item_matrix.iloc[1:, 0]  # Use item names as column labels as well
)

# Display the item-based cosine similarity matrix
print("Item-based Cosine Similarity Matrix:\n", item_cosine_similarity_df)

# Save the item-based cosine similarity matrix to a CSV file
item_cosine_similarity_df.to_csv('item_cosine_similarity_matrix.csv')
print("Cosine similarity matrix saved as item_cosine_similarity_matrix.csv")


Item-based Cosine Similarity Matrix:
 title                             My Little Pony: A New Generation   Sankofa  \
title                                                                          
My Little Pony: A New Generation                          1.000000  0.746572   
Sankofa                                                   0.746572  1.000000   
The Starling                                              0.805520  0.789050   
Je Suis Karl                                              0.762941  0.818096   
Confessions of an Invisible Girl                          0.793088  0.789849   
...                                                            ...       ...   
Company of Heroes                                         0.805898  0.809956   
Cradle 2 the Grave                                        0.761308  0.842853   
Domestic Disturbance                                      0.792421  0.818397   
Dream/Killer                                              0.821846  0.756962   
Fe

In [6]:
import pandas as pd

# Load the user-item matrix (update the file path if necessary)
file_path = 'user_item_matrix.csv'  # Replace with your file path
user_item_matrix = pd.read_csv(file_path)

# Step 1: Prepare the ratings matrix by excluding the first row (user names)
# Assuming the first column contains user names and the first row is the header
item_ratings = user_item_matrix.iloc[1:, 1:]  # Skip the first row and first column

# Step 2: Convert the DataFrame to numeric and handle errors
item_ratings = item_ratings.apply(pd.to_numeric, errors='coerce')

# Step 3: Fill NaN values (e.g., with 0 or the mean of each column)
item_ratings = item_ratings.fillna(0)  # You can also use item_ratings.fillna(item_ratings.mean())

# Step 4: Calculate Pearson correlation between items (rows)
item_pearson_correlation = item_ratings.corr(method='pearson')

# Create a DataFrame for easy interpretation of results
item_pearson_correlation_df = pd.DataFrame(
    item_pearson_correlation,
    index=user_item_matrix.iloc[1:, 0],  # Use item names from the first column (excluding header)
    columns=user_item_matrix.iloc[1:, 0]  # Use item names as column labels as well
)

# Display the item-based Pearson correlation matrix
print("Item-based Pearson Correlation Matrix:\n", item_pearson_correlation_df)

# Save the item-based Pearson correlation matrix to a CSV file
item_pearson_correlation_df.to_csv('item_pearson_correlation_matrix.csv')
print("Pearson correlation matrix saved as item_pearson_correlation_matrix.csv")

Item-based Pearson Correlation Matrix:
 title                             My Little Pony: A New Generation  Sankofa  \
title                                                                         
My Little Pony: A New Generation                               NaN      NaN   
Sankofa                                                        NaN      NaN   
The Starling                                                   NaN      NaN   
Je Suis Karl                                                   NaN      NaN   
Confessions of an Invisible Girl                               NaN      NaN   
...                                                            ...      ...   
Company of Heroes                                              NaN      NaN   
Cradle 2 the Grave                                             NaN      NaN   
Domestic Disturbance                                           NaN      NaN   
Dream/Killer                                                   NaN      NaN   
Felon       