In [61]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
import os
import joblib

In [62]:
# for reproducibility
import os
import random

def seed_everything(seed=42):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
seed_everything()

# Loading DataSets

In [64]:
directory = r"E:\Courses\recommendation system\project\project_descrption\ml-latest-small"

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Load each CSV file into a dictionary of DataFrames
dataframes = {file: pd.read_csv(os.path.join(directory, file)) for file in csv_files}

# Print loaded files
print(f"Loaded CSV files: {list(dataframes.keys())}")

# Access individual DataFrames
links_df = dataframes.get("links.csv")
movies_df = dataframes.get("movies.csv")
ratings_df = dataframes.get("ratings.csv")
tags_df = dataframes.get("tags.csv")

# Example: Display the first few rows of each CSV
for name, df in dataframes.items():
    print(f"\n{name}:\n", df.head())

Loaded CSV files: ['links.csv', 'movies.csv', 'ratings.csv', 'tags.csv']

links.csv:
    movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0

movies.csv:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

ratings.csv:
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       

# Data Preprocessing

### Get copy of dataframes Movies and ratings

In [67]:
movies = movies_df
ratings = ratings_df
tags = tags_df
links = links_df

### Split genres and one-hot encode

In [69]:
genres_list = movies['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(mlb.fit_transform(genres_list), columns=mlb.classes_)
movies = movies.drop(columns=['genres']).join(genres_encoded)

### Process Tags (TF-IDF)

In [71]:
tags = tags.dropna()
tags['tag'] = tags['tag'].astype(str)
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
tag_features = vectorizer.fit_transform(movie_tags['tag'])
tag_features_df = pd.DataFrame(tag_features.toarray(), columns=vectorizer.get_feature_names_out())
tag_features_df['movieId'] = movie_tags['movieId']
movies = pd.merge(movies, tag_features_df, on='movieId', how='left').fillna(0)


### Encode User and Movie IDs

In [73]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
ratings['userId'] = user_encoder.fit_transform(ratings['userId'])
movies['movieId'] = movie_encoder.fit_transform(movies['movieId'])


### Prepare Training Data

In [75]:
df = ratings.merge(movies, on="movieId", how="left").dropna()
feature_columns = ['userId'] + [col for col in movies.columns if col not in ['movieId', 'title']]

X = torch.tensor(df[feature_columns].values, dtype=torch.float32)
y = torch.tensor(df['rating'].values, dtype=torch.float32).view(-1, 1)

scaler = StandardScaler()
X = torch.tensor(scaler.fit_transform(X), dtype=torch.float32)


# 1️⃣ Content-Based Filtering with Deep Learning

### Define Neural Network

In [78]:
class MovieRecommender(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super(MovieRecommender, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x


### Initialize Model

In [80]:
model = MovieRecommender(X.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

### Training Loop

In [82]:
epochs = 1000
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    if torch.isnan(loss):
        print("🚨 NaN detected in loss! Stopping training.")
        break
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print("✅ Training Complete!")

Epoch 0, Loss: 13.019725799560547
Epoch 100, Loss: 1.120574712753296
Epoch 200, Loss: 1.0576012134552002
Epoch 300, Loss: 1.0493029356002808
Epoch 400, Loss: 1.0448155403137207
Epoch 500, Loss: 1.0433131456375122
Epoch 600, Loss: 1.040980577468872
Epoch 700, Loss: 1.0397700071334839
Epoch 800, Loss: 1.0377016067504883
Epoch 900, Loss: 1.0374125242233276
✅ Training Complete!


# 2️⃣ Collaborative Filtering with SVD

In [84]:
reader = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15df79a8500>

# 🔹Hybrid Recommendation System

In [86]:
def get_hybrid_recommendations(user_id, top_n=10, alpha=0.5):
    user_idx = user_encoder.transform([user_id])[0]
    unrated_movies =  movies[~movies['movieId'].isin(ratings['movieId'][ratings['userId']==user_id])]
    candidate_movies = unrated_movies.copy()
    candidate_movies['userId'] = user_idx

    feature_columns = ['userId'] + [col for col in movies.columns if col not in ['movieId', 'title']]
    X_candidate = torch.tensor(scaler.transform(candidate_movies[feature_columns]), dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        candidate_movies['content_score'] = model(X_candidate).numpy().flatten()
    
    candidate_movies['svd_score'] = candidate_movies['movieId'].apply(lambda x: svd.predict(user_idx, x).est)
    candidate_movies['hybrid_score'] = alpha * candidate_movies['content_score'] + (1 - alpha) * candidate_movies['svd_score']
    
    top_recommendations = candidate_movies.sort_values(by='hybrid_score', ascending=False).head(top_n)
    
    return top_recommendations[['movieId', 'title', 'content_score', 'svd_score', 'hybrid_score']]

# Example Usage
user_id = 1 # Replace with actual user ID
recommendations = get_hybrid_recommendations(user_id)
print("Top Hybrid Recommendations:")
recommendations

Top Hybrid Recommendations:




Unnamed: 0,movieId,title,content_score,svd_score,hybrid_score
898,898,Star Wars: Episode V - The Empire Strikes Back...,4.250794,5.0,4.625397
527,527,"Aristocats, The (1970)",4.116347,5.0,4.558173
6016,6016,Kiss Kiss Bang Bang (2005),4.096378,5.0,4.548189
5618,5618,Dark Portals: The Chronicles of Vidocq (Vidoc...,4.073554,4.986054,4.529804
2571,2571,Teenage Mutant Ninja Turtles II: The Secret of...,4.092148,4.908234,4.500191
3275,3275,"10th Victim, The (La decima vittima) (1965)",4.086528,4.87752,4.482024
2028,2028,"South Park: Bigger, Longer and Uncut (1999)",4.061604,4.872762,4.467183
913,913,"Third Man, The (1949)",4.087634,4.79863,4.443132
4973,4973,Oklahoma! (1955),3.879189,5.0,4.439594
951,951,Chinatown (1974),4.066905,4.782972,4.424938


In [87]:
import requests
# Implement the function to take new_id value map it back to original id and then query the API and return the movie title
def get_titles(movie_id):
    #print("index_id",index_id)
    imdbId = links['imdbId'][links['movieId']==movie_id].values[0]
    #print("imdbId",imdbId)

    #url = "https://www.imdb.com/title/tt0114709/"
    # Define headers to make the request look like it's coming from a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    
    response = requests.get(f"https://www.omdbapi.com/?i=tt0{imdbId}&apikey=56280332", headers=headers)
    
    if response.status_code == 200:
        #print("Request successful!")
        data = response.json()
        #print(data)  # This will print the HTML content of the page
        return data.get("Title", "Title not found")
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return f"Failed to retrieve the page. Status code: {response.status_code}"

In [88]:
get_titles(50)

'The Usual Suspects'