In [11]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting numpy>=1.19.5
  Downloading numpy-1.24.4-cp38-cp38-win_amd64.whl (14.9 MB)
Collecting scipy>=1.6.0
  Downloading scipy-1.10.1-cp38-cp38-win_amd64.whl (42.2 MB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (PEP 517): started
  Building wheel for scikit-surprise (PEP 517): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp38-cp38-win_amd64.whl size=1292930 sha256=f73965f9e09938af3306cadd99ccd3d975dced41d04e6b5e1431ecf1fa80

In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [16]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4


In [19]:
!pip install --upgrade pandas

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl (10.8 MB)
Collecting python-dateutil>=2.8.2
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Collecting tzdata>=2022.1
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: python-dateutil, tzdata, pandas
  Attempting uninstall: python-dateutil
    Found existing installation: python-dateutil 2.8.1
    Uninstalling python-dateutil-2.8.1:
      Successfully uninstalled python-dateutil-2.8.1
  Attempting uninstall: pandas
    Found existing installation: pandas 1.1.3
    Uninstalling pandas-1.1.3:
      Successfully uninstalled pandas-1.1.3
Successfully installed pandas-2.0.3 python-dateutil-2.9.0.post0 tzdata-2025.2


In [20]:
# === 1. Import Libraries ===
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import os

In [39]:
# Movie Recommendation System with Belief Elicitation Data
# Author: Manas Mondal
# BBC Data Scientist Role Project

# === 2. Load Datasets ===
# The data-release directory is resolved once. Set the ML_BELIEF_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the original local path, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'ML_BELIEF_DATA_DIR',
    r'C:\Users\Manas\Downloads\ml_belief_2024_data_release_2\data_release',
)

movies = pd.read_csv(os.path.join(DATA_DIR, 'movies.csv'))
ratings = pd.read_csv(os.path.join(DATA_DIR, 'user_rating_history.csv'))
beliefs = pd.read_csv(os.path.join(DATA_DIR, 'belief_data.csv'))
recommendations = pd.read_csv(os.path.join(DATA_DIR, 'user_recommendation_history.csv'))


In [44]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# === 3. Merge and Preprocess ===
# Use ratings + beliefs.
# The rename is a no-op when this cell is re-run (the column is already renamed).
ratings = ratings.rename(columns={'rating': 'actual_rating'})
belief_cols = beliefs[['userId', 'movieId', 'userPredictRating', 'userCertainty']]
data = (
    ratings
    .merge(belief_cols, on=['userId', 'movieId'], how='left')
    .merge(movies[['movieId', 'genres']], on='movieId', how='left')
)

# Feature Engineering
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is not guaranteed to modify
# `data` (it stops working under pandas Copy-on-Write).
# NOTE(review): missing userPredictRating is filled with the mean of the
# *target* column, computed over all rows before the train/test split —
# confirm this is intended.
data['userPredictRating'] = data['userPredictRating'].fillna(data['actual_rating'].mean())
data['userCertainty'] = data['userCertainty'].fillna(data['userCertainty'].mean())
data['genre_count'] = data['genres'].apply(lambda x: len(str(x).split('|')) if pd.notnull(x) else 0)

# Drop unnecessary columns
data = data.drop(columns=['genres'])

# Important: Clean target column
# Rows whose rating is missing or infinite are dropped rather than
# mean-imputed: an imputed label is a fabricated rating that the model would
# be trained and evaluated on. np.isfinite is False for NaN as well as +/-inf.
data = data[np.isfinite(data['actual_rating'])]

# === Quick Check (optional but recommended) ===
# Count missing values in the model inputs and verify the target is clean.
check_cols = ['userId', 'movieId', 'userPredictRating', 'userCertainty', 'genre_count']
feature_nan_total = data[check_cols].isnull().sum().sum()
target_nan_total = data['actual_rating'].isnull().sum()
target_has_inf = np.isinf(data['actual_rating']).any()

print(f"Number of NaNs in features: {feature_nan_total}")
print(f"Number of NaNs in target: {target_nan_total}")
print(f"Any Infinities in target? {target_has_inf}")

# === 4. Feature Matrix and Target ===
# Column order here is the order the model sees, because the frame is later
# converted to a bare array.
features = ['userId', 'movieId', 'userPredictRating', 'userCertainty', 'genre_count']
X = data.loc[:, features]
y = data.loc[:, 'actual_rating']

# Train-test split: hold out 20% of the rows, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# === 5. Train Model ===
# Hyperparameters are collected in one mapping so they can be read at a glance.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on bare arrays (no column names); prediction code must pass columns in
# the same order as the training matrix.
train_matrix = X_train.values
train_target = y_train.values
model.fit(train_matrix, train_target)

# === 6. Evaluate Model ===
# Score the held-out rows and report root-mean-squared and mean-absolute error.
test_matrix = X_test.values
y_pred = model.predict(test_matrix)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

for metric_name, metric_value in (("RMSE", rmse), ("MAE", mae)):
    print(f"{metric_name}: {metric_value:.4f}")


Number of NaNs in features: 0
Number of NaNs in target: 0
Any Infinities in target? False
RMSE: 1.4157
MAE: 1.0598


In [52]:
# Persist the fitted model to a local directory, then read it back so the
# rest of the notebook uses the deserialized copy.
model_dir = 's3_bucket/models'
model_path = f'{model_dir}/movie_recommender.pkl'

os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, model_path)
model = joblib.load(model_path)

In [53]:
def recommend(user_id, top_n=5, feature_frame=None, predictor=None, movie_catalog=None):
    """Return the highest-scoring movies for one user.

    Parameters
    ----------
    user_id
        Value matched against the ``userId`` column of ``feature_frame``.
    top_n : int, default 5
        Maximum number of rows to return.
    feature_frame : pandas.DataFrame, optional
        Feature rows to score. Must contain ``userId`` and ``movieId``; every
        column is passed to the predictor, in order, as a bare array.
        Defaults to the notebook-level ``X``.
    predictor : optional
        Object exposing ``predict(array)``. Defaults to the notebook-level
        ``model``.
    movie_catalog : pandas.DataFrame, optional
        Frame with ``movieId`` and ``title`` columns used to attach titles.
        Defaults to the notebook-level ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``predicted_rating``, sorted by predicted rating
        in descending order, with at most one row per movie. Empty (with the
        same two columns) when the user has no rows in ``feature_frame``.

    Notes
    -----
    Only rows already present in ``feature_frame`` are scored.
    NOTE(review): with the notebook-level ``X`` these rows come from the
    ratings history, i.e. movies the user has already rated — confirm that
    ranking those is the intended behaviour.
    """
    # Fall back to the notebook globals only when no explicit input is given,
    # so existing calls such as recommend(user_id, top_n) keep working.
    feature_frame = X if feature_frame is None else feature_frame
    predictor = model if predictor is None else predictor
    movie_catalog = movies if movie_catalog is None else movie_catalog

    user_rows = feature_frame[feature_frame['userId'] == user_id]
    if user_rows.empty:
        # Unknown user: return an explicitly empty result instead of handing
        # a zero-row matrix to the predictor.
        return pd.DataFrame({
            'title': pd.Series(dtype='object'),
            'predicted_rating': pd.Series(dtype='float64'),
        })

    scored = user_rows.assign(predicted_rating=predictor.predict(user_rows.values))
    ranked = (
        scored
        .sort_values('predicted_rating', ascending=False)
        # Keep only the best-scoring row per movie so a title cannot be
        # listed more than once.
        .drop_duplicates(subset='movieId')
        .head(top_n)
    )

    # Merge with movie titles; de-duplicate the catalog first so a repeated
    # movieId there cannot multiply result rows.
    titles = movie_catalog[['movieId', 'title']].drop_duplicates(subset='movieId')
    titled = ranked.merge(titles, on='movieId', how='left')
    return titled[['title', 'predicted_rating']]


In [55]:
# List the distinct user ids present in the feature matrix.
known_user_ids = X['userId'].unique()
print(known_user_ids)

[ 42170  43715  44282 ... 410562 410566 410572]


In [58]:
# Demonstrate the recommender for one user id from the list printed above.
print("\nExample Recommendations:")
example_recs = recommend(user_id=44282, top_n=5)
print(example_recs)


Example Recommendations:
                                               title  predicted_rating
0                                   Inception (2010)          3.955132
1                   Shawshank Redemption, The (1994)          3.859077
2                         Clockwork Orange, A (1971)          3.853248
3  Star Wars: Episode VI - Return of the Jedi (1983)          3.853248
4                            Army of Darkness (1993)          3.842505


In [60]:
# Streamlit App: Movie Recommendation System
# Author: Manas Mondal

import streamlit as st
import pandas as pd
import joblib

# Prepare feature data (must match training process)
# NOTE(review): this relies on `ratings`, `beliefs`, `movies` (and later
# `model`) already existing in the kernel; nothing in this cell loads them, so
# it will not run as a standalone `streamlit run` script as written.
ratings = ratings.rename(columns={'rating': 'actual_rating'})
belief_cols = beliefs[['userId', 'movieId', 'userPredictRating', 'userCertainty']]
data = (
    ratings
    .merge(belief_cols, on=['userId', 'movieId'], how='left')
    .merge(movies[['movieId', 'title', 'genres']], on='movieId', how='left')
)

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is not guaranteed to modify
# `data` (it stops working under pandas Copy-on-Write).
data['userPredictRating'] = data['userPredictRating'].fillna(data['actual_rating'].mean())
data['userCertainty'] = data['userCertainty'].fillna(data['userCertainty'].mean())
data['genre_count'] = data['genres'].apply(lambda x: len(str(x).split('|')) if pd.notnull(x) else 0)

# Column order must match the order used when the model was fitted, because
# prediction passes a bare array.
X = data[['userId', 'movieId', 'userPredictRating', 'userCertainty', 'genre_count']]

# Streamlit App Layout: page title and a short introduction
st.title("🎬 Personalized Movie Recommendation System")

intro_text = """
This app provides personalized movie recommendations based on user behavior and belief elicitation insights. 
Select your **User ID** to view your top recommended movies!
"""
st.markdown(intro_text)

# User ID selection: offer every user id that has feature rows
unique_users = X['userId'].unique()
user_id = st.selectbox("Select User ID", options=unique_users)

# Number of recommendations: between 1 and 20, starting at 5
top_n = st.slider("Number of Recommendations", min_value=1, max_value=20, value=5)

# Recommendation function
# NOTE(review): this redefines the recommend() from an earlier cell; consider
# keeping a single definition.
def recommend(user_id, top_n=5, feature_frame=None, predictor=None, movie_catalog=None):
    """Return the highest-scoring movies for one user.

    Parameters
    ----------
    user_id
        Value matched against the ``userId`` column of ``feature_frame``.
    top_n : int, default 5
        Maximum number of rows to return.
    feature_frame : pandas.DataFrame, optional
        Feature rows to score. Must contain ``userId`` and ``movieId``; every
        column is passed to the predictor, in order, as a bare array.
        Defaults to the notebook-level ``X``.
    predictor : optional
        Object exposing ``predict(array)``. Defaults to the notebook-level
        ``model``.
    movie_catalog : pandas.DataFrame, optional
        Frame with ``movieId`` and ``title`` columns used to attach titles.
        Defaults to the notebook-level ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``predicted_rating``, sorted by predicted rating
        in descending order, with at most one row per movie. Empty (with the
        same two columns) when the user has no rows in ``feature_frame``.
    """
    # Fall back to the notebook globals only when no explicit input is given,
    # so the existing call recommend(user_id, top_n) keeps working.
    feature_frame = X if feature_frame is None else feature_frame
    predictor = model if predictor is None else predictor
    movie_catalog = movies if movie_catalog is None else movie_catalog

    user_rows = feature_frame[feature_frame['userId'] == user_id]
    if user_rows.empty:
        # Unknown user: return an explicitly empty result instead of handing
        # a zero-row matrix to the predictor.
        return pd.DataFrame({
            'title': pd.Series(dtype='object'),
            'predicted_rating': pd.Series(dtype='float64'),
        })

    scored = user_rows.assign(predicted_rating=predictor.predict(user_rows.values))
    ranked = (
        scored
        .sort_values('predicted_rating', ascending=False)
        # Keep only the best-scoring row per movie so a title cannot be
        # listed more than once.
        .drop_duplicates(subset='movieId')
        .head(top_n)
    )

    # De-duplicate the catalog so a repeated movieId cannot multiply rows.
    titles = movie_catalog[['movieId', 'title']].drop_duplicates(subset='movieId')
    titled = ranked.merge(titles, on='movieId', how='left')
    return titled[['title', 'predicted_rating']]

# Recommend button
if st.button("Get Recommendations"):
    # Use a dedicated name: `recommendations` already holds the
    # user_recommendation_history.csv frame loaded earlier in the notebook,
    # and rebinding it here would silently replace that dataset.
    top_movies = recommend(user_id, top_n)
    st.subheader(f"Top {top_n} Recommended Movies for User {user_id}")
    st.table(top_movies)

# Footer: horizontal rule followed by attribution
footer_text = """
---
Created by **Manas Mondal**  
Powered by XGBoost + Streamlit 🚀
"""
st.markdown(footer_text)


2025-04-27 16:50:02.262 
  command:

    streamlit run C:\Users\Manas\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-04-27 16:50:02.312 Session state does not function when running a script without `streamlit run`


DeltaGenerator()