In [1]:
import pandas as pd
import requests

In [5]:
api_key = "d35ce2b0877171b92857991dda69b375"

def fetch_data(api_key, page_limit=10):
    all_movies = []

    for page in range(1, page_limit + 1):
        print(f"Fetching data from page {page}")
        url = f"https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page={page}"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            continue
        
        results = response.json().get("results", [])

        for movie in results:
            try:
                movie_id = movie['id']
                details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&append_to_response=credits"
                details_response = requests.get(details_url).json()

                title = details_response.get("title", "")
                genres = ", ".join([g['name'] for g in details_response.get("genres", [])])
                synopsis = details_response.get("overview", "")
                rating = details_response.get("vote_average", 0)

                credits = details_response.get("credits", {})
                cast = credits.get("cast", [])
                crew = credits.get("crew", [])

                actors = ", ".join([person['name'] for person in cast[:3]])
                directors = ", ".join([person['name'] for person in crew if person['job'] == "Director"])

                all_movies.append({
                    'title': title,
                    'genres': genres,
                    'actors': actors,
                    'directors': directors,
                    'synopsis': synopsis,
                    'rating': rating
                })

            except Exception as e:
                print(f"Error in fetching data for movie id {movie_id}: {e}")
                continue

    return pd.DataFrame(all_movies)

# Fetch movie data and save to CSV
movie_df = fetch_data(api_key, page_limit=10)
movie_df.to_csv('movies_dataset.csv', index=False)
print("Movie dataset has been created successfully.")


Fetching data from page 1
Fetching data from page 2
Fetching data from page 3
Fetching data from page 4
Fetching data from page 5
Fetching data from page 6
Fetching data from page 7
Fetching data from page 8
Fetching data from page 9
Fetching data from page 10
Movie dataset has been created successfully.


In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
data = pd.read_csv("movies_dataset.csv")
data.fillna('',inplace=True)
data.head()

Unnamed: 0,title,genres,actors,directors,synopsis,rating
0,Sonic the Hedgehog 3,"Action, Science Fiction, Comedy, Family","Jim Carrey, Ben Schwartz, Keanu Reeves",Jeff Fowler,"Sonic, Knuckles, and Tails reunite against a p...",7.775
1,Kraven the Hunter,"Action, Adventure, Thriller","Aaron Taylor-Johnson, Ariana DeBose, Fred Hech...",J.C. Chandor,Kraven Kravinoff's complex relationship with h...,6.602
2,Moana 2,"Animation, Adventure, Family, Comedy","Auliʻi Cravalho, Dwayne Johnson, Hualālai Chung","David G. Derrick Jr., Jason Hand, Dana Ledoux ...",After receiving an unexpected call from her wa...,7.219
3,Back in Action,"Action, Comedy","Cameron Diaz, Jamie Foxx, McKenna Roberts",Seth Gordon,Fifteen years after vanishing from the CIA to ...,6.626
4,Mufasa: The Lion King,"Adventure, Family, Animation","Aaron Pierre, Kelvin Harrison Jr., Tiffany Boone",Barry Jenkins,"Mufasa, a cub lost and alone, meets a sympathe...",7.425


In [13]:
avg_actor_rating = data.groupby('actors')['rating'].mean().to_dict()
avg_director_rating = data.groupby('directors')['rating'].mean().to_dict()

data['avg_actor_rating'] = data['actors'].map(avg_actor_rating)
data['avg_director_rating'] = data['directors'].map(avg_director_rating)

In [14]:
data.head()

Unnamed: 0,title,genres,actors,directors,synopsis,rating,avg_actor_rating,avg_director_rating
0,Sonic the Hedgehog 3,"Action, Science Fiction, Comedy, Family","Jim Carrey, Ben Schwartz, Keanu Reeves",Jeff Fowler,"Sonic, Knuckles, and Tails reunite against a p...",7.775,7.775,7.521667
1,Kraven the Hunter,"Action, Adventure, Thriller","Aaron Taylor-Johnson, Ariana DeBose, Fred Hech...",J.C. Chandor,Kraven Kravinoff's complex relationship with h...,6.602,6.602,6.602
2,Moana 2,"Animation, Adventure, Family, Comedy","Auliʻi Cravalho, Dwayne Johnson, Hualālai Chung","David G. Derrick Jr., Jason Hand, Dana Ledoux ...",After receiving an unexpected call from her wa...,7.219,7.219,7.219
3,Back in Action,"Action, Comedy","Cameron Diaz, Jamie Foxx, McKenna Roberts",Seth Gordon,Fifteen years after vanishing from the CIA to ...,6.626,6.626,6.626
4,Mufasa: The Lion King,"Adventure, Family, Animation","Aaron Pierre, Kelvin Harrison Jr., Tiffany Boone",Barry Jenkins,"Mufasa, a cub lost and alone, meets a sympathe...",7.425,7.425,7.425


In [15]:
sentence_model = SentenceTransformer("all-miniLM-L6-v2")
synopsis_ambeddings = sentence_model.encode(data['synopsis'].tolist())

genre_ohe= pd.get_dummies(data['genres'])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
X = np.hstack((
    synopsis_ambeddings,
    genre_ohe.values,
    data[['avg_actor_rating','avg_director_rating']].values
))
Y = data['rating']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
xgb_model= XGBRegressor(
    n_estimators = 300,
    learning_rate = 0.03,
    max_depth= 7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [18]:
xgb_model.fit(X_train,Y_train)
y_pred = xgb_model.predict(X_test)

In [19]:
r2 = r2_score(Y_test,y_pred)
mse = mean_squared_error(Y_test,y_pred)

print(f"r2 score {r2:.4f}")
print(f"Mean Squared Erorr: {mse:.4f}")

r2 score 0.8651
Mean Squared Erorr: 0.0992


In [20]:
pickle.dump(xgb_model, open("movie_rating_model.pkl",'wb'))
pickle.dump(sentence_model, open("Sentence_model.pkl",'wb'))
pickle.dump(genre_ohe.columns.to_list(),open("genre_columns.pkl",'wb'))
pickle.dump(avg_actor_rating, open("avg_actor_rating.pkl",'wb'))
pickle.dump(avg_director_rating, open("avg_director_rating.pkl", 'wb'))