In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#To create the model for recommendation system
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [66]:
import sklearn as skl

# Print the scikit-learn version
print("scikit-learn version:", skl.__version__)


scikit-learn version: 1.2.2


In [43]:
netflix_titles=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NIA_MINIPROJECT/titles.csv')

In [44]:
df = pd.DataFrame(netflix_titles)

In [45]:
df['seasons'].fillna(0, inplace=True)

In [46]:
df.isnull().sum()

id                         0
title                      1
type                       0
description               18
release_year               0
age_certification       2619
runtime                    0
genres                     0
production_countries       0
seasons                    0
imdb_id                  403
imdb_score               482
imdb_votes               498
tmdb_popularity           91
tmdb_score               311
dtype: int64

In [47]:
if df['age_certification'].isnull().any():
    df.drop('age_certification', axis=1, inplace=True)

In [48]:
df.dropna(subset=['imdb_id'], inplace=True)
df.dropna(subset=['description'], inplace=True)

In [49]:
duplicate_rows = df[df.duplicated()]

if not duplicate_rows.empty:
    df.drop_duplicates(inplace=True)

In [50]:
numeric_features = ['release_year', 'runtime', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']

# replacing missing values with mean
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

#transformer for scaling numerical features
numeric_transformer = StandardScaler()

# Applying transformation to the numerical columns in the dataset
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features)
])

In [51]:
index = df["imdb_score"] > 9
df_high_scores = df[index]
df = df[~index]
#outliers are removed from the dataset

# **Predictive Analysis**

# **1.Developng recommendation system using Term Frequency-Inverse Document Frequency (TF-IDF)**

**Resetting indexes of the dataframe:**
Resetting the index of the DataFrame ensures that the indices of the DataFrame match the indices used in the cosine similarity matrix. This alignment is crucial because any misalignment can cause "Index out of bounds" errors when accessing elements in the cosine similarity matrix.

In [52]:
#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(5429, 20357)

In [53]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [54]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [55]:

df.reset_index(drop=True, inplace=True)

# Combine selected features into a single string using .loc to avoid SettingWithCopyWarning
df.loc[:, 'combined_features'] = df['title'].str.lower() + ' ' + df['description'] + ' ' + df['production_countries']

# Feature Extraction using TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Cosine Similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Series with titles as index
indices = pd.Series(df.index, index=df['title'].str.lower()).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim):
    title_lower = title.lower()

    if title_lower not in indices:
        return f"Title '{title}' not found in the dataset"

    idx = indices[title_lower]

    # pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    #similarity scores based on the score value
    sim_scores = sorted(sim_scores, key=lambda x: np.any(x[1]), reverse=True)

    # top 10 most similar movies
    movie_indices = [i[0] for i in sim_scores[1:11]]

    # Return the titles
    return df['title'].iloc[movie_indices]

test_title = 'Titanic'
recommendations = get_recommendations(test_title)

print(f"Recommendations for '{test_title}':")
print(recommendations)


Recommendations for 'Titanic':
5                      Life of Brian
8                    The Blue Lagoon
13                     Cairo Station
16                  Alexandria… Why?
20               Alibaba Aur 40 Chor
21                   The Blazing Sun
22                       Dark Waters
26                 Beirut, Oh Beirut
27    The Return of the Prodigal Son
29                        Manoranjan
Name: title, dtype: object


Example 2 for movie recommendation

In [56]:
test_title = 'Bridgerton'
recommendations = get_recommendations(test_title)

print(f"Recommendations for '{test_title}':")
print(recommendations)

Recommendations for 'Bridgerton':
16                         Alexandria… Why?
17                                 The Land
27           The Return of the Prodigal Son
30                                    Ujala
31                                 Whispers
35                               GoodFellas
37              Once Upon a Time in America
48    National Lampoon's Christmas Vacation
60                   Waiting for the Hearse
65                                Agneepath
Name: title, dtype: object


Example 3 for movie recommendation

In [57]:
test_title = 'Peaky blinders'
recommendations = get_recommendations(test_title)

print(f"Recommendations for '{test_title}':")
print(recommendations)

Recommendations for 'Peaky blinders':
3                    The Dirty Dozen
4       Monty Python's Flying Circus
5                      Life of Brian
9               The Guns of Navarone
11    Richard Pryor: Live in Concert
13                     Cairo Station
16                  Alexandria… Why?
17                          The Land
27    The Return of the Prodigal Son
28                       Khoon Khoon
Name: title, dtype: object


# **MODEL 2**
# **2.Predicting Movie Hits using Random Forest Classification**

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

#condition for data to be included in the hit column
df['hit'] = (df['imdb_score'] > 7.0) & (df['tmdb_score'] > 7.0) & (df['imdb_votes'] > 10000) & (df['tmdb_popularity'] > 50)

# Convert boolean to integer
df.loc[:, 'hit'] = df['hit'].astype(int)

X = df[['imdb_score', 'imdb_votes', 'tmdb_score', 'tmdb_popularity']]
y = df['hit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Prediction on the test set
y_pred = classifier.predict(X_test)

#evaluating model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')


Accuracy: 0.998158379373849
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1061
           1       1.00      0.92      0.96        25

    accuracy                           1.00      1086
   macro avg       1.00      0.96      0.98      1086
weighted avg       1.00      1.00      1.00      1086



In [59]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X, y)

new_movie_name = 'The Door'

new_movie_data = pd.DataFrame({
    'imdb_score': [3.5],
    'imdb_votes': [1000],
    'tmdb_score': [3.0],
    'tmdb_popularity': [80.0]
})

# Predict whether the new movie will be a hit
new_prediction = classifier.predict(new_movie_data)

if new_prediction[0] == 1:
    print(f"Movie '{new_movie_name}' is predicted to be a hit!")
else:
    print(f"Movie '{new_movie_name}' is not predicted to be a hit.")


Movie 'The Door' is not predicted to be a hit.


Example for a movie being hit

In [60]:
new_movie_name = 'The Call'

new_movie_data = pd.DataFrame({
    'imdb_score': [8.2],
    'imdb_votes': [100000],
    'tmdb_score': [7.6],
    'tmdb_popularity': [142.5]
})

# Predict whether the new movie will be a hit
new_prediction = classifier.predict(new_movie_data)


if new_prediction[0] == 1:
    print(f"Movie '{new_movie_name}' is predicted to be a hit!")
else:
    print(f"Movie '{new_movie_name}' is not predicted to be a hit.")


Movie 'The Call' is predicted to be a hit!


# **Saving the trained model**

In [61]:
import pickle

In [62]:
file_name='trained_model.sav'
pickle.dump(classifier,open(file_name,'wb'))

Loading the saved model

In [63]:
loaded_model=pickle.load(open('trained_model.sav','rb'))

In [64]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X, y)

new_movie_name = 'The Call'

new_movie_data = pd.DataFrame({
    'imdb_score': [8.2],
    'imdb_votes': [100000],
    'tmdb_score': [7.6],
    'tmdb_popularity': [142.5]
})

# Predict whether the new movie will be a hit
new_prediction = loaded_model.predict(new_movie_data)


if new_prediction[0] == 1:
    print(f"Movie '{new_movie_name}' is predicted to be a hit!")
else:
    print(f"Movie '{new_movie_name}' is not predicted to be a hit.")


Movie 'The Call' is predicted to be a hit!
