# Predicting Netflix Show Success
Welcome to your COMP442 project notebook.

In [None]:
# Upload the dataset file ('netflix_merged.csv') into the Colab session.
# The loading cell below reads the file by that exact name, so the uploaded
# file has to be called 'netflix_merged.csv'.
from google.colab import files
# The return value is kept in `uploaded`; it is not referenced again in the
# visible cells.
uploaded = files.upload()

In [None]:
import pandas as pd

# Load the uploaded CSV from the working directory into `df`.
df = pd.read_csv('netflix_merged.csv')
# Remove pandas' column display limit so previews show every column.
pd.set_option('display.max_columns', None)
# Preview the first rows of the dataset.
df.head()

In [None]:
# Keep working with the DataFrame loaded above: `df` is rebound to a copy of it.
df = df.copy()  # `df` is the working frame; this notebook has no separate `merged_df`

In [None]:
# Build the feature matrix from the Genre, type and release_year columns.
# pd.get_dummies only expands object/string/category columns by default, so
# release_year is one-hot encoded only if it is stored as text; a numeric
# release_year is passed through unchanged (dtype not visible here -- TODO confirm).
# Each distinct `Genre` string is treated as one category by get_dummies.
df_encoded = pd.get_dummies(df[['Genre', 'type', 'release_year']])
X = df_encoded
# Target label.
# NOTE(review): the only place this notebook creates `hit` is the final
# pipeline cell -- confirm the CSV already contains a `hit` column, otherwise
# this line raises KeyError when the cells are run top to bottom.
y = df['hit']



In [None]:
from sklearn.model_selection import train_test_split
# Training and testing: hold out 30% of the rows, with a fixed seed so the
# split is repeatable. Stratifying on the label keeps the hit / non-hit ratio
# the same in both parts, so neither class can go missing from the test set
# (this mirrors the stratified split used for the pipeline model further down).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the one-hot encoded features.
# A fixed seed makes the fitted forest (and every metric and plot derived from
# it) reproducible between runs, consistent with the seeded train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Text summary of the classification metrics on the test split.
print(classification_report(y_test, y_pred))
# labels=[0, 1] pins the confusion matrix to classes 0 and 1, in that order.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=[0, 1]))
print("Accuracy:", accuracy_score(y_test, y_pred))
# Class counts over the full label column `y` (train and test rows together),
# not just the test split.
print("Label Distribution:\n", y.value_counts())


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the fitted model's feature importances, one bar per
# column of X.
# NOTE(review): `model` is rebound in the final pipeline cell to a forest
# trained on different features; run this cell while `model` is still the
# baseline, or the importances will not line up with X.columns.
importances = model.feature_importances_
feature_names = X.columns

plt.figure(figsize=(8, 6))
plt.barh(feature_names, importances)
plt.title("Feature Importance")
plt.xlabel("Importance")
# The x-axis is limited to [0, 0.05]: bars longer than 0.05 are cut off at the
# right edge so the small importances stay readable.
plt.xlim(0, 0.05)  #Zoom into smaller features
plt.tight_layout()
plt.show()

In [None]:
# Histogram of IMDb ratings in 10 bins; labels are set on the axes that
# pandas returns from the plot call.
rating_hist_axes = df['IMDB_Rating'].hist(bins=10)
rating_hist_axes.set_title('Distribution of IMDb Ratings')
rating_hist_axes.set_xlabel('IMDb Rating')
rating_hist_axes.set_ylabel('Frequency')
plt.show()

In [None]:
import seaborn as sns


# One box of IMDb ratings per value of `type`.
type_box_axes = sns.boxplot(data=df, x='type', y='IMDB_Rating')
type_box_axes.set_title('Boxplot of IMDb Ratings by Type')
type_box_axes.set_xlabel('Type')
type_box_axes.set_ylabel('IMDb Rating')
plt.show()

In [None]:
# Kernel density estimate of the IMDb rating column.
rating_kde_axes = df['IMDB_Rating'].plot(kind='kde')
rating_kde_axes.set_title('KDE Plot of IMDb Ratings')
rating_kde_axes.set_xlabel('IMDb Rating')
plt.show()

In [None]:
# Scatter of IMDb rating against release year, drawn on the current axes.
year_rating_axes = plt.gca()
year_rating_axes.scatter(df['release_year'], df['IMDB_Rating'])
year_rating_axes.set_title('Rating vs. Release Year')
year_rating_axes.set_xlabel('Release Year')
year_rating_axes.set_ylabel('IMDb Rating')
plt.show()

In [None]:
# Correlation between release year and IMDb rating, shown as an annotated
# heatmap.
corr = df[['release_year', 'IMDB_Rating']].corr()
corr_heatmap_axes = sns.heatmap(data=corr, annot=True, cmap='coolwarm')
corr_heatmap_axes.set_title('Correlation Matrix')
plt.show()

In [None]:

import re, joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

#--- merged dataframe here ---
# Rows without an IMDb rating cannot be labelled: `NaN >= 8.0` evaluates to
# False, so they would silently be counted as non-hits. Drop them before
# deriving the label (the .copy() keeps `df` an independent frame).
df = df.dropna(subset=["IMDB_Rating"]).copy()

# Label: IMDb >= 8.0 -> hit (1) else 0
df["hit"] = (df["IMDB_Rating"] >= 8.0).astype(int)

# Minimal, consistent genre cleaner
def clean_genres(s):
    """Normalise a comma-separated genre string.

    Missing values give an empty string. Otherwise the value is converted to
    text and split on commas; each piece is trimmed, lower-cased and has its
    internal whitespace runs collapsed to a single space. Empty pieces and
    duplicates are discarded, and the remaining genres are returned sorted
    and joined with commas.
    """
    if pd.isna(s):
        return ""
    unique_genres = set()
    for piece in str(s).split(","):
        trimmed = piece.strip()
        if not trimmed:
            continue
        unique_genres.add(re.sub(r"\s+", " ", trimmed.lower()))
    return ",".join(sorted(unique_genres))

# Normalised genre string per row; clean_genres returns "" for missing genres,
# so this column never contains NaN.
df["genres_clean"] = df["Genre"].apply(clean_genres)
# Trimmed, lower-cased copy of `type`.
df["type_clean"] = df["type"].str.strip().str.lower()
# Drop rows that lack a type or a release year.
df = df.dropna(subset=["type_clean","release_year"])

# Split: 80% train / 20% test, stratified on the `hit` label, fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["hit"], random_state=42)

# Custom transformer
class GenresMultiHot(BaseEstimator, TransformerMixin):
    """Multi-hot encode a comma-separated ``genres_clean`` column.

    ``fit`` learns the sorted set of genre tokens found in
    ``X["genres_clean"]``; ``transform`` returns one 0/1 integer column per
    learned token. Tokens that were not seen during ``fit`` are ignored.

    Attributes set by ``fit``:
        vocab_: sorted list of genre tokens, one per output column.
    """

    # There is deliberately no __init__: the transformer has no
    # hyper-parameters, and scikit-learn expects trailing-underscore
    # attributes such as ``vocab_`` to exist only after ``fit`` so that
    # check_is_fitted can tell fitted and unfitted instances apart.

    @staticmethod
    def _tokens(cell):
        """Split one ``genres_clean`` value into its non-empty tokens."""
        return [t for t in cell.split(",") if t]

    def fit(self, X, y=None):
        """Learn the genre vocabulary.

        Args:
            X: table indexable by column name that has a ``genres_clean``
                column of comma-separated genre strings.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.vocab_ = sorted(
            {g for cell in X["genres_clean"] for g in self._tokens(cell)}
        )
        return self

    def transform(self, X):
        """Return the multi-hot matrix for ``X["genres_clean"]``.

        Args:
            X: table with a ``genres_clean`` column, as in ``fit``.

        Returns:
            Integer array of shape ``(len(X), len(self.vocab_))`` holding 1
            where the row contains the column's genre and 0 elsewhere.

        Raises:
            sklearn.exceptions.NotFittedError: if called before ``fit``.
        """
        import numpy as np
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "vocab_")
        # Map genre -> column once, instead of scanning the vocabulary list
        # for every token. Built from ``vocab_`` here so that instances
        # fitted or pickled by the earlier implementation keep working.
        column_of = {genre: j for j, genre in enumerate(self.vocab_)}
        out = np.zeros((len(X), len(self.vocab_)), dtype=int)
        for i, cell in enumerate(X["genres_clean"]):
            for g in self._tokens(cell):
                j = column_of.get(g)
                if j is not None:  # genres unseen during fit are skipped
                    out[i, j] = 1
        return out

    def get_feature_names_out(self, input_features=None):
        """Return the output column names: the learned genre tokens, in order.

        Args:
            input_features: ignored; accepted for scikit-learn API
                compatibility.
        """
        import numpy as np
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "vocab_")
        return np.asarray(self.vocab_, dtype=object)

# Instance of the custom multi-hot genre encoder defined above.
genre_transformer = GenresMultiHot()

# One-hot encoder for `type_clean`, configured with handle_unknown="ignore"
# so categories not seen at fit time do not raise, and with dense
# (non-sparse) output.
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Route each input column to its encoder; any column not listed is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("genres", genre_transformer, ["genres_clean"]),
        ("type",   cat_transformer,   ["type_clean"]),
        ("year",   "passthrough",     ["release_year"]),  # forwarded unchanged
    ],
    remainder="drop"
)

# Seeded 300-tree forest using all cores, with class_weight="balanced".
# This rebinds the notebook-level name `model` used by the earlier cells.
model = RandomForestClassifier(
    n_estimators=300, class_weight="balanced", random_state=42, n_jobs=-1
)

# Preprocessing and classifier chained into one estimator, fitted on the
# training split only.
# NOTE(review): `test_df` from the split above is not used in the visible
# cells, so this pipeline is never scored on held-out data here.
pipeline = Pipeline(steps=[("prep", preprocess), ("clf", model)])
pipeline.fit(train_df[["genres_clean","type_clean","release_year"]], train_df["hit"])

# Persist the fitted pipeline to the working directory.
# NOTE(review): the saved object contains a GenresMultiHot instance;
# presumably the loading code must be able to import or define that class
# before calling joblib.load -- confirm.
joblib.dump(pipeline, "pipeline.joblib")
print("Saved: pipeline.joblib")
