In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    mean_absolute_error,
    
    mean_squared_error,
    r2_score
)

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


# Load Data
# The CSV path is relative, so it is resolved against the current working
# directory when the script runs.
data_dir = "Netflix_data"
file_path = os.path.join(data_dir, "netflix_titles.csv")
df = pd.read_csv(file_path)


# Preprocessing
# Discard every row that has a missing value in any of these columns.
required_columns = ["cast", "country", "duration", "rating"]
df = df.dropna(subset=required_columns)

def parse_duration(value: str, minutes_per_season: int = 60) -> int:
    """Convert a duration string into a single integer.

    The first whitespace-separated token of ``value`` is read as an integer.
    When the text contains ``"Season"``, that count is multiplied by
    ``minutes_per_season``; otherwise the count is returned unchanged.

    Args:
        value: Duration text such as ``"90 min"`` or ``"2 Seasons"``.
        minutes_per_season: Weight applied to each season so that season
            counts and minute counts share one numeric column. Defaults to
            60, the factor that was previously hard-coded here.

    Returns:
        The parsed count, scaled by ``minutes_per_season`` for season-based
        values.

    Raises:
        IndexError: If ``value`` contains no tokens (e.g. an empty string).
        ValueError: If the first token is not an integer literal.
    """
    # Test for the season marker first so non-string input fails the same
    # way it always has (on the membership test).
    counts_seasons = "Season" in value
    count = int(value.split()[0])
    if counts_seasons:
        return count * minutes_per_season
    return count

# Numeric duration derived from the raw "duration" text.
df["duration_mins"] = df["duration"].apply(parse_duration)

# Replace each categorical column with integer codes, keeping the fitted
# encoder per column so the codes can be mapped back to labels later.
label_encoders = {}
for col in ("type", "rating", "country"):
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le


# Features & Targets
# NOTE(review): "duration_mins" is computed differently for durations that
# mention "Season"; that may make "type" almost directly recoverable from
# this feature. Confirm this is intended for the classification task.
feature_columns = ["rating", "country", "duration_mins"]
X = df[feature_columns]
y_class = df["type"]    # classification target
y_reg = df["release_year"]  # regression target

# Split BEFORE scaling so the scaler never sees the held-out rows; fitting it
# on the full dataset leaks test-set statistics into preprocessing.
# A single call with both targets keeps features and labels row-aligned and
# uses the same test_size / random_state as before.
(
    X_train_raw,
    X_test_raw,
    y_train_cls,
    y_test_cls,
    y_train_reg,
    y_test_reg,
) = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=46)

# Learn mean/variance from the training rows only, then apply that same
# transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Both tasks share the same feature matrix and the same partition.
X_train_cls, X_test_cls = X_train_scaled, X_test_scaled
X_train_reg, X_test_reg = X_train_scaled, X_test_scaled

# Full scaled matrix, retained under its existing name for any later code
# that reads it; it now uses training-set statistics only.
X_scaled = scaler.transform(X)


# Classification
# The tree-based estimators use random numbers while fitting. They are seeded
# with the same value as the train/test split so repeated runs print the same
# metrics; the other estimators are left at their defaults.
classification_models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=46),
    "Random Forest": RandomForestClassifier(random_state=46),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
}

# Fit each candidate on the training split and report held-out metrics.
for name, model in classification_models.items():
    model.fit(X_train_cls, y_train_cls)
    preds = model.predict(X_test_cls)

    print(f"\n{name}")
    print("Accuracy:", accuracy_score(y_test_cls, preds))
    print("Confusion Matrix:\n", confusion_matrix(y_test_cls, preds))
    print("Classification Report:\n", classification_report(y_test_cls, preds))


# Regression
# As with the classifiers, the tree-based regressors are seeded (same value
# as the train/test split) so the printed errors are reproducible.
regression_models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=46),
    "Random Forest": RandomForestRegressor(random_state=46),
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
}

# Fit each candidate on the training split and report held-out error metrics.
for name, model in regression_models.items():
    model.fit(X_train_reg, y_train_reg)
    preds = model.predict(X_test_reg)

    print(f"\n{name}")
    print("MAE:", mean_absolute_error(y_test_reg, preds))
    # RMSE is the square root of the mean squared error.
    print("RMSE:", np.sqrt(mean_squared_error(y_test_reg, preds)))
    print("R2 Score:", r2_score(y_test_reg, preds))



Logistic Regression
Accuracy: 0.7486301369863013
Confusion Matrix:
 [[1029   38]
 [ 329   64]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.96      0.85      1067
           1       0.63      0.16      0.26       393

    accuracy                           0.75      1460
   macro avg       0.69      0.56      0.55      1460
weighted avg       0.72      0.75      0.69      1460


Decision Tree
Accuracy: 0.9924657534246575
Confusion Matrix:
 [[1058    9]
 [   2  391]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      1067
           1       0.98      0.99      0.99       393

    accuracy                           0.99      1460
   macro avg       0.99      0.99      0.99      1460
weighted avg       0.99      0.99      0.99      1460


Random Forest
Accuracy: 0.9931506849315068
Confusion Matrix:
 [[1058    9]
 [   1  392]]
Classification Report: