In [1]:
#............................................................
#                     STEP - 1 
#............................................................

import os 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report,confusion_matrix , accuracy_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score 
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor 
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor 
from sklearn.svm import SVC , SVR
from sklearn.neighbors import KNeighborsClassifier , KNeighborsRegressor


#preprocessing -- cleans, transforms and prepares data before it is fed into a machine learning model
#Convert text to numbers - (LabelEncoder)
#Scale values to similar ranges - (StandardScaler)

#metrics  -- tools to evaluate the performance of the ml model 
#For classification: Accuracy, precision, confusion matrix
#For regression: MAE, RMSE, R²

#linear_model -- models that assume a linear relationship between the inputs and the output
# they predict the output as a straight-line (weighted sum) function of the inputs

# tree -- models that split the data using a flowchart of decision rules
# they work well without feature scaling

#ensemble -- models that combine many weak models into a single stronger one


#SVM -- a strong model that draws the optimal boundary between classes
#finds the best separating boundary even in high-dimensional or non-linear spaces
#best suited to small and medium datasets


#neighbors - predicts from the closest (nearest) data points in the training set
#best for simple problems and as a reference (baseline) model



#............................................................
#                     STEP - 2
#............................................................

# Build the path to the Netflix titles CSV (relative to the working directory)
# and read it into a DataFrame.
data_dir, csv_name = 'Netflix_data', 'netflix_titles.csv'
file_path = os.path.join(data_dir, csv_name)
df = pd.read_csv(file_path)
print("<<<<<<<<<<<<<<<<<<DATA SET IMPORTED SUCCESSFULLY>>>>>>>>>>>>>>>>>>>")



#............................................................
#                     STEP - 3 (clean and preprocess)
#............................................................


# Discard every row that has a missing value in any of these columns.
required_columns = ['cast', 'country', 'duration', 'rating']
df = df.dropna(subset=required_columns)

def parse_duration(value):
    """Convert a duration string into a single number.

    The leading whitespace-separated token of *value* is read as an integer.
    When the text contains 'Season' that count is multiplied by 60;
    otherwise the count is returned unchanged (treated as minutes).
    """
    is_seasons = 'Season' in value
    count = int(value.split()[0])
    return count * 60 if is_seasons else count
    
# Numeric duration column derived from the text in 'duration'.
df['duration_mins'] = df['duration'].apply(parse_duration)

# LabelEncoder turns each distinct text label in a column into an integer code.
# One encoder is created per column and kept in `label_encoder`, keyed by the
# column name; fit_transform learns the labels and replaces them in one step.
categorical_columns = ['type', 'rating', 'country']
label_encoder = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoder.items():
    df[col] = le.fit_transform(df[col])



#............................................................
#                     STEP - 4 (features and targets)
#............................................................
# Features (X) are the inputs given to the models; targets (y) are what the
# models must predict: `type` for classification, `release_year` for regression.
# NOTE(review): duration_mins is built from text that contains either "min" or
# "Season", so it probably reveals `type` almost directly -- confirm this
# feature is intended for the classification task.
features = df[['rating','country','duration_mins']]
target_class = df['type']
target_reg = df['release_year']

# train/test split -- done BEFORE scaling so the held-out rows never influence
# the scaler's mean/std (fitting the scaler on all rows leaks test-set
# information into training). One call splits the features and both targets
# with the same row selection, so the classification and regression tasks
# share identical train/test rows.
(x_train_raw, x_test_raw,
 y_train_cls, y_test_cls,
 y_train_reg, y_test_reg) = train_test_split(
    features, target_class, target_reg, test_size=0.2, random_state=46)

# scale -- learn the statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
x_train_cls = scaler.fit_transform(x_train_raw)
x_test_cls = scaler.transform(x_test_raw)

# Both tasks use the same feature matrix, so the regression names refer to the
# same scaled arrays.
x_train_reg, x_test_reg = x_train_cls, x_test_cls

# Kept for backward compatibility: every row scaled with the training-set
# statistics.
features_scale = scaler.transform(features)




#................................................................................
#                     STEP - 5 (Model Training and evaluation)
#................................................................................
#classification models 
# Classifiers to compare on the `type` target. The tree-based models use
# random numbers while training, so they are seeded with the same value as the
# train/test split (46); without a seed the printed scores change between runs.
classification_models = {
     "Logistic Regression": LogisticRegression(),
     "Decision Tree" : DecisionTreeClassifier(random_state=46),
     "Random Forest" : RandomForestClassifier(random_state=46),
     "KNN" : KNeighborsClassifier(),
     "SVM":SVC()
}

print("CLASSIFICATION MODELS RESULT:\n")
for name,model in classification_models.items():
    # Train on the training split, then score predictions on the held-out split.
    model.fit(x_train_cls,y_train_cls)
    preds = model.predict(x_test_cls)
    print(f".........{name}........")
    print("Accuracy:",accuracy_score(y_test_cls,preds))    # share of predictions that were correct
    print("Confusion matrix:",confusion_matrix(y_test_cls,preds))  # breakdown of correct/incorrect predictions per class
    print("Classification reports:",classification_report(y_test_cls,preds)) # precision, recall and F1-score per class


#Regression models 
# Regressors to compare on the `release_year` target. The tree-based models
# use random numbers while training, so they are seeded with the same value as
# the train/test split (46); without a seed the printed scores change between
# runs.
regression_models = {
    "Linear Regression" : LinearRegression(),
    "Decision Tree" : DecisionTreeRegressor(random_state=46),
    "Random Forest" : RandomForestRegressor(random_state=46),
    "SVM" : SVR(),
    "KNN":KNeighborsRegressor()
}

print(".......REGRESSION RESULTS ........")
for name,model in regression_models.items():
    # Train on the training split, then score predictions on the held-out split.
    model.fit(x_train_reg,y_train_reg)
    preds = model.predict(x_test_reg)
    print(f"........{name}..........")
    print("MAE:", mean_absolute_error(y_test_reg, preds))              # mean absolute error
    print("RMSE:", np.sqrt(mean_squared_error(y_test_reg, preds)))     # square root of the mean squared error
    print("R2 Score:", r2_score(y_test_reg,preds))



print("<<<<<<<<<<<<<<<<<<<< SUCCESSFULLY FINISHED >>>>>>>>>>>>>>>>>>>>>>>>")

<<<<<<<<<<<<<<<<<<DATA SET IMPORTED SUCCESSFULLY>>>>>>>>>>>>>>>>>>>
CLASSIFICATION MODELS RESULT:

.........Logistic Regression........
Accuracy: 0.7486301369863013
Confusion matrix: [[1029   38]
 [ 329   64]]
Classification reports:               precision    recall  f1-score   support

           0       0.76      0.96      0.85      1067
           1       0.63      0.16      0.26       393

    accuracy                           0.75      1460
   macro avg       0.69      0.56      0.55      1460
weighted avg       0.72      0.75      0.69      1460

.........Decision Tree........
Accuracy: 0.9924657534246575
Confusion matrix: [[1058    9]
 [   2  391]]
Classification reports:               precision    recall  f1-score   support

           0       1.00      0.99      0.99      1067
           1       0.98      0.99      0.99       393

    accuracy                           0.99      1460
   macro avg       0.99      0.99      0.99      1460
weighted avg       0.99      0.99     