# **Can we build a model that can predict whether a title is a movie or a TV show based on its features?**





In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # or any other model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [4]:
df = pd.read_csv("/netflix_titles.csv")
print(df.head())
print(df.info())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [5]:
print(df.isnull().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


# Preprocess the Data

In [7]:
import pandas as pd

# Load your dataset
df = pd.read_csv("/netflix_titles.csv")

# Drop rows with missing description
df = df.dropna(subset=['description', 'type'])

# Target variable
y = df['type']


# Extract Text Features (TF-IDF from description)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)  # limit to top 1000 words

# Transform the description column into TF-IDF features
X_text = tfidf.fit_transform(df['description'])


# Combine TF-IDF with Other Features (optional)

In [9]:
# Process duration (as in earlier example)
df['duration_mins'] = df['duration'].str.extract('(\d+)').astype(float)
df['release_year'] = df['release_year'].fillna(0)

# Combine text and numeric features
from scipy.sparse import hstack

X_numeric = df[['release_year', 'duration_mins']].fillna(0)
X_all = hstack([X_text, X_numeric])


# Train the Model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9988649262202043
              precision    recall  f1-score   support

       Movie       1.00      1.00      1.00      1214
     TV Show       1.00      1.00      1.00       548

    accuracy                           1.00      1762
   macro avg       1.00      1.00      1.00      1762
weighted avg       1.00      1.00      1.00      1762

