In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re

# Load the data
data = pd.read_csv('./netflix_titles.csv')

# Drop unnecessary columns
data.drop(['show_id', 'title', 'date_added'], axis=1, inplace=True)

# Label encode 'type' column
le = LabelEncoder()
data['type'] = le.fit_transform(data['type'])

# Convert 'duration' to numeric
data['duration'] = data['duration'].str.split(' ', expand=True)[0]
data['duration'] = pd.to_numeric(data['duration'])

# Normalize 'release_year'
scaler = MinMaxScaler()
data['release_year'] = scaler.fit_transform(data['release_year'].values.reshape(-1,1))

# Feature hashing for high cardinality categorical features
n_features = 100
hasher = FeatureHasher(n_features=n_features, input_type='string')

for column in ['director', 'cast', 'country', 'listed_in']:
    # Replace NaN values with the string "missing"
    data[column] = data[column].fillna('missing')
    # Split the strings into lists of words
    data[column] = data[column].str.split()
    hashed_features = hasher.transform(data[column])
    hashed_features = pd.DataFrame(hashed_features.toarray())
    hashed_features.columns = [f"{column}_{i}" for i in range(n_features)]
    data = pd.concat([data, hashed_features], axis=1)
    data.drop(column, axis=1, inplace=True)

# Text preprocessing and TF-IDF for 'description'
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        text = " ".join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)
    else:
        text = ""
    return text

data['description'] = data['description'].apply(preprocess_text)
vectorizer = TfidfVectorizer(max_features=100)
tfidf_features = vectorizer.fit_transform(data['description'])
tfidf_features = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())
data.reset_index(drop=True, inplace=True)
tfidf_features.reset_index(drop=True, inplace=True)
data = pd.concat([data, tfidf_features], axis=1)
data.drop('description', axis=1, inplace=True)

# Label encode 'rating' column
data['rating'] = le.fit_transform(data['rating'])

# Split into features (X) and target (y)
X = data.drop(['rating'], axis=1)
y = data['rating']

# Fill NaN values with zeros
X.fillna(0, inplace=True)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           2       0.00      1.00      0.00         0
           3       1.00      0.08      0.15        12
           5       1.00      0.00      0.00        16
           6       0.60      0.58      0.59        62
           7       0.51      0.30      0.38        87
           8       0.55      0.61      0.58       163
           9       0.50      0.49      0.50       414
          10       0.50      0.07      0.12        43
          11       0.60      0.78      0.68       662
          12       0.44      0.16      0.23       185
          13       0.62      0.77      0.68        52
          14       0.62      0.65      0.63        65

    accuracy                           0.57      1762
   macro avg       0.61      0.42      0.35      1762
weighted avg       0.56      0.57      0.54      1762



In [13]:
# Create a new DataFrame for the movie
new_movie = pd.DataFrame({
    'type': ['1'],
    'director': ['Peter Jackson'],
    'cast': ['Ian McKellen, Martin Freeman, Richard Armitage'],
    'country': ['New Zealand, United States'],
    'release_year': [2012],
    'duration': ['169 min'],
    'listed_in': ['Action & Adventure, International Movies, Sci-Fi & Fantasy'],
    'description': ['A reluctant hobbit, Bilbo Baggins, sets out to the Lonely Mountain with a spirited group of dwarves to reclaim their mountain home - and the gold within it - from the dragon Smaug.']
})

# Preprocess the new entry in the same way as the training data
new_movie['type'] = le.transform(new_movie['type'])
new_movie['duration'] = new_movie['duration'].str.split(' ', expand=True)[0]
new_movie['duration'] = pd.to_numeric(new_movie['duration'])
new_movie['release_year'] = scaler.transform(new_movie['release_year'].values.reshape(-1,1))

for column in ['director', 'cast', 'country', 'listed_in']:
    new_movie[column] = new_movie[column].str.split(',')
    hashed_features = hasher.transform(new_movie[column])
    hashed_features = pd.DataFrame(hashed_features.toarray())
    hashed_features.columns = [f"{column}_{i}" for i in range(n_features)]
    new_movie = pd.concat([new_movie, hashed_features], axis=1)
    new_movie.drop(column, axis=1, inplace=True)

new_movie['description'] = new_movie['description'].apply(preprocess_text)
tfidf_features = vectorizer.transform(new_movie['description'])
tfidf_features = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())
new_movie.reset_index(drop=True, inplace=True)
tfidf_features.reset_index(drop=True, inplace=True)
new_movie = pd.concat([new_movie, tfidf_features], axis=1)
new_movie.drop('description', axis=1, inplace=True)

# Use the model to make a prediction
predicted_rating = rf.predict(new_movie)

# Print the predicted rating
print(le.inverse_transform(predicted_rating))

ValueError: y contains previously unseen labels: '1'

In [8]:
print(data['type'].unique())

[0 1]


In [9]:
print(le.classes_)

['66 min' '74 min' '84 min' 'G' 'NC-17' 'NR' 'PG' 'PG-13' 'R' 'TV-14'
 'TV-G' 'TV-MA' 'TV-PG' 'TV-Y' 'TV-Y7' 'TV-Y7-FV' 'UR' nan]


In [11]:
from sklearn.preprocessing import LabelEncoder

# Create a new LabelEncoder
le_type = LabelEncoder()

# Fit the LabelEncoder on the 'type' column
le_type.fit(data['type'])

# Print the original classes
print(le_type.classes_)

[0 1]


In [6]:
for i in df:
    print(df[i])

show_id
type
title
director
cast
country
date_added
release_year
rating
duration
listed_in
description


In [7]:
df.isnull().any()
df.dropna(inplace=True)
df.isnull().any()

show_id         False
type            False
title           False
director         True
cast             True
country          True
date_added       True
release_year    False
rating           True
duration         True
listed_in       False
description     False
dtype: bool

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter

In [15]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MichaelWong\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MichaelWong\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MichaelWong\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [25]:
data = pd.read_csv('./netflix_titles.csv')

In [20]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MichaelWong\AppData\Roaming\nltk_data...


True

In [21]:
# Preprocess the 'description' column
data['description'] = data['description'].apply(preprocess_text)

# Get the most common words
common_words = Counter([word for description in data['description'] for word in description]).most_common(10)

print(common_words)

[('life', 1063), ('young', 728), ('family', 709), ('new', 699), ('woman', 661), ('find', 654), ('friend', 626), ('world', 566), ('love', 553), ('man', 519)]


In [26]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stop words from sentences
def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

# Remove the stop words
data['description'] = data['description'].apply(remove_stopwords)

# Find the most common words now
words = [word for description in data['description'] for word in description]
counter = Counter(words)
print(counter.most_common(10))

[(' ', 201520), ('e', 118696), ('n', 74543), ('r', 70776), ('h', 48525), ('l', 41656), ('c', 30530), ('u', 27766), ('f', 24069), ('g', 23912)]


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./netflix_titles.csv')

In [6]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [7]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [9]:
df.dropna(inplace=True)

In [10]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...
24,s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"September 21, 2021",1998,TV-14,166 min,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...
...,...,...,...,...,...,...,...,...,...,...,...,...
8801,s8802,Movie,Zinzana,Majid Al Ansari,"Ali Suliman, Saleh Bakri, Yasa, Ali Al-Jabri, ...","United Arab Emirates, Jordan","March 9, 2016",2015,TV-MA,96 min,"Dramas, International Movies, Thrillers",Recovering alcoholic Talal wakes up inside a s...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [11]:
df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64