In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.optimizers import Adam, SGD

## Data Exploration

In [64]:
# Load Dataset
df = pd.read_csv('data/netflix_titles.csv')

In [65]:
# Show info about Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [66]:
# Summary of Missing Values
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

### Filling Missing Values

Used **mode** approach for filling missing values on columns <code>(rating and duration)</code> because they are categorical values

In [67]:
df['director'].fillna('Unknown Director', inplace=True)
df['cast'].fillna('Unknown Cast', inplace=True)
df['country'].fillna('Unknown Country', inplace=True)
df['date_added'].fillna('Unknown date', inplace=True)
df['rating'].fillna(df['rating'].mode()[0], inplace=True)
df['duration'].fillna(df['duration'].mode()[0], inplace=True)

# Save the cleaned csv file
new_file_path = 'data/cleaned_netflix_titles.csv'
df.to_csv(new_file_path, index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director'].fillna('Unknown Director', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('Unknown Cast', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [69]:
file_path = 'data/cleaned_netflix_titles.csv'
cleaned_df = pd.read_csv(file_path)
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


## Data Preprocessing

In [70]:
# Load the dataset
preprocessed_df = pd.read_csv('data/cleaned_netflix_titles.csv')

# Convert the 'duration' column to numerical format
preprocessed_df['duration'] = preprocessed_df['duration'].str.extract(r'(\d+)').astype(float)

# Normalize the 'duration' column
scaler = MinMaxScaler()
preprocessed_df['duration'] = scaler.fit_transform(preprocessed_df[['duration']])

# One-hot encode categorical variables (including genre if needed)
preprocessed_df = pd.get_dummies(preprocessed_df, columns=['country', 'listed_in'], drop_first=True)

In [71]:
# Define features and target variable
X = preprocessed_df.drop(columns=['rating', 'show_id', 'title', 'director', 'cast', 'date_added', 'description'])
y = preprocessed_df['rating']

# Convert target variable to categorical
y = pd.factorize(y)[0]  # Convert ratings to numerical values
y = to_categorical(y)    # One-hot encode the target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the shape
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (7045, 1264)
Testing set shape: (1762, 1264)


### Model Design

In [1]:
# Function to build the neural network model
def build_model(input_shape, num_classes, layers=2, neurons=64, optimizer='adam'):
    model = Sequential()
    
    # Input layer
    model.add(Dense(neurons, input_shape=(input_shape,), activation='relu'))  # First hidden layer
    
    # Additional hidden layers
    for _ in range(layers - 1):
        model.add(Dense(neurons, activation='relu'))  # Hidden layers with ReLU activation
    
    # Output layer (softmax for multi-class classification)
    model.add(Dense(num_classes, activation='softmax'))
    
    # Compile the model with the specified optimizer
    if optimizer == 'adam':
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    elif optimizer == 'sgd':
        model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
    else:
        raise ValueError("Optimizer not supported. Use 'adam' or 'sgd'.")
    
    return model

In [73]:
# Specify hyperparameters
num_layers = 3           # Number of hidden layers
neurons_per_layer = 128  # Number of neurons in each hidden layer
optimizer_choice = 'adam'  # Optimizer to use
epochs = 50              # Number of epochs for training
batch_size = 32          # Batch size for training

# Build the model
model = build_model(X_train.shape[1], y_train.shape[1], layers=num_layers, 
                    neurons=neurons_per_layer, optimizer=optimizer_choice)

# Train the model
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the model performance
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: could not convert string to float: 'Movie'