In [27]:
import warnings
warnings.filterwarnings("ignore")

In [28]:
import os
import gc
import regex as re
import itertools
import numpy as np
import pandas as pd
import codecs,string
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn import model_selection
from tqdm import tqdm_notebook as tqdm
from sklearn import metrics, preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

gc.enable()

In [29]:
class config:
    SEED = 3407
    USE_TfIdf = True
    MAX_FEATURES = 2500

# Utils

In [30]:
# arabic
def is_arabic(character):
    maxchar = max(character)
    if u'\u0627' <= maxchar <= u'\u064a':
        return True
    else:
        return False
    
#korean
def is_hangul(value):
    if re.search(r'\p{IsHangul}', value):
        return True
    return False

#hindi
def is_hindi(character):
    maxchar = max(character)
    if u'\u0900' <= maxchar <= u'\u097f':
        return True
    else:
        return False

In [31]:
def wrangle_data(train_data, test_data):
    test_data['is_test_data'] = 1
    train_data['is_test_data'] = 0
    data = pd.concat([train_data, test_data]).reset_index(drop=True)

    # cat_cols = ['video_id', 'channelId', 'categoryId', 'comments_disabled']
    num_cols = ['duration_seconds']
    date_cols = ['publishedAt', 'trending_date']
    drop_cols = ['id']
    
    
    # ------- Date Time cols ------ 
    print('Date time cols...')
    for col in date_cols:
        data[col] = pd.to_datetime(data[col], utc=True)
        data.loc[:, col + '_year'] = (data[col].dt.year).astype('category')
        data.loc[:, col + '_weekofyear'] = (data[col].dt.isocalendar().week).astype('category')
        data.loc[:, col + '_month'] = (data[col].dt.month).astype('category')
        data.loc[:, col + '_dayofweek'] = (data[col].dt.dayofweek).astype('category')
        data.loc[:, col + '_weekend'] = ((data[col].dt.weekday >=5).astype(int)).astype('category')
        drop_cols.append(col)
    

    data['video_age'] = (data['trending_date'] - data['publishedAt'])\
                        .dt.days.astype('int')\
                        .replace({-1: 0})
   
    
    # ------- Text columns -------
    print('Text stuff')
    data['text'] = data['channelTitle']\
            + ' ' + data['title'] \
            + ' ' + data['description'].fillna(' ').apply(lambda x: re.sub(r'http\S+', ' ', x))\
            + ' ' + data['tags'].apply(lambda x: x.replace('|', ' ').replace('[None]', ''))
    
    data['num_words'] = data['text'].apply(lambda x: len(x.split()))
    data['num_characters'] = data['text'].apply(lambda x: len(x))
    
    drop_cols += ['channelTitle', 'title', 'description', 'tags']
    drop_cols += ['thumbnail_link', 'has_thumbnail']
    drop_cols += ['view_count', 'likes', 'dislikes', 'comment_count',]

    #  ----------- New Features ---------------------------------------
    data['Friday_Trending'] = [1 if a == 4 else 0 for a in data.trending_date_dayofweek]
    data['Friday_Published'] = [1 if a == 4 else 0 for a in data.publishedAt_dayofweek]
    data['Sunday_Published'] = [1 if a == 6 else 0 for a in data.publishedAt_dayofweek]
    data['isArabic'] = [is_arabic(a) for a in data.tags]
    data['isKorean'] = [is_hangul(a) for a in data.tags]
    data['isHindi'] = [is_hindi(a) for a in data.tags]

    data.drop(drop_cols, axis=1, inplace=True)
    
    train_data = data[data['is_test_data'] == 0].reset_index(drop=True)
    test_data = data[data['is_test_data'] == 1].reset_index(drop=True)

    train_data.drop(['is_test_data'], axis=1, inplace=True)
    test_data.drop(['is_test_data'], axis=1, inplace=True)


    # scaling
    for col in num_cols:
        
        col_median = train_data[col].median()
        train_data[col] = train_data[col].fillna(col_median)
        test_data[col] = test_data[col].fillna(col_median)

        scaler = MinMaxScaler()
        train_data[col] = scaler.fit_transform(train_data[[col]])  # Fit and transform on train data
        test_data[col] = scaler.transform(test_data[[col]])  # Only transform on test data

    # label encoding
    cat_cols = train_data.select_dtypes(include=['category', 'object']).columns.tolist()
    cat_cols.remove('text')
    for col in cat_cols:
        
        train_data[col].fillna('NONE', inplace=True)
        known_categories = train_data[col].unique().tolist() + ['Other']
    
        # Convert train data
        le = LabelEncoder()
        train_data[col] = le.fit(known_categories).transform(train_data[col])  # Fit on known categories
        
        # Convert test data, mapping unseen labels to 'Other'
        test_data[col] = test_data[col].apply(lambda x: x if x in known_categories else 'Other')
        test_data[col] = le.transform(test_data[col])  # Transform using the fitted encoder


    return train_data, test_data

In [32]:
def vectorize(train_text, test_text):
    vectorizer = TfidfVectorizer(max_features=config.MAX_FEATURES, ngram_range=(1, 2), stop_words='english')
    
    train_vectors = vectorizer.fit_transform(train_text).toarray() 
    test_vectors = vectorizer.transform(test_text).toarray()
    
    return train_vectors, test_vectors

In [33]:
def preprocess_data(train, test):

    train, test = wrangle_data(train, test)
    
    train_text = train.text.tolist()
    test_text = test.text.tolist()

    train.drop(['text',], axis=1, inplace=True)
    test.drop(['text',], axis=1, inplace=True)

    if config.USE_TfIdf:
        print('vectorizing...')
        
        train_vectors, test_vectors = vectorize(train_text, test_text)
        
        pca = decomposition.PCA(n_components=50, random_state=config.SEED)
        print("Fitting PCA...")
        pca.fit(train_vectors)
        train_projection = pca.transform(train_vectors)
        test_projection = pca.transform(test_vectors)
        
        train_vectors = pd.DataFrame(train_projection)
        train_vectors.columns = [f'cat_{i}' for i in range(train_vectors.shape[1])]

        test_vectors = pd.DataFrame(test_projection)
        test_vectors.columns = [f'cat_{i}' for i in range(test_vectors.shape[1])]
        
        train = pd.concat([train, train_vectors], axis=1)
        test = pd.concat([test, test_vectors], axis=1)
        
        print('done!')
    
        drop_cols = ['video_id','comments_disabled']
        train.drop(drop_cols, axis=1, inplace=True)
        test.drop(drop_cols, axis=1, inplace=True)

    return train, test

In [34]:
def train_loop(dataloader, model, loss_fn, optimizer):

    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):

    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    print(f"Avg validation loss: {test_loss:>8f}")


# Function to train and evaluate the model
def train_and_evaluate(dropout_rate=0.0, use_batchnorm=False, lr=1e-3, num_layers=3, epochs=10):
    model = NeuralNetwork(dropout_rate, use_batchnorm, num_layers).to(device)
    loss_fn = nn.L1Loss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # early stopping
    best_loss = float("inf")
    patience = 3
    patience_counter = 0


    start_time = time.time()
    for epoch in range(epochs):

        # Training
        model.train()
        for batch, (X, y) in enumerate(train_dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


        # Evaluation
        model.eval()
        total_loss = 0
        with torch.no_grad():
            for X, y in validation_dataloader:
                X, y = X.to(device), y.to(device)
                pred = model(X)
                total_loss += loss_fn(pred, y).item()
        
        avg_validation_loss = total_loss / len(validation_dataloader)

        # print(f"Epoch {epoch+1}\n-------------------------------")
        # print(f"Avg validation loss: {avg_validation_loss:>8f}")

        # Early stopping check
        if avg_validation_loss < best_loss:
            best_loss = avg_validation_loss
            patience_counter = 0  # Reset patience counter
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break


    training_time = time.time() - start_time

    
    
    return best_loss, round(training_time,3)

# Loading and Preprocessing data

In [35]:
df = pd.read_parquet('train.parquet')#.sample(500)
df = df[df['ratings_disabled'] == 0].reset_index(drop=True)
df.drop(['ratings_disabled'], axis=1, inplace=True)
df = df.sort_values(by='trending_date').drop_duplicates(subset=['video_id'], keep='first')

# splitting data into train and test
combined_train, test = train_test_split(df, test_size=0.2, random_state=42)

# splitting train further into train and validation
train, validation = train_test_split(combined_train, test_size=0.2, random_state=42)

In [36]:
print('Processing combined train and final testing data')
combined_train, test = preprocess_data(combined_train, test)
print('*'*20)
print('Processing train and validation data')
train, validation = preprocess_data(train, validation)

Processing combined train and final testing data
Date time cols...
Text stuff
vectorizing...
Fitting PCA...
done!
********************
Processing train and validation data
Date time cols...
Text stuff
vectorizing...
Fitting PCA...
done!


<hr>

# **Step 1: Define Your Deep Learning Problem**


#### **Problem Statement**

YouTube is one of the largest video-sharing platforms, where content creators rely on audience
engagement metrics such as likes, views, and comments to measure their success. Deep
learning can analyze large-scale YouTube metadata, video features, and historical engagement
trends to build a predictive model that estimates the number of likes a video will receive. This
can help creators optimize their content strategy, improve audience engagement, and increase
monetization opportunities. Coincidentally, this is also a useful tool for brands to analyze and
strategically place their YouTube ads.

Based on previous EDA and a high level reasoning of what factors could potentially influence user engagement, the following states the task and selected features:

- Task
    - Regression
    
- Target
    - Like to view ratio
    
- Features
    - Video category
    - Duration of video
    - Channel name
    - Datetime of publishing (and other features engineered from this)
    - Age of video at the time of predicting
    - length of description
    - language of description
    - description text (converted into tfidf vectors)




While the dataloader remains same from previous submission, the data has been preprocessed (scaling, label encoding, PCA) to ensure effective training.

# **Step 2: Train a Neural Network in PyTorch**


In [37]:
from torch.utils.data import Dataset

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


import torch.optim as optim
import time

import optuna
from optuna.integration import PyTorchLightningPruningCallback

### Creating Dataset class and defining dataloader

In [38]:
class CustomDataset(Dataset):
    def __init__(self, df):
        df = df.astype(np.float32) 

        self.features = df.drop(columns=['target']).values.astype(np.float32)   # Extract features
        self.targets = df['target'].values.astype(np.float32)   # Extract target values

        # Convert to torch tensors
        self.features = torch.tensor(self.features, dtype=torch.float32)
        self.targets = torch.tensor(self.targets, dtype=torch.float32).view(-1, 1)  # Reshape for compatibility

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [39]:
combined_train_dataset = CustomDataset(combined_train)
train_dataset = CustomDataset(train)
validation_dataset = CustomDataset(validation)
test_dataset = CustomDataset(test)

In [40]:
combined_train_dataloader = DataLoader(combined_train_dataset, batch_size=64)
train_dataloader = DataLoader(train_dataset, batch_size=64)
validation_dataloader = DataLoader(validation_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=64)

### Defining the Model

In [41]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(72, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork()

### Defining loss function, optimizer and other hyperparameters

In [42]:
learning_rate = 1e-3
batch_size = 64
epochs = 5

loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

### Training and Testing model

In [43]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(validation_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 28.999962  [   64/10808]
loss: 0.087681  [ 6464/10808]
Avg validation loss: 0.043586
Epoch 2
-------------------------------
loss: 0.040303  [   64/10808]
loss: 0.033028  [ 6464/10808]
Avg validation loss: 0.041472
Epoch 3
-------------------------------
loss: 0.036227  [   64/10808]
loss: 0.032326  [ 6464/10808]
Avg validation loss: 0.040080
Epoch 4
-------------------------------
loss: 0.034686  [   64/10808]
loss: 0.031779  [ 6464/10808]
Avg validation loss: 0.039947
Epoch 5
-------------------------------
loss: 0.034919  [   64/10808]
loss: 0.031091  [ 6464/10808]
Avg validation loss: 0.039108
Done!


# **Step 2 continued: Try Stuff**

In [44]:
class NeuralNetwork(nn.Module):
    def __init__(self, dropout_rate=0.0, use_batchnorm=False, num_layers=3):
        super().__init__()
        self.flatten = nn.Flatten()

        layers = []
        input_size = 72
        hidden_size = 512

        for i in range(num_layers):
            layers.append(nn.Linear(input_size if i == 0 else hidden_size, hidden_size))
            if use_batchnorm:
                layers.append(nn.BatchNorm1d(hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))

        layers.append(nn.Linear(hidden_size, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        x = self.flatten(x)
        return self.model(x)



# Experiment settings
epochs = 10
batch_size = 64
device = "cuda" if torch.cuda.is_available() else "cpu"

dropout_rates = [0.0, 0.2, 0.5]
learning_rates = [1e-3, 1e-2, 1e-4]
batchnorm_options = [False, True]
no_of_layers = [2, 4, 6]

# DataFrames to store results
dropout_results = []
batchnorm_results = []
lr_results = []
layer_results = []

## Experiment with different dropout rates

In [45]:
# Run Dropout Experiment
for rate in dropout_rates:
    avg_loss, _ = train_and_evaluate(dropout_rate=rate)
    dropout_results.append({"Dropout Rate": rate, "Avg Validation Loss": avg_loss})
    
df_dropout = pd.DataFrame(dropout_results)
df_dropout

Unnamed: 0,Dropout Rate,Avg Validation Loss
0,0.0,0.039414
1,0.2,0.037569
2,0.5,0.037652


Even at 0.5 dropout rate, the model loss remains similar as that at 0.0 dropout. This means that there are many redundant neurons that contribute much to final prediction. In this case, using a lighter and sparse model gives similar performance as a fully connected neural network.

## Experiment with and without batch normalization

In [46]:
# Run Batch Normalization Experiment
for use_bn in batchnorm_options:
    avg_loss, train_time = train_and_evaluate(use_batchnorm=use_bn)
    batchnorm_results.append({"BatchNorm": use_bn, "Avg Validation Loss": avg_loss, "Training Time": train_time})

df_batchnorm = pd.DataFrame(batchnorm_results)
df_batchnorm

Early stopping at epoch 7


Unnamed: 0,BatchNorm,Avg Validation Loss,Training Time
0,False,0.038311,1.914
1,True,0.043508,1.9


There is marginal increase in validation loss when using batch normalization. The training time remains same.

## Experiment with different learning rates

In [47]:
# Run Learning Rate Experiment
for lr in learning_rates:
    avg_loss, train_time = train_and_evaluate(lr=lr)
    lr_results.append({"Learning Rate": lr, "Avg Validation Loss": avg_loss, "Training Time": train_time})

df_lr = pd.DataFrame(lr_results)
df_lr

Early stopping at epoch 9
Early stopping at epoch 3


Unnamed: 0,Learning Rate,Avg Validation Loss,Training Time
0,0.001,0.038203,1.726
1,0.01,inf,0.557
2,0.0001,0.067328,1.917


Using a larger learning rate speeds up the training time. Using an extremely small learning rate, the model did not converge within the specified epochs.

## Experiment with number of hidden layers

In [48]:
for num_layers in no_of_layers:
    avg_loss, train_time = train_and_evaluate(num_layers=num_layers)
    layer_results.append({"Num Layers": num_layers, "Avg Validation Loss": avg_loss, "Training Time": train_time})

df_layers = pd.DataFrame(layer_results)
df_layers

Early stopping at epoch 10


Unnamed: 0,Num Layers,Avg Validation Loss,Training Time
0,2,0.038223,1.273
1,4,0.039262,2.516
2,6,0.037656,3.807


Increasing the num layers increases training time while the final validation loss remains same. Maybe for this use case and data, a smaller model is enough to capture the patterns and explain the variance

## **Step 3: Hyperparameter Optimization with Optuna**

### Define an objective function to be minimized.

In [49]:
def objective(trial):
    # Sample hyperparameters
    dropout_rate = trial.suggest_float(name="dropout_rate", low=0.0, high=0.5, step=0.1)
    use_batchnorm = trial.suggest_categorical("use_batchnorm", [True, False])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
    num_layers = trial.suggest_int("num_layers", 2, 7)

    model = NeuralNetwork(dropout_rate, use_batchnorm, num_layers).to(device)
    loss_fn = nn.L1Loss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Training
    for epoch in range(epochs):
        model.train()
        for batch, (X, y) in enumerate(train_dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # Evaluation for pruning
        model.eval()
        total_loss = 0
        with torch.no_grad():
            for X, y in validation_dataloader:
                X, y = X.to(device), y.to(device)
                pred = model(X)
                total_loss += loss_fn(pred, y).item()
        
        avg_validation_loss = total_loss / len(validation_dataloader)

        # Report intermediate loss and enable pruning
        trial.report(avg_validation_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return avg_validation_loss

### Create a study object and optimize the objective function

In [50]:
# Run Optuna Study
epochs = 5
batch_size = 64
device = "cuda" if torch.cuda.is_available() else "cpu"

study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=15)

df_trials = pd.DataFrame([
    {**trial.params, "Validation Loss": trial.value, "Pruned": trial.state == optuna.trial.TrialState.PRUNED}
    for trial in study.trials
])

[I 2025-03-14 20:59:02,189] A new study created in memory with name: no-name-300f34ac-8005-4e64-897d-2d8a819fcab1
[I 2025-03-14 20:59:03,078] Trial 0 finished with value: 0.03761253644560659 and parameters: {'dropout_rate': 0.30000000000000004, 'use_batchnorm': False, 'learning_rate': 0.0009613188891460426, 'num_layers': 2}. Best is trial 0 with value: 0.03761253644560659.
[I 2025-03-14 20:59:05,008] Trial 1 finished with value: 0.03730190255094406 and parameters: {'dropout_rate': 0.0, 'use_batchnorm': False, 'learning_rate': 0.0055831672423104104, 'num_layers': 6}. Best is trial 1 with value: 0.03730190255094406.
[I 2025-03-14 20:59:07,692] Trial 2 finished with value: 0.04546715050589207 and parameters: {'dropout_rate': 0.4, 'use_batchnorm': False, 'learning_rate': 0.00030953866577489305, 'num_layers': 6}. Best is trial 1 with value: 0.03730190255094406.
[I 2025-03-14 20:59:10,991] Trial 3 finished with value: 0.05744081025206765 and parameters: {'dropout_rate': 0.30000000000000004, 

### Best parameters

In [51]:
# Print the best hyperparameter configuration
df_trials.index.names = ['Trial']
print('All Trials')
display(df_trials)
print("\nBest Trial:")
print(study.best_trial.params)

All Trials


Unnamed: 0_level_0,dropout_rate,use_batchnorm,learning_rate,num_layers,Validation Loss,Pruned
Trial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3,False,0.000961,2,0.037613,False
1,0.0,False,0.005583,6,0.037302,False
2,0.4,False,0.00031,6,0.045467,False
3,0.3,True,0.000131,6,0.057441,False
4,0.3,False,0.008543,7,0.037643,False
5,0.3,False,0.002297,6,0.037511,False
6,0.3,False,0.00081,3,0.056433,True
7,0.4,False,0.000783,5,0.079076,True
8,0.0,True,0.00012,2,0.044388,True
9,0.4,False,0.003438,6,0.037569,False



Best Trial:
{'dropout_rate': 0.0, 'use_batchnorm': False, 'learning_rate': 0.0055831672423104104, 'num_layers': 6}


## **Step 3 continued: Insights**

Using Optuna, I was able to run 14 trials within a short span. Some of the non effective trials were also pruned, thus saving more experimentation time.

## **Step 4: Final Training**


In [52]:
# Extract the best parameters from Optuna
best_params = study.best_trial.params
print("\nBest Hyperparameters:", best_params)

# Define and train the final model with the best hyperparameters
final_model = NeuralNetwork(
    dropout_rate=best_params["dropout_rate"],
    use_batchnorm=best_params["use_batchnorm"],
    num_layers=best_params["num_layers"]
).to(device)

loss_fn = nn.L1Loss()
optimizer = optim.SGD(final_model.parameters(), lr=best_params["learning_rate"])

# Training final model on the combined dataset
epochs = 10
print("\nTraining final model...")
for epoch in range(epochs):
    final_model.train()
    for batch, (X, y) in enumerate(combined_train_dataloader):
        X, y = X.to(device), y.to(device)
        pred = final_model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluate the final model on the test set
final_model.eval()
total_test_loss = 0
with torch.no_grad():
    for X, y in test_dataloader:
        X, y = X.to(device), y.to(device)
        pred = final_model(X)
        total_test_loss += loss_fn(pred, y).item()

avg_test_loss = total_test_loss / len(test_dataloader)

# Print the final test metric
print(f"\nFinal Model Test Loss: {avg_test_loss:.6f}")



Best Hyperparameters: {'dropout_rate': 0.0, 'use_batchnorm': False, 'learning_rate': 0.0055831672423104104, 'num_layers': 6}

Training final model...

Final Model Test Loss: 0.039236


<hr>