In [1]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import joblib

In [2]:
LABEL_FILE = './data/labelHistory.csv'
FULL_DATAFILE = './data/cleanHistory.csv'
OUTPUTFILE = './data/fullClassified.csv'

MODEL = 'models/classifier_3.pth'
VECTORIZER = 'models/vectorizer_3.pkl'

In [3]:
torch.manual_seed(50)
np.random.seed(50)

In [4]:
def loadAndPrepData():
    labeledDf = pd.read_csv(LABEL_FILE)
    fullDf = pd.read_csv(FULL_DATAFILE)

    labeledDf['text'] = labeledDf['title']+" "+labeledDf['channel'].fillna('')
    fullDf['text'] = fullDf['title']+" "+fullDf['channel'].fillna('')

    return labeledDf, fullDf

In [5]:
def vectorizeData(labledDf, fullDf):
    allText = pd.concat([labledDf['text'], fullDf['text']])

    vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
    vectorizer.fit(allText)

    X_labeled = vectorizer.transform(labledDf['text']).toarray()
    y_labeled = labledDf['label'].values-1

    return X_labeled, y_labeled, vectorizer

In [6]:
class WatchHistoryClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(WatchHistoryClassifier, self).__init__()

        self.main = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        )
    def forward(self, x):
        return self.main(x)

In [7]:
def train(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)


    classWeight = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    weightTenson = torch.tensor(classWeight, dtype=torch.float)
    print("Class Weights:", classWeight, '\n\n')

    # Tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.LongTensor(y_train)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.LongTensor(y_val)

    # Model
    model = WatchHistoryClassifier(input_dim=X.shape[1], num_classes=4)
    criterion = nn.CrossEntropyLoss(weight=weightTenson)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    print("Starting Training...\n")
    for epoch in range(100):
        optimizer.zero_grad()
        outputs = model(X_train_t)
        loss = criterion(outputs, y_train_t)
        loss.backward()
        optimizer.step()

        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')
    
    print("\n\nValidation Report:")
    with torch.no_grad():
        valOut = model(X_val_t)
        _, predicted = torch.max(valOut, 1)
        print("\nValidation Report:")
        targetNames = ['Learning', 'Entertainment', 'Music',  'Others']
        presentClasses = sorted(list(set(y_val) | set(predicted.numpy())))
        presentNames = [targetNames[i] for i in presentClasses]
        print(classification_report(y_val, predicted, target_names=presentNames, zero_division=0))
    return model

In [8]:
def predictFullHistory(model, vectorizer, fullDf):
    X_full = vectorizer.transform(fullDf['text']).toarray()
    X_full_t = torch.FloatTensor(X_full)

    model.eval()
    with torch.no_grad():
        outputs = model(X_full_t)
        _, predicted = torch.max(outputs, 1)

    CATEGORY = {
        0: 'Learning',
        1: 'Entertainment',
        2: 'Music',
        3: 'Others'
    }

    fullDf['category'] = [CATEGORY[p.item()] for p in predicted]

    return fullDf

In [9]:
labledDf, fullDf = loadAndPrepData()

In [10]:
X_labeled, y_labeled, vectorizer = vectorizeData(labledDf, fullDf)

In [11]:
model = train(X_labeled, y_labeled)

Class Weights: [0.81851852 0.87698413 0.75684932 3.15714286] 


Starting Training...

Epoch [10/100], Loss: 0.7723
Epoch [20/100], Loss: 0.1167
Epoch [30/100], Loss: 0.0234
Epoch [40/100], Loss: 0.0129
Epoch [50/100], Loss: 0.0094
Epoch [60/100], Loss: 0.0076
Epoch [70/100], Loss: 0.0094
Epoch [80/100], Loss: 0.0080
Epoch [90/100], Loss: 0.0110
Epoch [100/100], Loss: 0.0050


Validation Report:

Validation Report:
               precision    recall  f1-score   support

     Learning       0.93      0.79      0.86        34
Entertainment       0.69      0.78      0.74        32
        Music       0.88      0.95      0.91        37
       Others       0.67      0.50      0.57         8

     accuracy                           0.82       111
    macro avg       0.79      0.76      0.77       111
 weighted avg       0.83      0.82      0.82       111



In [12]:
torch.save(model.state_dict(), MODEL)

In [13]:
joblib.dump(vectorizer, VECTORIZER)

['models/vectorizer_3.pkl']

In [14]:
finalDf = predictFullHistory(model, vectorizer, fullDf)
finalDf

Unnamed: 0,title,date,video_url,channel,text,category
0,Ara Ke Othlali Mein - Slowed And Reverb | Pawa...,2026-01-29 17:22:46.857000+00:00,https://www.youtube.com/watch?v=MHQ7YSO28ZY,Lofi Roxx,Ara Ke Othlali Mein - Slowed And Reverb | Pawa...,Music
1,"Sajde - (16D Audio ""Not 8D"") | Faheem Abdullah...",2026-01-29 17:20:19.647000+00:00,https://www.youtube.com/watch?v=NY4PJAuqCMY,ROYAL JAAT ♪,"Sajde - (16D Audio ""Not 8D"") | Faheem Abdullah...",Music
2,"Saiyaara Title Song (16D Audio ""Not 8D"") | Tan...",2026-01-29 17:14:05.588000+00:00,https://www.youtube.com/watch?v=nNcrLdmuTg8,ROYAL JAAT ♪,"Saiyaara Title Song (16D Audio ""Not 8D"") | Tan...",Music
3,"Barbaad (16D Audio ""Not 8D"") | Saiyaara | Ahaa...",2026-01-29 17:08:03.933000+00:00,https://www.youtube.com/watch?v=OK2cQg7xbQc,ROYAL JAAT ♪,"Barbaad (16D Audio ""Not 8D"") | Saiyaara | Ahaa...",Music
4,"GURU RANDHAWA - DOPAMINE (16D Audio ""Not 8D"") ...",2026-01-29 17:05:23.447000+00:00,https://www.youtube.com/watch?v=TRfQMVuMzHs,ROYAL JAAT ♪,"GURU RANDHAWA - DOPAMINE (16D Audio ""Not 8D"") ...",Music
...,...,...,...,...,...,...
7594,SOFTLY (Official Music Video) KARAN AUJLA | IK...,2025-02-01 17:24:18.967000+00:00,https://music.youtube.com/watch?v=cWMxCE2HTag,Karan Aujla,SOFTLY (Official Music Video) KARAN AUJLA | IK...,Music
7595,Parshawan - Harnoor (Official Video) Gifty | J...,2025-02-01 17:24:10.877000+00:00,https://music.youtube.com/watch?v=jn77BhLMGc8,Legacy Records,Parshawan - Harnoor (Official Video) Gifty | J...,Music
7596,The PropheC - Kina Chir | Official Video | Lat...,2025-02-01 17:14:32.510000+00:00,https://music.youtube.com/watch?v=Wa6it7j_OHY,The PropheC,The PropheC - Kina Chir | Official Video | Lat...,Music
7597,Shubh - One Love (Official Audio),2025-02-01 17:11:53.397000+00:00,https://music.youtube.com/watch?v=0pWsCiBvLOk,SHUBH,Shubh - One Love (Official Audio) SHUBH,Music
