In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# --- Input / output CSV locations -------------------------------------------
# Rows that carry a 'label' column, used for training.
LABEL_FILE = "./data/labelHistory.csv"
# Complete history that gets classified once the model is trained.
FULL_DATAFILE = "./data/cleanHistory.csv"
# Intended destination for the classified history (not referenced by the cells visible here).
OUTPUTFILE = "./data/fullClassified.csv"

# --- Persisted artifacts ----------------------------------------------------
MODEL = "models/classifier_3.pth"        # state_dict of the trained network
VECTORIZER = "models/vectorizer_3.pkl"   # fitted TF-IDF vectorizer (joblib dump)

In [None]:
# Fix both random number generators up front so that weight initialisation,
# dropout masks and any NumPy-based sampling repeat on a fresh kernel run.
np.random.seed(50)      # NumPy global RNG
torch.manual_seed(50)   # PyTorch RNG

In [None]:
def loadAndPrepData(labelFile=None, fullFile=None):
    """Read the labeled and full history CSVs and add a combined 'text' column.

    The 'text' column is "<title> <channel>". Missing titles *and* missing
    channels are replaced by empty strings first; otherwise a single NaN title
    would turn the whole concatenation into NaN, which TfidfVectorizer later
    rejects as an invalid document.

    Args:
        labelFile: Path of the labeled CSV. Defaults to the notebook constant
            LABEL_FILE when None.
        fullFile: Path of the full-history CSV. Defaults to the notebook
            constant FULL_DATAFILE when None.

    Returns:
        (labeledDf, fullDf): the two DataFrames, each with the extra 'text'
        column.

    Raises:
        KeyError: if either CSV lacks a 'title' or 'channel' column.
    """
    labeledDf = pd.read_csv(LABEL_FILE if labelFile is None else labelFile)
    fullDf = pd.read_csv(FULL_DATAFILE if fullFile is None else fullFile)

    for df in (labeledDf, fullDf):
        # astype(str) keeps the concatenation valid even if a column was
        # parsed as a non-string dtype.
        title = df['title'].fillna('').astype(str)
        channel = df['channel'].fillna('').astype(str)
        df['text'] = title + " " + channel

    return labeledDf, fullDf

In [None]:
def vectorizeData(labledDf, fullDf):
    """Fit a TF-IDF vectorizer on all text and encode the labeled rows.

    The vocabulary (at most 2000 terms, English stop words removed) is fitted
    on the 'text' column of both frames stacked together, so terms that only
    occur in the unlabeled history are still part of the feature space.

    Args:
        labledDf: DataFrame with 'text' and 'label' columns.
        fullDf: DataFrame with a 'text' column.

    Returns:
        (X_labeled, y_labeled, vectorizer): dense TF-IDF matrix for the
        labeled rows, the 'label' values shifted down by one, and the fitted
        vectorizer.
    """
    labeledText = labledDf['text']
    corpus = pd.concat([labeledText, fullDf['text']])

    # fit() returns the vectorizer itself, so construction and fitting chain.
    vectorizer = TfidfVectorizer(max_features=2000, stop_words='english').fit(corpus)

    X_labeled = vectorizer.transform(labeledText).toarray()
    # Shift labels down by one (a smallest label of 1 becomes class index 0).
    y_labeled = labledDf['label'].values - 1

    return X_labeled, y_labeled, vectorizer

In [None]:
class WatchHistoryClassifier(nn.Module):
    """Feed-forward classifier with one hidden layer.

    Architecture: Linear(input_dim -> 64) -> ReLU -> Dropout(0.3)
    -> Linear(64 -> num_classes). The output is raw logits (no softmax).

    Args:
        input_dim: number of input features per sample.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()

        hidden_units = 64
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_units, num_classes),
        ]
        # Attribute name and layer order determine the state_dict keys
        # ('main.0.*', 'main.3.*'); keep them stable for saved checkpoints.
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch `x`."""
        logits = self.main(x)
        return logits

In [None]:
def train(X, y, epochs=100, lr=0.01):
    """Train a WatchHistoryClassifier and print a validation report.

    The data is split 80/20 (stratified, random_state=50). Training is
    full-batch with Adam and a class-weighted cross-entropy loss. After
    training, the model is switched to eval mode so that dropout is disabled
    while the validation report is computed; the model is returned in eval
    mode.

    Args:
        X: 2-D array of features, one row per sample.
        y: 1-D array of integer class indices in the range 0..3.
        epochs: number of full-batch optimisation steps (default 100).
        lr: Adam learning rate (default 0.01).

    Returns:
        The trained WatchHistoryClassifier (in eval mode).

    Raises:
        ValueError: if a label lies outside 0..3.
    """
    targetNames = ['Learning', 'Entertainment', 'Music', 'Others']
    numClasses = len(targetNames)

    y = np.asarray(y)
    if y.size and (y.min() < 0 or y.max() >= numClasses):
        raise ValueError(
            f"labels must be class indices in [0, {numClasses - 1}], "
            f"got range [{y.min()}, {y.max()}]"
        )

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=50, stratify=y
    )

    # 'balanced' weights only exist for classes that occur in y_train.
    trainClasses = np.unique(y_train)
    classWeight = compute_class_weight('balanced', classes=trainClasses, y=y_train)
    print("Class Weights:", classWeight, '\n\n')

    # CrossEntropyLoss needs exactly one weight per output logit. Scatter the
    # computed weights into a full-length vector so a class that is absent
    # from the training split does not cause a size mismatch (its weight is
    # irrelevant because no training sample carries that label).
    fullWeight = np.ones(numClasses, dtype=np.float32)
    fullWeight[trainClasses.astype(int)] = classWeight
    weightTensor = torch.tensor(fullWeight, dtype=torch.float)

    # Tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.LongTensor(y_train)
    X_val_t = torch.FloatTensor(X_val)

    # Model
    model = WatchHistoryClassifier(input_dim=X.shape[1], num_classes=numClasses)
    criterion = nn.CrossEntropyLoss(weight=weightTensor)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print("Starting Training...\n")
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train_t)
        loss = criterion(outputs, y_train_t)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    # Dropout must be off for evaluation; without eval() the predictions
    # (and therefore the report) would be randomly perturbed.
    model.eval()
    with torch.no_grad():
        _, predicted = torch.max(model(X_val_t), 1)
    predicted = predicted.numpy()

    print("\n\nValidation Report:")
    # Report only classes that actually occur in the targets or predictions,
    # and pass them explicitly so names and labels are guaranteed to line up.
    presentClasses = sorted({int(c) for c in y_val} | {int(c) for c in predicted})
    presentNames = [targetNames[i] for i in presentClasses]
    print(classification_report(
        y_val,
        predicted,
        labels=presentClasses,
        target_names=presentNames,
        zero_division=0,
    ))
    return model

In [None]:
def predictFullHistory(model, vectorizer, fullDf):
    """Label every row of `fullDf` with the model's predicted category.

    Args:
        model: trained classifier; called on a float tensor of features and
            expected to return one row of four logits per sample.
        vectorizer: fitted vectorizer whose `transform` output offers
            `toarray()`.
        fullDf: DataFrame with a 'text' column.

    Returns:
        The same `fullDf` object, modified in place with a new 'category'
        column holding one of 'Learning', 'Entertainment', 'Music', 'Others'.
    """
    # Position in this tuple == class index produced by the model.
    categoryNames = ('Learning', 'Entertainment', 'Music', 'Others')

    features = torch.FloatTensor(vectorizer.transform(fullDf['text']).toarray())

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        classIdx = model(features).max(dim=1).indices

    fullDf['category'] = [categoryNames[i] for i in classIdx.tolist()]
    return fullDf

In [None]:
# Read both CSVs; each returned frame carries the combined 'text' column.
labledDf, fullDf = loadAndPrepData()

In [None]:
# TF-IDF features and zero-based labels for the labeled rows, plus the fitted vectorizer.
X_labeled, y_labeled, vectorizer = vectorizeData(labledDf, fullDf)

In [None]:
# Fit the classifier; train() also prints the loss every 10 epochs and a validation report.
model = train(X_labeled, y_labeled)

In [None]:
# Persist only the learned weights (state_dict). torch.save does not create
# missing parent directories, so make sure the target folder exists first.
Path(MODEL).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), MODEL)

In [None]:
# Persist the fitted vectorizer next to the model; create the target folder
# first so the dump does not fail on a fresh checkout without it.
Path(VECTORIZER).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(vectorizer, VECTORIZER)

In [None]:
# Classify the full history. predictFullHistory adds the 'category' column to
# fullDf in place and returns that same object, so finalDf is an alias of fullDf.
finalDf = predictFullHistory(model, vectorizer, fullDf)
finalDf