In [8]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [3]:
# Input CSVs read by loadAndPrepData(): the hand-labeled subset and the full history.
# NOTE(review): "LABLE" is a misspelling of "LABEL"; the name is kept because
# loadAndPrepData() references it.
LABLE_FILE = './data/lableHistory.csv'
FULL_DATAFILE = './data/cleanHistory.csv'
# Not referenced in the cells visible here; presumably the destination for the
# classified history — TODO confirm against the cell that writes it.
OUTPUTFILE = './data/fullClassified.csv'

# Not referenced in the cells visible here; presumably the save locations for the
# trained model weights and the fitted vectorizer — TODO confirm.
MODLE = 'models/classifier.pth'
VECTORIZER = 'models/vectorizer.pkl'

In [4]:
# Seed both random number generators used by this notebook from a single value
# so that a fresh Restart & Run All reproduces the same results.
RANDOM_SEED = 50
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [6]:
def loadAndPrepData(labelPath=None, fullPath=None):
    """Load the labeled and full history CSVs and add a combined ``text`` column.

    The ``text`` column is ``title + " " + channel``. Missing values in either
    column are replaced with an empty string first; otherwise a missing title
    would turn the whole concatenation into NaN.

    Args:
        labelPath: Path of the labeled CSV. Defaults to ``LABLE_FILE``.
        fullPath: Path of the full-history CSV. Defaults to ``FULL_DATAFILE``.

    Returns:
        tuple: ``(labeledDf, fullDf)``, each with the added ``text`` column.

    Raises:
        KeyError: If a CSV lacks the ``title`` or ``channel`` column.
    """
    # Resolve the defaults at call time so edits to the config cell are picked up.
    labeledDf = pd.read_csv(LABLE_FILE if labelPath is None else labelPath)
    fullDf = pd.read_csv(FULL_DATAFILE if fullPath is None else fullPath)

    for df in (labeledDf, fullDf):
        # NaN + str is NaN in pandas, so both operands must be filled.
        df['text'] = df['title'].fillna('') + " " + df['channel'].fillna('')

    return labeledDf, fullDf

In [9]:
def vectorizeData(labledDf, fullDf):
    """Fit a TF-IDF vectorizer and encode the labeled rows.

    The vectorizer is fitted on the ``text`` column of both frames (labeled
    rows first, then the full history), limited to 2000 features with English
    stop words removed.

    Args:
        labledDf: Frame with ``text`` and ``label`` columns.
        fullDf: Frame with a ``text`` column.

    Returns:
        tuple: ``(X_labeled, y_labeled, vectorizer)`` where ``X_labeled`` is the
        dense TF-IDF matrix of the labeled rows, ``y_labeled`` is the ``label``
        column shifted down by one, and ``vectorizer`` is the fitted vectorizer.
    """
    labeledText = labledDf['text']
    corpus = pd.concat([labeledText, fullDf['text']])

    tfidf = TfidfVectorizer(max_features=2000, stop_words='english').fit(corpus)

    features = tfidf.transform(labeledText).toarray()
    # Shift labels down by one so they start at zero.
    targets = labledDf['label'].values - 1

    return features, targets, tfidf

In [10]:
class WatchHistoryClassifier(nn.Module):
    """Feed-forward classifier with a single 64-unit hidden layer.

    Pipeline: Linear(input_dim -> 64) -> ReLU -> Dropout(0.3) ->
    Linear(64 -> num_classes). The forward pass returns raw logits.
    """

    def __init__(self, input_dim, num_classes):
        """Build the layers.

        Args:
            input_dim: Number of input features per sample.
            num_classes: Number of output logits.
        """
        super().__init__()
        hidden_units = 64

        # Hidden block: project the input down to 64 features.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

        # Output block: one logit per class.
        self.fc2 = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)

In [13]:
def train(X, y, epochs=100, lr=0.01):
    """Train a WatchHistoryClassifier and print a validation report.

    The data is split 80/20 (``random_state=50``). The model is trained
    full-batch with Adam and cross-entropy loss, and the loss is printed every
    10th epoch. A classification report on the held-out 20% is printed at the
    end.

    Args:
        X: 2-D feature array; ``X.shape[1]`` is used as the input dimension.
        y: 1-D array of integer class ids, expected to start at 0.
        epochs: Number of full-batch optimisation steps (default 100).
        lr: Adam learning rate (default 0.01).

    Returns:
        WatchHistoryClassifier: The trained model, left in evaluation mode.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=50)

    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.LongTensor(y_train)
    X_val_t = torch.FloatTensor(X_val)

    # Size the output layer from the largest class id rather than the number of
    # distinct ids: if some class is absent from the labeled data, counting
    # distinct ids would make the layer too small and CrossEntropyLoss would
    # reject the larger targets as out of bounds.
    num_classes = int(np.max(y)) + 1
    model = WatchHistoryClassifier(input_dim=X.shape[1], num_classes=num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train_t)
        loss = criterion(outputs, y_train_t)
        loss.backward()
        optimizer.step()

        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    # Switch dropout off before validating; otherwise units are still randomly
    # zeroed and the reported metrics are noisy and pessimistic.
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_t)
        _, predicted = torch.max(val_outputs, 1)
        print(classification_report(y_val, predicted.numpy(), zero_division=0))
    return model

In [14]:
def predictFullHistory(model, vectorizer, fullDf):
    """Predict a category for every row of ``fullDf``.

    The ``text`` column is encoded with ``vectorizer`` and passed through
    ``model`` in evaluation mode. The index of the largest logit plus one is
    written to ``predicted_label``, and its name to ``category_name``.

    Args:
        model: Trained module mapping a float feature tensor to class logits.
        vectorizer: Fitted vectorizer whose ``transform`` result has ``toarray()``.
        fullDf: Frame with a ``text`` column. It is modified in place.

    Returns:
        The same ``fullDf`` object with ``predicted_label`` and
        ``category_name`` columns added.
    """
    X_full = vectorizer.transform(fullDf['text']).toarray()
    X_full_t = torch.FloatTensor(X_full)

    model.eval()
    with torch.no_grad():
        outputs = model(X_full_t)
        _, predicted = torch.max(outputs, 1)

    # Class indices start at 0; the label ids written out start at 1.
    fullDf['predicted_label'] = predicted.numpy()+1

    # Keys must be integers to match the dtype of predicted_label; with string
    # keys Series.map finds no match and every category_name becomes NaN.
    CATEGORY = {
        1: 'Learning',
        2: 'Entertainment',
        3: 'Music',
        4: 'Finance',
        5: 'News',
        6: 'Others'
    }

    fullDf['category_name'] = fullDf['predicted_label'].map(CATEGORY)

    return fullDf