In [142]:
import os
import pandas as pd
import sys
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
nltk.download('stopwords')

In [143]:
def vectorizeData(X_train, X_test):
    cv = CountVectorizer(stop_words=stopwords.words('english'))
    X_train_counts = cv.fit_transform(X_train['headline'])
    X_test_counts = cv.transform(X_test['headline'])
    return X_train_counts, X_test_counts


def splitTrainingData(df, featureCols, targetCols, random=False):
    state = 42 if random else None
    X = df[featureCols]
    y = df[targetCols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    X_train, X_test = vectorizeData(X_train, X_test)
    return X_train, X_test, y_train, y_test


"""
Read in the data, we are only interested in headline and category. 
One hot encode the categories
"""
df = pd.read_json("data/News_Category_Dataset_v3.json", lines=True)
df = df[['headline', 'category']]
# print(df['category'].unique())
# get only subset of categories
# df = df[df['category'].isin(['POLITICS', 'ENTERTAINMENT', 'TECH'])]
df = pd.get_dummies(df, columns=['category'])
print(df.columns)


Index(['headline', 'category_ARTS', 'category_ARTS & CULTURE',
       'category_BLACK VOICES', 'category_BUSINESS', 'category_COLLEGE',
       'category_COMEDY', 'category_CRIME', 'category_CULTURE & ARTS',
       'category_DIVORCE', 'category_EDUCATION', 'category_ENTERTAINMENT',
       'category_ENVIRONMENT', 'category_FIFTY', 'category_FOOD & DRINK',
       'category_GOOD NEWS', 'category_GREEN', 'category_HEALTHY LIVING',
       'category_HOME & LIVING', 'category_IMPACT', 'category_LATINO VOICES',
       'category_MEDIA', 'category_MONEY', 'category_PARENTING',
       'category_PARENTS', 'category_POLITICS', 'category_QUEER VOICES',
       'category_RELIGION', 'category_SCIENCE', 'category_SPORTS',
       'category_STYLE', 'category_STYLE & BEAUTY', 'category_TASTE',
       'category_TECH', 'category_THE WORLDPOST', 'category_TRAVEL',
       'category_U.S. NEWS', 'category_WEDDINGS', 'category_WEIRD NEWS',
       'category_WELLNESS', 'category_WOMEN', 'category_WORLD NEWS',
      

In [144]:
"""
Split the data into training and testing
"""
feature_columns = ['headline']
category_columns = df.columns[1:]
df = df.sample(frac=.1).reset_index(drop=True)
X_train, X_test, y_train, y_test = splitTrainingData(df, feature_columns, category_columns)
X_train = X_train.toarray()
X_test = X_test.toarray()


In [145]:
""" Declare the Model """
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.models import Sequential
from keras.initializers import HeNormal
from keras.regularizers import l2
from keras.callbacks import LearningRateScheduler
from keras.optimizers.schedules import ExponentialDecay
from keras.layers import Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from keras.losses import CategoricalCrossentropy 
from keras.layers import MultiHeadAttention, Input, Reshape
from keras.models import Model
import tensorflow as tf
from tensorflow.keras.optimizers import Adam


class NLTK_Classifier:
    def __init__(self, input_shape=(1, 8744), num_classes=3):
        inputs = Input(shape=input_shape)
        transformer_output = self.transformLayer(heads=8)(inputs, inputs, inputs)
        dense_output = Dense(512, activation='relu')(transformer_output)
        outputs = Dense(num_classes, activation='softmax')(dense_output)

        self.model = Model(inputs=inputs, outputs=outputs)
        
    # custom transformer layer 
    """
    A transformer layer expects inputs to be of shape (batch_size, seq_len, dimension of each input vector)
    """
    def transformLayer(self, heads):
        return MultiHeadAttention(
                num_heads=heads, key_dim=8, dropout=0.1,
                kernel_initializer='he_normal', bias_initializer='zeros',
                bias_regularizer=l2(0.01), kernel_regularizer=l2(0.01)
        )
    
    # Customer Dense layer
    def DenseLayer(self, nodes, activation='relu'):
        return Dense(
            nodes, activation=activation, 
            kernel_initializer=HeNormal(), bias_initializer=HeNormal(),
            kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)
        )

    # Resets weights to HeNormal
    def reset_weights(self):
        initial_weights = self.model.get_weights()
        self.model.set_weights(initial_weights)

    # compile the model
    def compile(self):
        self.model.compile(optimizer=Adam(), loss=CategoricalCrossentropy(), metrics=['accuracy'])

    # Run the model. Forward fit using a learning rate scheduler
    def fit(self, X_train, training_labels, epochs=1, batch_size=32):
        lr_scheduler = ExponentialDecay(initial_learning_rate=0.001, decay_steps=5, decay_rate=0.5)
        self.model.fit(X_train, training_labels, epochs=epochs, 
                    batch_size=batch_size, callbacks=[LearningRateScheduler(lr_scheduler)])



In [146]:
# Rows indicate headlines, columns indicate words
print(X_train.shape)

# A transformer layer expects inputs to be of shape (batch_size, seq_len, dimension of each input vector)
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
print(X_train.shape)
input_shape = X_train.shape[1:]

y_train = np.expand_dims(y_train, axis=1)
print(f'y_train shape: {y_train.shape}')

model = NLTK_Classifier(input_shape=input_shape, num_classes=y_train.shape[2])
model.compile()
model.reset_weights()

history = model.fit(X_train, y_train, epochs=21, batch_size=64)

# Save the model
model.model.save('models/transformer_model.keras')


(16762, 19322)
(16762, 1, 19322)
y_train shape: (16762, 1, 42)
Epoch 1/32


2024-03-07 14:09:23.682208: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2591002912 exceeds 10% of free system memory.


 59/262 [=====>........................] - ETA: 3:00 - loss: 300.6331 - accuracy: 0.2757

KeyboardInterrupt: 

In [None]:
"""
Load the Model and test it with some data
"""
from keras.models import load_model
model = load_model('models/transformer_model.keras')
