In [None]:
###Import statements
import numpy as np
import pandas as pd
from tensorflow import keras
from keras import Input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import L2
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from keras.layers import concatenate

In [None]:
###load dataframes and specify the input column (to train the model on)
### NOTE(review): read_pickle can execute arbitrary code -- only load pickles from a trusted source
ttrain=pd.read_pickle('./news_ttrain.pkl')
tvalid=pd.read_pickle('./news_tvalid.pkl')
test=pd.read_pickle('./news_test.pkl')
input_col='cleaned_words'

###Train a tokenizer on the training texts only and convert the input_col to integer sequences
tokenizer=Tokenizer()
train_list_str=ttrain[input_col].tolist()
tokenizer.fit_on_texts(train_list_str)
train_list = tokenizer.texts_to_sequences(train_list_str)

###compute the max length of a sequence and the number of distinct words, then pad sequences to equal length
tlmax_len=max(len(tlist) for tlist in train_list)
### vocabulary size comes straight from the fitted tokenizer; taking max() over every
### sequence raised ValueError as soon as one training row tokenized to an empty list
tlnum_words=len(tokenizer.word_index)
train_list = pad_sequences(train_list, maxlen=tlmax_len)

In [None]:
###Sanity check: longest training sequence and vocabulary size, followed by the padded training matrix
print(f"{tlmax_len} {tlnum_words}")
train_list

In [None]:
###The validation and test sets go through the same steps as the train set,
###re-using the tokenizer and maximum length that were fitted on the training data
def _encode_split(frame):
    """Return (raw_texts, padded_ids) for one dataframe split.

    The texts in `input_col` are mapped to integer sequences with the
    already-fitted `tokenizer`; each sequence is cut down to its first
    `tlmax_len` entries and then padded to exactly `tlmax_len`.
    """
    raw_texts = frame[input_col].tolist()
    token_ids = tokenizer.texts_to_sequences(raw_texts)
    ###need to truncate any sequences that are too long, then pad to appropriate length
    clipped = [ids[:tlmax_len] for ids in token_ids]
    return raw_texts, pad_sequences(clipped, maxlen=tlmax_len)

valid_list_str, valid_list = _encode_split(tvalid)
test_list_str, test_list = _encode_split(test)

In [None]:
###Shapes of the padded train / validation / test matrices, one per line
for padded_split in (train_list, valid_list, test_list):
    print(padded_split.shape)

In [None]:
###Now we make one hot encoded y vectors for the train, validation and test sets.
###The label set is fixed from the training data and shared by all three splits, so every
###matrix has the same columns in the same (sorted) order even if a split is missing a class.
label_names = sorted(ttrain['category_col'].dropna().unique())

### a label the model was never trained on cannot be encoded -- fail loudly instead of
### silently producing an all-zero row for it
for split_name, split_frame in (('validation', tvalid), ('test', test)):
    unseen_labels = set(split_frame['category_col'].dropna()) - set(label_names)
    if unseen_labels:
        raise ValueError(f"{split_name} set contains labels absent from the training set: {sorted(unseen_labels)}")

def _one_hot(frame):
    """One-hot encode frame['category_col'] against the shared label_names columns."""
    as_categorical = pd.Categorical(frame['category_col'], categories=label_names)
    ### dtype is pinned so the result is numeric regardless of the pandas default
    return pd.get_dummies(as_categorical, dtype=np.uint8).to_numpy()

ytrain= _one_hot(ttrain)
yvalid= _one_hot(tvalid)
ytest= _one_hot(test)

print(ytrain.shape, yvalid.shape, ytest.shape)
ytrain

In [None]:
###Some model hyperparameters, chosen by training different models over a set of options
embed_len=256
reg_param=0 ### in testing, even small regularization worsened accuracy
lr=0.0002 ###smaller than default learning rate leads to more stable outcomes, albeit longer train times
nfeatures=256
n_classes=ytrain.shape[1] ### output width follows the one-hot label matrix instead of a hard-coded 12

### START OF MODEL ###
### shape is given as a tuple, and the tensor is not called `input` so the Python builtin stays usable
model_input = Input(shape=(tlmax_len,))
x = model_input
### tlnum_words+1 rows: token indices start at 1 and index 0 is left for the padding value
### NOTE(review): mask_zero=True may have no effect on the Conv1D towers below -- confirm
x=Embedding(tlnum_words+1, embed_len, input_length=tlmax_len, mask_zero=True)(x)

###Three parallel convolution towers (nfeatures filters each) with kernel sizes 2, 4, and 6,
###each reduced by global max pooling. kernel arrangement chosen as the best from a list of tested options
tower_1 = Conv1D(nfeatures, 2, padding='valid', activation='relu', kernel_regularizer=L2(reg_param))(x)
tower_1 = GlobalMaxPooling1D()(tower_1)
tower_2 = Conv1D(nfeatures, 4, padding='valid', activation='relu', kernel_regularizer=L2(reg_param))(x)
tower_2 = GlobalMaxPooling1D()(tower_2)
tower_3 = Conv1D(nfeatures, 6, padding='valid', activation='relu', kernel_regularizer=L2(reg_param))(x)
tower_3 = GlobalMaxPooling1D()(tower_3)

###Merge the output of the 3 towers, flatten, then combine their output with a dense softmax layer with dropout
merged = concatenate([tower_1, tower_2, tower_3], axis=1)
merged = Flatten()(merged)
merged = Dropout(0.3)(merged)
output=Dense(n_classes, activation='softmax', kernel_regularizer=L2(reg_param))(merged)
model=Model(inputs=model_input, outputs=output)
###Compile model with custom choice of learning rate. The labels are one-hot with exactly one
###class per row and the output is a softmax, so the matching loss is categorical crossentropy
###(binary crossentropy would score the 12 outputs as independent yes/no problems)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])
### END OF MODEL ###


In [None]:
###Stop as soon as the validation loss fails to improve for one epoch.
###restore_best_weights=True rolls the model back to the best epoch when that happens;
###without it the model keeps the weights of the last (worse) epoch that triggered the stop
es = EarlyStopping(monitor='val_loss', verbose=1, patience=1, restore_best_weights=True)
out = model.fit(train_list, ytrain, validation_data=(valid_list, yvalid), epochs=3, callbacks=[es])

In [None]:
###Generate prediction vectors on the test set
### category names, in the same (sorted) order as the one-hot label columns
categories = ['BLACK VOICES', 'BUSINESS', 'ENTERTAINMENT', 'FOOD & DRINK', 'PARENTING', 'POLITICS', 'QUEER VOICES', 'SPORTS', 'STYLE & BEAUTY', 'TRAVEL', 'WELLNESS', 'WORLD NEWS'] 
y_prob=model.predict(test_list)
y_pred=y_prob.tolist()
### argmax yields exactly one class per row; comparing every entry to the row maximum
### produced several labels for a single row whenever two probabilities tied, which
### left ypred2 longer than the test set
pred_idx=y_prob.argmax(axis=1)
ypred1=np.eye(y_prob.shape[1], dtype=int)[pred_idx].tolist() ### vector of 0's and 1's
ypred2=[categories[i] for i in pred_idx] ### vector of corresponding string label
### scikit-learn convention is (y_true, y_pred)
accuracy_score(ytest, ypred1)


In [None]:
###Plotting a confusion matrix for the test set
### labels=categories pins the row/column order of cm to the same list used for the tick
### labels below, so the axes cannot drift out of sync with the counts
cm = confusion_matrix(test['category_col'].to_list(), ypred2, labels=categories)
### disp is not used by the heatmap below
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=categories)
labels = categories
class_names = labels

# Plot confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(cm, annot=True, ax=ax, fmt='g'); #annot=True to annotate cells

# labels, title and ticks
ax.set_xlabel('Predicted', fontsize=20)
ax.xaxis.set_label_position('bottom')
ax.set_xticklabels(class_names, fontsize=10, rotation=90)
ax.xaxis.tick_bottom()

ax.set_ylabel('True', fontsize=20)
ax.set_yticklabels(class_names, fontsize=10, rotation=0)

ax.set_title('Refined Confusion Matrix', fontsize=20)
plt.show()

In [None]:
###Per-class metrics on the test set, with rows named after the category list
report_text = classification_report(ytest, ypred1, target_names=categories)
print(report_text)

In [None]:
###Attach the predicted category strings to the test dataframe and pickle it for later analysis
pred_column = 'cnn_cat_pred'
output_path = './news_test_with_pred.pkl'
test[pred_column] = ypred2
test.to_pickle(output_path)

In [None]:
###Display the first 50 test rows where the predicted category differs from the true one (I find them interesting)
pd.set_option('display.max_colwidth', None)  # show full text instead of truncating long cells
shown_cols = ['headline_col', 'short_description', 'cleaned_words', 'category_col', 'cnn_cat_pred']
wrong_mask = test['category_col'].ne(test['cnn_cat_pred'])
test.loc[wrong_mask, shown_cols].head(50)