In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.metrics import classification_report
from collections import Counter

In [28]:
train = pd.read_csv('agnews_train.csv')

In [29]:
test = pd.read_csv('agnews_test.csv')

# Binary-Classifier Model with Neural Networks and Tensorflow

In [30]:
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

This the function that is created to evaluate each one of the binary classifier, the following elements are present:
- Confusion matrix
- ROC AUC score
- Classification report *(accuracy, recall, precision)*

In [None]:
#This function is used to calculate all the performance indicators of the different models
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, precision_score, recall_score


def metrics_model(x_test,y_test):
    print("Performance evaluation: "+"\n")
    
    predictions = np.round(model.predict(x_test),0)
    cm = metrics.confusion_matrix(y_test, predictions)
    print("Test score confusion matrix: "+"\n"+str(cm))

    print("\n"+"ROC AUC score: "+str(np.round(roc_auc_score(y_test, predictions),3)))

    report = classification_report(y_test, predictions)
    print("\n"+report)

This is the function that we created in order to produce a neural network model that we defined everytime that the function is called. The parameters of the function are the number of dimensions of the embedding layers, the max number of words considered by the algorithm (most commun words) and lenght of the sequence.

In [36]:
def model_creation(embedding_dims=100, max_features = 35000, sequence_length = 250):
  x_train=np.array(train.text)
  x_test=np.array(test.text)    

  vectorize_layer = TextVectorization(
      max_tokens=max_features,
      output_mode='int',
      output_sequence_length=sequence_length)

  vectorize_layer.adapt(x_train)

  model = tf.keras.models.Sequential()
  model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  model.add(vectorize_layer)
  model.add(tf.keras.layers.Embedding(max_features + 1,embedding_dims))
  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(16, activation='relu',kernel_regularizer='l2'))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.Dense(1,activation="sigmoid"))

  model.summary()

  model.compile(optimizer="adam",loss="binary_crossentropy", metrics=['accuracy'])

Fist, the science classification model:

In [37]:
model_creation()
y_train_science=np.array(train.science_int)
y_test_science=np.array(test.science_int)
history_science=model.fit(x_train, y_train_science, epochs=10, batch_size=1000, validation_data=(x_test,y_test_science))
metrics_model(x_test,y_test_science)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_8 (TextVe (None, 250)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 250, 100)          3500100   
_________________________________________________________________
flatten_8 (Flatten)          (None, 25000)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 16)                400016    
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 17        
Total params: 3,900,133
Trainable params: 3,900,133
Non-trainable params: 0
____________________________________________

Second, the world classification model:

In [38]:
model_creation()
y_train_world=np.array(train.world_int)
y_test_world=np.array(test.world_int)
history_world=model.fit(x_train, y_train_world, epochs=10, batch_size=1000, validation_data=(x_test,y_test_world))
metrics_model(x_test,y_test_world)

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_9 (TextVe (None, 250)               0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 250, 100)          3500100   
_________________________________________________________________
flatten_9 (Flatten)          (None, 25000)             0         
_________________________________________________________________
dense_18 (Dense)             (None, 16)                400016    
_________________________________________________________________
dropout_9 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 17        
Total params: 3,900,133
Trainable params: 3,900,133
Non-trainable params: 0
____________________________________________

Third, the business classification model:

In [39]:
model_creation()
y_train_business=np.array(train.business_int)
y_test_business=np.array(test.business_int)
history_business=model.fit(x_train, y_train_business, epochs=10, batch_size=1000, validation_data=(x_test,y_test_business))
metrics_model(x_test,y_test_business)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_10 (TextV (None, 250)               0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 250, 100)          3500100   
_________________________________________________________________
flatten_10 (Flatten)         (None, 25000)             0         
_________________________________________________________________
dense_20 (Dense)             (None, 16)                400016    
_________________________________________________________________
dropout_10 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 17        
Total params: 3,900,133
Trainable params: 3,900,133
Non-trainable params: 0
___________________________________________

Finally, the sports classification model:

In [40]:
model_creation()
y_train_sports=np.array(train.sports_int)
y_test_sports=np.array(test.sports_int)
history_sports=model.fit(x_train, y_train_sports, epochs=10, batch_size=1000, validation_data=(x_test,y_test_sports))
metrics_model(x_test,y_test_sports)

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_11 (TextV (None, 250)               0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 250, 100)          3500100   
_________________________________________________________________
flatten_11 (Flatten)         (None, 25000)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 16)                400016    
_________________________________________________________________
dropout_11 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 17        
Total params: 3,900,133
Trainable params: 3,900,133
Non-trainable params: 0
___________________________________________

# Multi-Classifier Model with Neural Networks and Tensorflow
First, encoding the y variable in an appropriate format for the softmax layer in tensorflow. Therefore, we transformed the ordinal encoding into a binary list with the 1 indicating the class.

In [41]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False)
#Training set
np_train_label_int=np.array(train.label_int)
np_train_label_int=np_train_label_int.reshape(len(np_train_label_int), 1)
y_train_softmax = onehot_encoder.fit_transform(np_train_label_int)

#Test set
np_test_label_int=np.array(test.label_int)
np_test_label_int=np_test_label_int.reshape(len(np_test_label_int), 1)
y_test_softmax = onehot_encoder.transform(np_test_label_int)

Creation of the model:

In [42]:
embedding_dims=150
max_features = 35000
sequence_length = 250

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

x_train=np.array(train.text)
x_test=np.array(test.text)    
vectorize_layer.adapt(x_train)

model_soft = tf.keras.models.Sequential()
model_soft.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_soft.add(vectorize_layer)
model_soft.add(tf.keras.layers.Embedding(max_features + 1,embedding_dims))
model_soft.add(tf.keras.layers.Flatten())
model_soft.add(tf.keras.layers.Dense(32, activation='relu',kernel_regularizer='l2'))
model_soft.add(tf.keras.layers.Dropout(0.5))
model_soft.add(tf.keras.layers.Dense(16, activation='relu',kernel_regularizer='l2'))
model_soft.add(tf.keras.layers.Dropout(0.5))
model_soft.add(tf.keras.layers.Dense(4, activation=tf.nn.softmax))

model_soft.summary()

model_soft.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy'])

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_12 (TextV (None, 250)               0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 250, 150)          5250150   
_________________________________________________________________
flatten_12 (Flatten)         (None, 37500)             0         
_________________________________________________________________
dense_24 (Dense)             (None, 32)                1200032   
_________________________________________________________________
dropout_12 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_13 (Dropout)         (None, 16)              

Training of the model:

In [43]:
history_softmax=model_soft.fit(x_train, y_train_softmax, epochs=20, batch_size=1000, validation_data=(x_test,y_test_softmax))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Evaluation of the model:

In [45]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score

y_pred_softmax=np.round(model_soft.predict(x_test),0)
roc_s = roc_auc_score(y_test_softmax, np.round(model_soft.predict(x_test)), average='macro', multi_class='ovo')
print("ROC AUC value test score: {:.3f}".format(roc_s),"\n")
accuracy_test=accuracy_score(y_test_softmax, y_pred_softmax)
print("Accuracy test set: {:.3f}".format(accuracy_test),"\n")

report = classification_report(y_test_softmax, y_pred_softmax)
print(report)

conf = multilabel_confusion_matrix(y_test_softmax, y_pred_softmax)
print("Confusion matrix: \n{}".format(conf))

ROC AUC value test score: 0.934 

Accuracy test set: 0.896 

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      2537
           1       0.96      0.98      0.97      2458
           2       0.93      0.88      0.90      2509
           3       0.88      0.86      0.87      2496

   micro avg       0.91      0.90      0.90     10000
   macro avg       0.91      0.90      0.90     10000
weighted avg       0.91      0.90      0.90     10000
 samples avg       0.90      0.90      0.90     10000

Confusion matrix: 
[[[7165  298]
  [ 339 2198]]

 [[7442  100]
  [  58 2400]]

 [[7313  178]
  [ 300 2209]]

 [[7207  297]
  [ 343 2153]]]


  _warn_prf(average, modifier, msg_start, len(result))
