In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pathlib import Path

file_path = Path.cwd().joinpath("Data", "mpr_data_merged")
output_path = os.path.join(os.path.expanduser("~"), "documents/queensma/ma_essay/data/output")

In [2]:
## SKLEARN ##
from sklearn.model_selection import train_test_split
from sklearn import metrics

## TENSORFLOW ##
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Load data

Data was uploaded to kaggle and loaded into the notebook.

In [3]:
df = pd.read_csv(file_path.joinpath('data_deep_model_ready_V3.csv'))
df.head()

Unnamed: 0,text,class,mpr,wordcount,NPositiveWords,NNegativeWords,NNeutralWords,NUncertainWords,NStrongWords,NWeakWords,...,Poswords,Negwords,Neuwords,Unwords,Strongwords,Weakwords,Conwords,quarter,year,raw_text
0,Information received since the last Monetary P...,2,2000Q1,41,1,0,41,0,0,0,...,stronger,,information received since the last monetary p...,,,,,1,2000,Information received since the last Monetary P...
1,With the further strengthening of global deman...,2,2000Q1,25,1,0,25,0,0,0,...,strengthening,,with the further strengthening of global deman...,,,,,1,2000,With the further strengthening of global deman...
2,"This has been particularly true for oil, lumbe...",2,2000Q1,18,0,0,17,0,0,0,...,,,this has been particularly true for oil lumber...,,,,constraints,1,2000,"This has been particularly true for oil, lumbe..."
3,Higher crude oil prices have led to higher ene...,0,2000Q1,25,0,0,25,0,0,0,...,,,higher crude oil prices have led to higher ene...,,,,,1,2000,Higher crude oil prices have led to higher ene...
4,"As yet, however, these countries have not seen...",1,2000Q1,26,0,0,26,0,0,0,...,,,as yet however these countries have not seen a...,,,,,1,2000,"As yet, however, these countries have not seen..."


Select the variables we want to use in our model.

In [4]:
y = df['class']
X = df[
    ['text', 
     'year',
     'quarter',
     'wordcount', 
     'NPositiveWords', 
     'NNegativeWords', 
     'NUncertainWords', 
     'NConstWords', 
     'NStrongWords',
     'NWeakWords']
]

# Preprocess text data

In [5]:
## TOKENIZE AND VECTORIZE TEXT ##
vectorizer = layers.TextVectorization()
vectorizer.adapt(X['text'])

# Fit normalizer to training data
normalize = layers.Normalization()
normalize.adapt(X.loc[:, 'year':])

## SPLIT DATA ##
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

2023-07-22 17:42:45.290134: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [6]:
## RESHAPE ##
"""
Unlike sklearn, kera's requires the class labels to be one-hot encoded.
Therefore, we need to reshape the data into wide format
"""

def reshape_labels(data):
    data = pd.DataFrame(data)
    data[0] = data['class'].apply(lambda x: 1 if x == 0 else 0)
    data[1] = data['class'].apply(lambda x: 1 if x == 1 else 0)
    data[2] = data['class'].apply(lambda x: 1 if x == 2 else 0)
    return data.loc[:, 0:]

# Reshape target vectors
y_train_wide = reshape_labels(y_train)
y_test_wide = reshape_labels(y_test)

# LSTM -- Untrained EMBEDDINGS

In [7]:
def get_uncompiled_model(
    output_dim=64, 
    spatial_dropout_rate=0.3, 
    lstm_units=[128, 64, 32], 
    dense_units=[64, 32, 16], 
    dropout_rates=[0.2, 0.2, 0.2],
    pre_trained_embedding=None
):
    
    """
    Define the network/model
    
    This will be a 2 input model: 
        (i) text data 
        (ii) numeric data
        
    A 3 layer LSTM will be used to learn the text
    A 3 layer MLP model for numeric features
    
    Text data will be vectorized during training 
    Numeric data will be normalized during training
    
    Both vectorization and normalization has been learned outside of the model
    """
    # Define Network -- input layers
    text_inputs = keras.Input(shape=(1, ), name='text', dtype=tf.string) # text
    num_inputs = keras.Input(shape=(n_num_features, ), name='num') # numeric
    
    # Text layers
    """
    Embedding layer, 
    Spatial dropout, 
    3 alternating LSTM and Batch Normalization layers
    
    NOTE: If a pre-trained embedding matrix is used, then the embedding layer
    will not longer be trainable and will use the pre-trained embedding matrix given.
    """
    if pre_trained_embedding is not None:
        text_layers = layers.Embedding(
            input_dim=vocab_size, 
            output_dim=output_dim,
            embeddings_initializer=pre_trained_embedding, 
            trainable=False
        )(vectorizer(text_inputs))
    else:
        text_layers = layers.Embedding(input_dim=vocab_size, output_dim=output_dim)(vectorizer(text_inputs))
    
    text_layers = layers.SpatialDropout1D(spatial_dropout_rate)(text_layers)
    
    text_layers = layers.LSTM(lstm_units[0], return_sequences=True)(text_layers)
    text_layers = layers.BatchNormalization()(text_layers)
    
    text_layers = layers.LSTM(lstm_units[1], return_sequences=True)(text_layers)
    text_layers = layers.BatchNormalization()(text_layers)
    
    text_layers = layers.LSTM(lstm_units[2])(text_layers)
    text_layers = layers.BatchNormalization()(text_layers)

    # Numeric layers
    """
    Normalized numerical input data,
    3 Dense layers with Dropout and Batch Normalization.
    
    Normalization is known to help neural networks.
    """
    num_layers = layers.Dropout(dropout_rates[0])(normalize(num_inputs)) # normalize input data
    num_layers = layers.Dense(dense_units[0], activation='relu')(num_layers)
    num_layers = layers.BatchNormalization()(num_layers)

    num_layers = layers.Dropout(dropout_rates[1])(num_layers)
    num_layers = layers.Dense(dense_units[1], activation='relu')(num_layers)
    num_layers = layers.BatchNormalization()(num_layers)

    num_layers = layers.Dropout(dropout_rates[2])(num_layers)
    num_layers = layers.Dense(dense_units[2], activation='relu')(num_layers)
    num_layers = layers.BatchNormalization()(num_layers)
    
    # merge text and numeric features together
    x = layers.concatenate([text_layers, num_layers])
    
    # Output layer
    """
    Output layer, layer determines the form of the models output
    A softmax activation function is used because of the multiclass model
    3 unit dense layer is used because there are 3 classes (pos, neu, neg)
    """
    outputs = layers.Dense(3, activation='softmax')(x)

    # Model object
    model = keras.Model(inputs=[text_inputs, num_inputs], outputs=outputs, name='LSTM')
    return model

def compile_model(model, learning_rate=0.001, use_ema=False, momentum=0.99):
    """
    This compiles the model -- necessary step for all keras models
    
    Setup the loss/objective function
    Choose optimization algorithm
    Choose metrics used to measure loss or model performance
    """
    model.compile(
        loss='categorical_crossentropy', 
        optimizer=keras.optimizers.Adam(
            learning_rate=learning_rate, 
            use_ema=use_ema, 
            ema_momentum=momentum
        ), 
        metrics=[
            keras.metrics.CategoricalAccuracy(),
            keras.metrics.AUC(multi_label=True, num_labels=3),
            keras.metrics.Precision(),
            keras.metrics.Recall()
        ]
    )
    return model

In [8]:
# FUNCTIONS TO GET PREDICTIONS AND METRICS
# to use sklearn's classification report we need predicted labels as an output.

def get_predicted_probs(model, x={"text": X_test.loc[:, 'text'], "num": X_test.loc[:, 'year':]}):
    """
    We feed in the `text` and `num` data as seperate inputs.
    
    This gives us an array of the predicted probabilities for each class/label.
    
    e.g. [0.41, 0.89, 0.12]
    """
    return model.predict(x)

def get_predicted_labels(pred_probs):
    """
    After obtaining the predicted probabilities, we can get the
    predicted label using the `argmax` function -- base method use by kera's.
    """
    pred_labels = []
    
    for i in range(len(pred_probs)):
        pred_labels.append(np.argmax(pred_probs[i]))
        
    return pred_labels

## Initialization parameters and compiling model

In [9]:
## Initiate some parameters for TRAINING ##
BATCH_SIZE = 128
EPOCHS = 50

# Shape for input
n_num_features = X_train.loc[:, 'year':].shape[1]
# Input shape for text data must be (1, ), stated in the model

# Vocabulary size
vocab_size = len(vectorizer.get_vocabulary())+1

In [10]:
# Set random seed
np.random.seed(42)
tf.random.set_seed(42)

# Create and compile model instance
# model = compile_model(
#     get_uncompiled_model(
#         output_dim=128, 
#         lstm_units=[32, 32, 32], 
#         dense_units=[16, 16, 16],
#         dropout_rates=[0.3, 0.3, 0.3]
#     ), 
#     learning_rate=0.01,
# )

# print(model.summary())

## TRAINING ROUND 1

In [11]:
# Set random seed
np.random.seed(42)
tf.random.set_seed(42)

"""Create checkpoint, this will autosave the entire model per epoch
IF the model achieves a better accuracy than the best so far..."""

# checkpoint_path = "/Users/kelstonchen/Documents/QueensMA/MA_Essay/MODELS/checkpoints/lstm_model-{epoch:02d}-{val_categorical_accuracy:.3f}"

# checkpoint = ModelCheckpoint(
#     checkpoint_path, 
#     monitor='val_categorical_accuracy',
#     verbose=1,
#     save_best_only=True,
#     mode='max',
# )

# early_stopping = EarlyStopping(
#     monitor='val_categorical_accuracy', 
#     patience=5, 
#     verbose=1
# )

# # Train model -- First Round.
# model_hist = model.fit(
#     {"text": X_train.loc[:, 'text'], "num": X_train.loc[:, 'year':]}, 
#     y_train_wide,
#     epochs=EPOCHS,
#     validation_split=0.2,
#     batch_size=BATCH_SIZE,
#     verbose=1,
#     callbacks=[checkpoint, early_stopping]
# )

'Create checkpoint, this will autosave the entire model per epoch\nIF the model achieves a better accuracy than the best so far...'

### LOAD MODEL

In [12]:
## Load Saved Model ##
model = keras.models.load_model("/Users/kelstonchen/Documents/QueensMA/MA_Essay/MODELS/lstm_model-05-0.76")
model.summary()

Model: "LSTM"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 num (InputLayer)               [(None, 9)]          0           []                               
                                                                                                  
 text (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 normalization (Normalization)  (None, 9)            19          ['num[0][0]']                    
                                                                                                  
 text_vectorization (TextVector  (None, None)        0           ['text[0][0]']                   
 ization)                                                                                      

In [13]:
# Evaluate -- Test model
loss, acc, auc, pre, recall = model.evaluate(
    {"text": X_test.loc[:, 'text'], "num": X_test.loc[:, 'year':]}, 
    y_test_wide, 
    verbose=1
)



In [14]:
# Get predicted labels
pred_labels = get_predicted_labels(
    get_predicted_probs(model)
)
print(metrics.classification_report(y_test, pred_labels))

              precision    recall  f1-score   support

           0       0.94      0.28      0.43       893
           1       0.75      0.98      0.85      3408
           2       0.77      0.39      0.51       894

    accuracy                           0.76      5195
   macro avg       0.82      0.55      0.60      5195
weighted avg       0.78      0.76      0.72      5195



# PRE-TRAINED EMBEDDINGS

## GloVe

In [15]:
# Grabbing the vocabulary from our vectorizer, create a word index
# Note: vectorizer was fitted to our data
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [16]:
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), 
    "Documents/QueensMA/MA_Essay/Data/glove.6B/glove.6B.200d.txt"
)

In [17]:
def load_embeddings_index(path):
    embeddings_index = {}
    with open(path) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs
    return embeddings_index

In [18]:
embeddings_index = load_embeddings_index(path_to_glove_file)

## Creating the embedding matrix

In [19]:
def gen_embedding_matrix(embedding_index, embedding_size, other_embeddings_index=None):
    """
    The following grabs the embedding vector for each word found in our data
    to the pre-trained embedding vector from GloVe.

    The returned embedding matrix is one with the embedding vectors from GloVe.
    """

    ## INITAL PARAMS FOR EMBEDDING MATRIX ##
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((vocab_size, embedding_size))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word) # get the embedding vector from GloVe
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1

        # NEW ADDITION ##
        elif other_embeddings_index != None and embedding_vector is None:
            embedding_matrix[i] = other_embeddings_index.get(word)
            # print(word)
        else:
            misses += 1
            # print(word)

    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix

In [20]:
## INITAL PARAMS FOR EMBEDDING MATRIX ##
embedding_size = 200 # output dim - MUST match the dimension of pretrained embedding
# Vocabulary size
vocab_size = len(vectorizer.get_vocabulary())

# Generate matrix
embedding_matrix = gen_embedding_matrix(embedding_index=embeddings_index, embedding_size=embedding_size)

Converted 5273 words (686 misses)


In [21]:
# Set random seed
np.random.seed(42)
tf.random.set_seed(42)

# Create and compile model instance
model_glove = compile_model(
    get_uncompiled_model(
        spatial_dropout_rate=0.1,
        output_dim=embedding_size, 
        lstm_units=[128, 128, 128], 
        dense_units=[16, 16, 16],
        dropout_rates=[0.1, 0.1, 0.1],
        pre_trained_embedding=keras.initializers.Constant(embedding_matrix) # LOAD GloVe embeddings 
    ), 
    learning_rate=0.01,
)

## TRAINING ROUND 1

In [22]:
# Set random seed
np.random.seed(42)
tf.random.set_seed(42)

"""Create checkpoint, this will autosave the entire model per epoch
IF the model achieves a better accuracy than the best so far..."""

# checkpoint_path = "/Users/kelstonchen/Documents/QueensMA/MA_Essay/MODELS/checkpoints/glove_model-{epoch:02d}-{val_categorical_accuracy:.3f}"

# checkpoint = ModelCheckpoint(
#     checkpoint_path, 
#     monitor='val_categorical_accuracy',
#     verbose=1,
#     save_best_only=True,
#     mode='max',
# )

# # Train model -- First Round.
# model_hist = model_glove.fit(
#     {"text": X_train.loc[:, 'text'], "num": X_train.loc[:, 'year':]}, 
#     y_train_wide,
#     epochs=EPOCHS,
#     validation_split=0.2,
#     batch_size=200,
#     verbose=1,
#     callbacks=[checkpoint]
# )

'Create checkpoint, this will autosave the entire model per epoch\nIF the model achieves a better accuracy than the best so far...'

### LOAD MODEL

In [23]:
# ## Load Saved Model ##
model_glove = keras.models.load_model('/Users/kelstonchen/Documents/QueensMA/MA_Essay/MODELS/glove_model-16-0.793')
model_glove.summary()

Model: "LSTM"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 num (InputLayer)               [(None, 9)]          0           []                               
                                                                                                  
 text (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 normalization (Normalization)  (None, 9)            19          ['num[0][0]']                    
                                                                                                  
 text_vectorization (TextVector  (None, None)        0           ['text[0][0]']                   
 ization)                                                                                      

In [24]:
# Evaluate -- Test model
loss, acc, auc, pre, recall = model_glove.evaluate(
    {"text": X_test.loc[:, 'text'], "num": X_test.loc[:, 'year':]},
    y_test_wide, 
    verbose=1
)



In [25]:
# Get predicted labels
pred_labels = get_predicted_labels(
    get_predicted_probs(model_glove)
)
# print(metrics.classification_report(
#     y_test, pred_labels,
#     target_names=['negative', 'neutral', 'positive'], 
#     digits=3
# )
#      )
report = pd.DataFrame(
    metrics.classification_report(
        y_test, pred_labels, 
        target_names=['negative', 'neutral', 'positive'], 
        digits=3,
        output_dict=True
    )
)
display(report)
print(f"AUC: {auc:.3f}")



Unnamed: 0,negative,neutral,positive,accuracy,macro avg,weighted avg
precision,0.757627,0.797043,0.758958,0.788065,0.771209,0.783714
recall,0.50056,0.933392,0.521253,0.788065,0.651735,0.788065
f1-score,0.602832,0.859846,0.618037,0.788065,0.693572,0.774054
support,893.0,3408.0,894.0,0.788065,5195.0,5195.0


AUC: 0.877


In [26]:
## SAVE REPORT ##
report.to_csv(os.path.join(output_path, "tables", "LSTM_report.csv"))

In [27]:
## BALANCED ACCURACY ##
metrics.balanced_accuracy_score(y_test, pred_labels)

0.651734908538086