Import libraries

In [1]:
import nltk
import re
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.optimizers.legacy import Adam
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

Define the text cleaning and preprocessing functions

In [2]:
stopwords = set(nltk.corpus.stopwords.words('english'))
def cleantext(string):
    # Remove all punctuation
    string = re.sub(r"'s\b", '', string)
    string = re.sub(r'[^\w\s]', '', string)
    # Make all lowercase
    string = string.lower()
    # remove all stopwords
    string = ' '.join([word for word in string.split() if word not in stopwords])
    # Remove all special characters
    string = re.sub(r'\W+', ' ', string)
    return string
def lemmatize(string):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    string = ' '.join([lemmatizer.lemmatize(word) for word in string.split()])
    return string
def preprocess(obj):
    obj = cleantext(obj)
    obj = lemmatize(obj)
    return obj

Load in the data

In [3]:
data = pd.read_csv('../Datasets/news_data_labelled.csv', encoding='latin-1')

Apply preprocessing to the data

In [4]:
data.dropna(inplace=True)
data['Text'] = data['Title'] + ' ' + data['Text']
data['cleaned_text'] = data['Text'].apply(preprocess)

Tokenise the text and convert to sequences of integers

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['cleaned_text'])
sequences = tokenizer.texts_to_sequences(data['cleaned_text'])

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')


Found 1035707 unique tokens.


Pad the sequences to length of 400

In [6]:
data_pad = pad_sequences(sequences, maxlen=400)

Prepare labels for the model

In [7]:
labels = pd.get_dummies(data['Sentiment']).values

Train-Test Split

In [8]:
X_train, X_val, y_train, y_val = train_test_split(data_pad, labels, test_size=0.1, random_state=42)

Define the model architecture

In [9]:
def create_model(lstm_units=64, dropout_rate=0.2, learning_rate=0.001):
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, 100, input_length=400))
    model.add(LSTM(lstm_units, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation='softmax'))  
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


In [10]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

Hyperparameter tuning through a coarse parameter grid search, to narrow values down to a smaller range

In [11]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Define the parameter grid
param_grid = {
    'lstm_units': [64, 128],
    'dropout_rate': [0.2, 0.4],
    'learning_rate': [0.01, 0.001]
}

# This will generate all combinations of parameters
grid = list(ParameterGrid(param_grid))

# Placeholder for best score and best params
best_score = 0
best_params = None

# Iterate over all combinations
for params in grid:
    # Create a new model with current params
    model = create_model(lstm_units=params['lstm_units'],
                         dropout_rate=params['dropout_rate'],
                         learning_rate=params['learning_rate'])
    
    # Train the model (use a smaller subset of data for speed)
    history = model.fit(X_train[:int(len(X_train)*0.1)], y_train[:int(len(y_train)*0.1)],
                        batch_size=32,
                        epochs=3,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping])
    
    # Check if the performance is improved
    score = max(history.history['val_accuracy'])  # Use the best epoch's accuracy
    print(f"Params: {params}, Score: {score}")
    
    if score > best_score:
        best_score = score
        best_params = params

print(f"Best params: {best_params}, Best score: {best_score}")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.2, 'learning_rate': 0.01, 'lstm_units': 64}, Score: 0.8196427226066589
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.2, 'learning_rate': 0.01, 'lstm_units': 128}, Score: 0.8088504076004028
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.2, 'learning_rate': 0.001, 'lstm_units': 64}, Score: 0.817206859588623
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.2, 'learning_rate': 0.001, 'lstm_units': 128}, Score: 0.8272210359573364
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.4, 'learning_rate': 0.01, 'lstm_units': 64}, Score: 0.8213343024253845
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.4, 'learning_rate': 0.01, 'lstm_units': 128}, Score: 0.7853373289108276
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.4, 'learning_rate': 0.001, 'lstm_units': 64}, Score: 0.818458616733551
Epoch 1/3
Epoch 2/3
Epoch 3/3
Params: {'dropout_rate': 0.4, 'learning_rate': 0.001, 'lstm_units': 12

Testing smaller range to find optimal values

In [None]:
def create_model(lstm_units=64, dropout_rate=0.2, learning_rate=0.01, lstm_layers=1):
    model = Sequential()
    model.add(Embedding(input_dim=len(word_index) + 1, output_dim=100, input_length=400))
    for i in range(lstm_layers):
        return_sequences = i < lstm_layers - 1  # Only the last LSTM layer has return_sequences=False
        model.add(LSTM(lstm_units, return_sequences=return_sequences))
        if return_sequences:  # Optionally add dropout after each LSTM layer except the last one
            model.add(Dropout(dropout_rate))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation='softmax'))  # Assuming 3 classes for the output layer
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])
    return model

# Define a new grid with narrowed down parameters and including the number of LSTM layers
param_grid = {
    'lstm_units': [64],  # Narrowed down to the best value from the previous search
    'dropout_rate': [0.1, 0.2, 0.3],  # Slightly broader around the best value
    'learning_rate': [0.005, 0.01, 0.02],  # Slightly broader around the best value
    'lstm_layers': [1, 2]  # Testing both 1 and 2 layers
}

grid = list(ParameterGrid(param_grid))

# Iterate over the grid
for params in grid:
    model = create_model(lstm_units=params['lstm_units'],
                         dropout_rate=params['dropout_rate'],
                         learning_rate=params['learning_rate'],
                         lstm_layers=params['lstm_layers'])
    
    history = model.fit(X_train, y_train, 
                        batch_size=32, 
                        epochs=3,  # Assuming you want to keep epochs low for quick iterations
                        validation_data=(X_val, y_val), 
                        callbacks=[early_stopping])
    
    # Use the best epoch's accuracy as the score
    score = max(history.history['val_accuracy'])
    print(f"Params: {params}, Validation Accuracy: {score}")

Model training and evaluation

In [15]:
final_model = create_model(lstm_units=64,
                         dropout_rate=0.1,
                         learning_rate=0.005,
                         lstm_layers=2)
history = final_model.fit(X_train, y_train,
                    batch_size=32,
                    epochs=3,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])


Epoch 1/3
Epoch 2/3
Epoch 3/3


Evaluate model performance

In [16]:
score = final_model.evaluate(X_val, y_val, verbose=0)
print('Validation Loss:', score[0])
print('Validation Accuracy:', score[1])

Validation Loss: 0.21009482443332672
Validation Accuracy: 0.9263482093811035


Save model

In [17]:
final_model.save('../Finalised_Models/LSTM.h5')

  saving_api.save_model(


Return continous output

In [None]:
def predict_sentiment(text):
    # Preprocess the text
    preprocessed_text = preprocess(text)
    
    # Tokenize and pad the text
    sequence = tokenizer.texts_to_sequences([preprocessed_text])
    padded_sequence = pad_sequences(sequence, maxlen=400)
    
    # Get the model's prediction (probabilities for each class)
    prediction = final_model.predict(padded_sequence)[0]
    
    # Assuming the order of output probabilities is [negative, neutral, positive]
    # We can take a weighted sum of the probabilities and the sentiment scores
    sentiment_score = (prediction[0] * -1) + (prediction[1] * 0) + (prediction[2] * 1)
    

    return sentiment_score
