In [44]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.utils.class_weight import compute_class_weight
from keras.optimizers import Adam
# from sklearn.utils.class_weight import compute_class_weight
# from scikeras.wrappers import KerasClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from newsapi import NewsApiClient
import string
import spacy
from sklearn.utils import resample
import pickle

In [45]:
# Training corpus (its "headlines" and "outcome" columns are used below) and a
# second CSV kept in `test_data`.
data = pd.read_csv(filepath_or_buffer="../csv_file/big_data.csv")
test_data = pd.read_csv(filepath_or_buffer="../csv_file/test_data.csv")

In [46]:
# Download NLTK resources (tokenizer data, stop-word list, WordNet).
# nltk.download() only reports "already up-to-date" when a package is present.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Download the spaCy model only when it is not installed yet, so that re-running
# the notebook does not trigger a fresh install (and keeps working offline).
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soder\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soder\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\soder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [47]:
# Load the `en_core_web_sm` spaCy pipeline; preprocess_text() calls it to tokenize text.
nlp = spacy.load("en_core_web_sm")

In [48]:
# Paths for the GloVe -> word2vec conversion: the raw GloVe text file (input)
# and the word2vec-format file that the conversion writes and the loader reads (output).
glove_input_file = '../pretrained_word_vectors/glove.6B.100d.txt'
word2vec_output_file = '../pretrained_word_vectors/glove.6B.100d.word2vec'

In [49]:
# Rewrite the GloVe text file in word2vec format at `word2vec_output_file`.
# NOTE(review): this conversion is repeated on every run of the notebook. gensim 4.x
# also marks glove2word2vec as deprecated in favour of
# KeyedVectors.load_word2vec_format(..., binary=False, no_header=True), which reads the
# GloVe text directly — confirm against the installed gensim version.
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [50]:
# Load the GloVe vectors straight from the original text file.
# GloVe files have no "<vocab_size> <dim>" first line; `no_header=True` makes gensim
# infer both from the file, so loading no longer depends on the deprecated
# glove2word2vec conversion having produced `word2vec_output_file`.
glove_model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)

In [51]:
# Characters treated as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# NLTK helpers, built on first use and then shared by every preprocess_text() call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached (stop_words, stemmer, lemmatizer) triple, building it once."""
    if not _PREPROCESS_CACHE:
        # All values are constructed before the dict is touched, so a failure
        # (e.g. missing NLTK corpus) never leaves a half-filled cache behind.
        _PREPROCESS_CACHE.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return (
        _PREPROCESS_CACHE["stop_words"],
        _PREPROCESS_CACHE["stemmer"],
        _PREPROCESS_CACHE["lemmatizer"],
    )


def _has_word_character(token):
    """Return True when the token contains at least one non-punctuation, non-space character."""
    return any(ch not in _PUNCTUATION_CHARS and not ch.isspace() for ch in token)


def preprocess_text(text):
    """Turn one raw text into a list of normalized tokens.

    Steps, in order: lowercase, tokenize with the spaCy pipeline `nlp`, drop
    tokens made up only of punctuation and/or whitespace, drop English stop
    words, Porter-stem, then WordNet-lemmatize each remaining token.

    Parameters
    ----------
    text : str
        The raw text. Anything that is not a string (for example the NaN that
        pandas produces for an empty CSV cell) and blank strings yield ``[]``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Missing values would otherwise crash on `.lower()`.
    if not isinstance(text, str) or not text.strip():
        return []

    stop_words, stemmer, lemmatizer = _preprocess_resources()

    # Lowercasing + tokenization using spaCy
    tokens = [token.text for token in nlp(text.lower())]

    # Punctuation removal. A per-character check is used because
    # `token in string.punctuation` is a substring test and lets multi-character
    # punctuation such as "..." or "?!" through.
    tokens = [token for token in tokens if _has_word_character(token)]

    # Stop word removal
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming followed by lemmatization (same order as before).
    # NOTE(review): Porter stems are often not dictionary words (e.g. "compani"),
    # so they may not be found in a pre-trained embedding vocabulary — confirm
    # that stemming is wanted before the GloVe lookup.
    return [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]


In [52]:
# Model inputs (headline text) and targets (outcome column) from the training frame.
texts, labels = data["headlines"], data["outcome"]

In [53]:
# Run every headline through preprocess_text(); the result is one token list per headline.
preprocessed_texts = list(map(preprocess_text, texts))

In [61]:
# Vocabulary cap: handed to the Tokenizer as `num_words` and reused later when the
# embedding matrix is sized and filled.
max_words = 23000
tokenizer = Tokenizer(num_words=max_words)
# Learn the word -> index mapping from the preprocessed token lists ...
tokenizer.fit_on_texts(preprocessed_texts)
# ... then encode each token list as a sequence of word indices.
sequences = tokenizer.texts_to_sequences(preprocessed_texts)

In [62]:
# Padding sequences: bring every encoded headline to the fixed length `maxlen`
# (pad_sequences' default padding/truncation settings are used — none are overridden).
maxlen = 75
X = pad_sequences(sequences, maxlen=maxlen)

In [63]:
# Convert labels to a numpy array; `y` is the target passed to train_test_split.
y = np.array(labels)

In [64]:
# Create an embedding matrix: collect the sizes it needs.
word_index = tokenizer.word_index  # word -> integer index mapping from the fitted tokenizer
# Number of matrix rows, capped at max_words. The +1 presumably leaves room for
# index 0, which the tokenizer does not appear to assign to a word — TODO confirm.
num_words = min(max_words, len(word_index) + 1)
embedding_dim = 100  # GloVe embedding dimension

In [65]:
# One row per word index, one column per embedding dimension. Starts as all zeros;
# rows not overwritten by the fill loop (words absent from GloVe) stay zero vectors.
embedding_matrix = np.zeros((num_words, embedding_dim))

In [66]:
# Copy the GloVe vector of every in-vocabulary word into its row of the matrix.
for token, row in word_index.items():
    # Skip indices beyond the vocabulary cap and words GloVe does not know;
    # their rows keep the initial zeros.
    if row >= max_words or token not in glove_model:
        continue
    embedding_matrix[row] = glove_model[token]

In [67]:
# Define the model with pre-trained embeddings.
# The layer's input_dim must equal the number of rows in `embedding_matrix`, which is
# `num_words` (= min(max_words, vocabulary size + 1)). Passing `max_words` here makes
# the supplied weights mismatch the layer shape whenever the fitted vocabulary is
# smaller than the cap.
embedding_layer = Embedding(num_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False)
model = Sequential()
model.add(embedding_layer)  # frozen GloVe lookup (trainable=False)
model.add(LSTM(50))  # single recurrent layer with 50 units
model.add(Dense(1, activation='sigmoid'))  # one sigmoid output for the binary label

In [68]:
# Binary classification setup: binary cross-entropy matches the single sigmoid output;
# accuracy is the metric reported during training and evaluation.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [69]:
# Hold out 20% of the samples for the final evaluation; the fixed random_state keeps
# the shuffled split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [70]:
# Training hyperparameters passed to model.fit().
# NOTE(review): the training log stored in this notebook reads "Epoch 1/5" .. "Epoch 5/5",
# so that output was not produced with the value set here — re-run the notebook top to
# bottom to bring code and recorded results back in sync.
epochs = 10
batch_size = 64

In [74]:
# Train on the training split; validation_split=0.2 sets aside part of that split
# for validation during training.
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1a5b70e6850>

In [75]:
# Evaluate the model on the held-out test split and print accuracy as a percentage.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 89.45%


In [77]:
# Predicted probabilities for the test split.
y_prob = model.predict(X_test)

# Threshold at 0.5 to obtain hard 0/1 predictions.
y_pred = (y_prob >= 0.5).astype(int)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 summary.
cr = classification_report(y_test, y_pred)
print("Classification Report:", cr, sep="\n")

Confusion Matrix:
[[1576  184]
 [ 229 1926]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      1760
           1       0.91      0.89      0.90      2155

    accuracy                           0.89      3915
   macro avg       0.89      0.89      0.89      3915
weighted avg       0.89      0.89      0.89      3915



In [78]:
# Save the trained model to an .h5 file.
# NOTE(review): only the Keras model is written here; the fitted `tokenizer` is not
# saved in the cells visible in this section, and inference needs the same word index —
# confirm it is persisted elsewhere.
model.save("../trained_models_and_preprocessing pipeline/keras_model/keras_model_with_glove_2.h5")

  saving_api.save_model(


In [36]:
# Reload a saved model from disk.
# NOTE(review): this reads "keras_model_with_glove.h5", while the save cell in this
# notebook writes "keras_model_with_glove_2.h5", and this cell's execution count (36)
# predates the surrounding cells — confirm which file is meant to be loaded.
loaded_model = load_model("../trained_models_and_preprocessing pipeline/keras_model/keras_model_with_glove.h5")