<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextCNN_with_GloVe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/share/21220a6d-4cca-48bc-b461-bca6b0e140bc

In [None]:
pip install tensorflow

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Path of the Excel workbook that holds the posts, relative to the notebook's
# working directory. (A previous placeholder assignment was overwritten
# immediately and has been removed.)
file_path = 'ARPs_and_ProgrammingPosts.xlsx'
data = pd.read_excel(file_path)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [4]:
# Pick the model input (question text) and the target column from the loaded frame
X, y = data['Question_body'], data['Label']

In [5]:
# Fit the encoder on the raw labels, then convert them to their encoded form.
# The fitted encoder stays available for mapping predictions back later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [6]:

# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)


In [7]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each usable line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_file: Path of the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and lines with a token but no components;
            # indexing them would raise IndexError or store an empty vector.
            if len(values) < 2:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [8]:
# Load the pre-trained vectors from 'glove.6B.100d.txt', which must be present
# in the working directory, and report how many tokens were read.
glove_file = 'glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_file)
print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.


Step 3: Tokenize Text and Create Embedding Matrix

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training texts only
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Embedding matrix: row i holds the GloVe vector of the word whose tokenizer
# index is i; words without a GloVe entry keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros(shape=(len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Convert the texts to index sequences and pad/truncate them to a fixed length
max_length = 100
X_train_padded = pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=max_length,
)
X_test_padded = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=max_length,
)


Step 4: Create and Train TextCNN Model

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate

# Create the TextCNN model
def create_text_cnn_model(vocab_size, embedding_dim, max_length, embedding_matrix):
    """Build and compile a three-branch 1D-CNN binary text classifier.

    The input sequence is embedded with frozen pre-trained vectors, passed
    through three parallel Conv1D branches (kernel sizes 3, 4 and 5, 128
    filters each) that are each max-pooled with pool size 2, concatenated
    along axis 1, flattened, and fed through Dense(128, relu), Dropout(0.5)
    and a single sigmoid output unit.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        max_length: Length of the (padded) input sequences.
        embedding_matrix: Pre-trained weights for the embedding layer; its
            shape must match ``(vocab_size, embedding_dim)`` for
            ``set_weights`` to accept it.

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    inputs = Input(shape=(max_length,))

    # The deprecated `input_length` argument and the legacy constructor-level
    # `weights=` keyword are avoided: the sequence length already comes from
    # `Input`, and the pre-trained weights are loaded with `set_weights` once
    # the layer has been built by calling it on `inputs`.
    embedding_layer = Embedding(vocab_size, embedding_dim, trainable=False)
    embedding = embedding_layer(inputs)
    embedding_layer.set_weights([embedding_matrix])

    conv1 = Conv1D(128, 3, activation='relu')(embedding)
    pool1 = MaxPooling1D(pool_size=2)(conv1)

    conv2 = Conv1D(128, 4, activation='relu')(embedding)
    pool2 = MaxPooling1D(pool_size=2)(conv2)

    conv3 = Conv1D(128, 5, activation='relu')(embedding)
    pool3 = MaxPooling1D(pool_size=2)(conv3)

    # The branches differ only in their time dimension, so they are stacked
    # along axis 1 before flattening.
    concatenated = concatenate([pool1, pool2, pool3], axis=1)
    flatten = Flatten()(concatenated)
    dense1 = Dense(128, activation='relu')(flatten)
    dropout = Dropout(0.5)(dense1)
    outputs = Dense(1, activation='sigmoid')(dropout)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# One extra slot on top of the tokenizer vocabulary, matching the row count
# used when the embedding matrix was allocated.
vocab_size = 1 + len(word_index)


In [11]:
# Instantiate the compiled classifier from the prepared vocabulary settings
text_cnn_model = create_text_cnn_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length=max_length,
    embedding_matrix=embedding_matrix,
)


In [None]:
# Print the model summary to check the architecture before training
text_cnn_model.summary()

In [13]:
# Fit for 10 epochs with mini-batches of 32. The held-out split is passed as
# validation data, so its loss and accuracy are reported after every epoch.
history = text_cnn_model.fit(
    X_train_padded,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Score the trained model on the held-out split and report its accuracy
loss, accuracy = text_cnn_model.evaluate(x=X_test_padded, y=y_test)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.9327083826065063


In [15]:
# Get the sigmoid outputs for the held-out split and turn them into hard
# 0/1 labels using a 0.5 decision threshold
y_pred = text_cnn_model.predict(x=X_test_padded)
y_pred_classes = np.greater(y_pred, 0.5).astype("int32")



In [16]:
# Evaluate predictions
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the thresholded predictions
report = classification_report(y_true=y_test, y_pred=y_pred_classes)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      1504
           1       0.92      0.95      0.93      1483

    accuracy                           0.93      2987
   macro avg       0.93      0.93      0.93      2987
weighted avg       0.93      0.93      0.93      2987

