<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextCNN_with_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Important sources:
https://chatgpt.com/share/21220a6d-4cca-48bc-b461-bca6b0e140bc

In [None]:
pip install gensim

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the labelled posts from the Excel workbook (path is relative to the
# current working directory). Only the '13,205 ARPs' sheet is read.
file_path = 'posts.xlsx'
arp_data = pd.read_excel(file_path, sheet_name='13,205 ARPs')

In [8]:
# Parameters
# Fixed sequence length, in tokens. It is passed as `maxlen` when the index
# sequences are padded and as `input_length` when the model is created.
max_len = 100  # Maximum length of all sequences

In [3]:
# Pull the labels and the raw post bodies out of the loaded sheet.
# Bodies are coerced to str so that every entry can be tokenized later on.
labels = arp_data['Label'].to_list()
texts = arp_data['Question_body'].astype(str).to_list()

# Tokenize the text and train Word2Vec model

In [4]:
# Whitespace-tokenize every post, then fit a Word2Vec model on that corpus.
tokenized_texts = list(map(str.split, texts))
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=1,
    workers=4,
)


# Create an embedding matrix

In [5]:
# Build the word -> row-index lookup and the matching embedding table.
# Row 0 is left as all zeros; real words are numbered from 1 upwards.
keyed_vectors = word2vec_model.wv
embedding_dim = word2vec_model.vector_size
vocab_size = len(keyed_vectors.key_to_index) + 1

word_index = {}
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for position, token in enumerate(keyed_vectors.index_to_key, start=1):
    word_index[token] = position
    embedding_matrix[position] = keyed_vectors[token]

# Convert text to sequences of word indexes

In [6]:
# Map every token of every post to its integer index.
# Tokens missing from `word_index` fall back to 0.
sequences = [
    [word_index.get(token, 0) for token in tokens]
    for tokens in tokenized_texts
]


# Pad the sequences to ensure uniform input size

In [9]:
# Bring every sequence to exactly `max_len` entries.
# 'pre' is the Keras default for both options, spelled out here: short posts
# are left-padded with 0 and long posts keep only their last `max_len` tokens.
data = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)

# One-hot encode the labels

In [10]:
# One-hot encode the binary labels into an (n_samples, 2) array.
# The encoding is computed from the raw 'Label' column instead of from `labels`
# itself, so re-running this cell never encodes an already-encoded array.
labels = tf.keras.utils.to_categorical(arp_data['Label'].tolist(), num_classes=2)

# Split the data into training and testing sets

In [11]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# shuffled split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)


# Define the neural network model

In [12]:
# Define the neural network model
def create_model(vocab_size, embedding_dim, input_length, embedding_matrix):
    """Build and compile a TextCNN classifier on frozen pre-trained embeddings.

    Args:
        vocab_size: Number of rows in the embedding table (`input_dim` of the
            Embedding layer).
        embedding_dim: Width of each embedding vector.
        input_length: Number of token indexes in every input sequence.
        embedding_matrix: Array of shape (vocab_size, embedding_dim) used as
            the initial, non-trainable embedding weights.

    Returns:
        A compiled Sequential model ending in a 2-unit softmax layer, using
        the Adam optimizer and categorical cross-entropy loss.
    """
    model = tf.keras.models.Sequential([
        # Declare the input shape explicitly rather than passing `input_length`
        # to Embedding: that argument is deprecated in newer Keras releases.
        tf.keras.Input(shape=(input_length,), dtype='int32'),
        tf.keras.layers.Embedding(input_dim=vocab_size,
                                  output_dim=embedding_dim,
                                  weights=[embedding_matrix],
                                  trainable=False),
        tf.keras.layers.Conv1D(128, 5, activation='relu'),
        # Keep the strongest response of each filter over the whole sequence.
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the model

In [13]:
# Instantiate the TextCNN with the Word2Vec-derived vocabulary and weights.
model = create_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    input_length=max_len,
    embedding_matrix=embedding_matrix,
)




# Train the model

In [14]:
# Fit on the training split; 20% of it is set aside for per-epoch validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.8145 - loss: 0.5116 - val_accuracy: 0.8871 - val_loss: 0.2875
Epoch 2/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9002 - loss: 0.2429 - val_accuracy: 0.8862 - val_loss: 0.2658
Epoch 3/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9217 - loss: 0.1936 - val_accuracy: 0.8895 - val_loss: 0.2781
Epoch 4/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9369 - loss: 0.1569 - val_accuracy: 0.8934 - val_loss: 0.3280
Epoch 5/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9484 - loss: 0.1289 - val_accuracy: 0.8871 - val_loss: 0.3190
Epoch 6/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9539 - loss: 0.1150 - val_accuracy: 0.8642 - val_loss: 0.3201
Epoch 7/10
[1m262/262[0m 

# Evaluate the model

In [15]:
# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by the compiled 'accuracy' metric.
loss, accuracy = model.evaluate(x=x_test, y=y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9016 - loss: 0.4806
Test Accuracy: 0.8982


# Make predictions

In [32]:
# Predict class probabilities for the test split, then collapse both the
# predictions and the one-hot ground truth to integer class ids.
y_pred = model.predict(x_test)
y_true_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)


[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [30]:
# Per-class precision, recall and F1-score as a formatted text report.
report = classification_report(
    y_true=y_true_classes,
    y_pred=y_pred_classes,
    target_names=['Class 0', 'Class 1'],
)
print(report)

              precision    recall  f1-score   support

     Class 0       0.95      0.84      0.89      2265
     Class 1       0.86      0.95      0.90      2215

    accuracy                           0.90      4480
   macro avg       0.90      0.90      0.90      4480
weighted avg       0.90      0.90      0.90      4480



In [35]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Compute the inputs for the metrics below inside this cell, so it runs on a
# fresh kernel. Passing labels=[0, 1] guarantees a 2x2 confusion matrix and
# two-element metric arrays even if one class is absent.
class_ids = [0, 1]
conf_matrix = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_true_classes, y_pred_classes, labels=class_ids, zero_division=0
)

# Calculate class-wise accuracy
# (correct predictions for a class divided by the true samples of that class,
# i.e. the diagonal entry over its row total; 0 when the class has no samples)
accuracy_class_0 = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1]) if (conf_matrix[0, 0] + conf_matrix[0, 1]) > 0 else 0
accuracy_class_1 = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0]) if (conf_matrix[1, 1] + conf_matrix[1, 0]) > 0 else 0

# Compute overall accuracy (guarded against an empty evaluation set)
total_samples = conf_matrix.sum()
accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / total_samples if total_samples > 0 else 0

# Print metrics
print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, F1-score: {f1_class[0]}, Support: {support_class[0]}, Accuracy: {accuracy_class_0}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, F1-score: {f1_class[1]}, Support: {support_class[1]}, Accuracy: {accuracy_class_1}')
print(f'Overall Accuracy: {accuracy}')

Class 0 - Precision: 0.9493293591654247, Recall: 0.8437086092715231, F1-score: 0.8934081346423561, Support: 2265, Accuracy: 0.8437086092715231
Class 1 - Precision: 0.8565058775841102, Recall: 0.9539503386004514, F1-score: 0.9026057240495514, Support: 2215, Accuracy: 0.9539503386004514
Overall Accuracy: 0.8982142857142857


In [None]:

# Save the model
# Writes the trained model to 'word2vec_textcnn_model.h5' in the current
# working directory.
# NOTE(review): the `.h5` suffix presumably selects the legacy HDF5 format,
# which looks like the source of the save warning in this cell's output.
# Consider the native `.keras` format, but confirm first that whatever loads
# this file can read it.
model.save('word2vec_textcnn_model.h5')

  saving_api.save_model(
