<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextCNN_ARPs_Classification_With_PredictionMetrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tensorflow

In [5]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# Load the labelled posts from the Excel workbook (path is relative to the
# notebook's working directory).
file_path = 'ARPs_and_ProgrammingPosts.xlsx'
arp_data = pd.read_excel(file_path, sheet_name='13,205 ARPs')

# Fail early if the sheet lacks the columns the later cells read.
assert {'Question_body', 'Label'} <= set(arp_data.columns), 'missing expected columns'

In [14]:
# Text-preprocessing hyperparameters
max_words = 10_000  # vocabulary cap: only the most frequent words are kept
max_len = 100       # length every sequence is brought to before modelling

In [15]:
# Pull the model inputs (question bodies, coerced to str) and the targets
# out of the DataFrame as plain Python lists.
texts, labels = (
    arp_data['Question_body'].astype(str).to_list(),
    arp_data['Label'].to_list(),
)

In [16]:
# Build a word index over the corpus (capped at `max_words`), then encode
# each text as a list of integer word indices.
# NOTE(review): the tokenizer is fitted on ALL texts, i.e. before the
# train/test split, so the vocabulary also reflects test posts -- confirm
# this is acceptable for the reported metrics.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=texts)
sequences = tokenizer.texts_to_sequences(texts=texts)

In [17]:
# Bring every encoded text to exactly `max_len` tokens so the model receives
# a rectangular array. No `padding`/`truncating` arguments are passed, so
# the library defaults decide which end is padded or cut.
data = pad_sequences(sequences=sequences, maxlen=max_len)

In [18]:
# One-hot encode the class labels into a two-column array for the softmax
# output / categorical cross-entropy loss.
# The labels are read from the DataFrame column instead of re-encoding the
# `labels` variable in place, so re-running this cell cannot one-hot encode
# an array that is already one-hot encoded.
# NOTE(review): assumes the 'Label' column holds integer class ids 0 and 1 -- confirm.
labels = tf.keras.utils.to_categorical(arp_data['Label'].to_numpy(), num_classes=2)

In [19]:
# Hold out 20% of the samples for testing; the fixed `random_state` makes
# the partition repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)


In [20]:
# Define the TextCNN model
def create_textcnn_model(input_length, vocab_size, embedding_dim=128,
                         num_filters=128, kernel_size=5, hidden_units=128,
                         num_classes=2):
    """Build and compile a 1D-convolutional text classifier.

    Architecture: Embedding -> Conv1D (ReLU) -> GlobalMaxPooling1D ->
    Dense (ReLU) -> Dense (softmax).

    Args:
        input_length: Number of token ids in each input sequence.
        vocab_size: Size of the embedding vocabulary (number of rows in the
            embedding matrix).
        embedding_dim: Width of each word embedding vector.
        num_filters: Number of convolution filters.
        kernel_size: Number of consecutive tokens each filter spans.
        hidden_units: Width of the hidden dense layer.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled ``tf.keras`` Sequential model using the Adam optimizer,
        categorical cross-entropy loss and an accuracy metric. Targets are
        therefore expected to be one-hot encoded.

    Raises:
        ValueError: If ``kernel_size`` exceeds ``input_length``, in which
            case the convolution would have no valid output position.
    """
    if kernel_size > input_length:
        raise ValueError(
            f'kernel_size ({kernel_size}) must not exceed input_length ({input_length})'
        )

    model = tf.keras.models.Sequential([
        # Declare the input shape explicitly instead of passing the
        # deprecated `input_length` argument to the Embedding layer.
        tf.keras.Input(shape=(input_length,)),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.Conv1D(num_filters, kernel_size, activation='relu'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [21]:
# Instantiate the classifier for the sequence length and vocabulary size
# used during preprocessing.
model = create_textcnn_model(input_length=max_len, vocab_size=max_words)

In [22]:
# Fit on the training partition; 20% of it is set aside by Keras as a
# validation set, and the per-epoch metrics are kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Score the trained model on the held-out test partition and report its accuracy.
loss, accuracy = model.evaluate(x=x_test, y=y_test)
print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.9394


In [26]:
# Collapse the one-hot targets and the predicted class probabilities to
# integer class ids (index of the largest entry in each row).
y_true_classes = y_test.argmax(axis=1)
y_pred = model.predict(x_test)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision, recall and F1-score on the test partition.
report = classification_report(
    y_true=y_true_classes,
    y_pred=y_pred_classes,
    target_names=['Class 0', 'Class 1'],
)
print(report)


              precision    recall  f1-score   support

     Class 0       0.95      0.92      0.94      1504
     Class 1       0.93      0.95      0.94      1483

    accuracy                           0.94      2987
   macro avg       0.94      0.94      0.94      2987
weighted avg       0.94      0.94      0.94      2987



In [27]:
# Persist the trained model to disk in the working directory.
# NOTE(review): the cell output shows a message coming from
# `saving_api.save_model` -- presumably the legacy-HDF5 notice; consider the
# native `.keras` format if nothing downstream requires the `.h5` file.
model.save(filepath='textcnn_model.h5')

  saving_api.save_model(
