<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextCNN_with_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/share/727dee06-b960-4baa-b017-d4ac62e8d326

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, concatenate, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import numpy as np

# Load the dataset

In [2]:
# Load the dataset
file_path = 'posts.xlsx'  # Replace with the actual file path
data = pd.read_excel(file_path)

# Preprocess text data

In [3]:
# Preprocess text data
data['Question_body'] = data['Question_body'].str.replace('\n', ' ').str.replace('<.*?>', '', regex=True)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data['Question_body'], data['Label'], test_size=0.3, random_state=42)


# Compute TF-IDF features

In [4]:
# Compute TF-IDF features
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Reshape TF-IDF features to fit CNN input shape
X_train_cnn = X_train_tfidf.toarray().reshape((X_train_tfidf.shape[0], X_train_tfidf.shape[1], 1))
X_test_cnn = X_test_tfidf.toarray().reshape((X_test_tfidf.shape[0], X_test_tfidf.shape[1], 1))

# Define the TextCNN model

In [5]:
# Define the TextCNN model
def create_text_cnn_model(input_shape):
    inputs = Input(shape=input_shape)

    conv1 = Conv1D(128, 3, activation='relu')(inputs)
    pool1 = MaxPooling1D(pool_size=2)(conv1)

    conv2 = Conv1D(128, 4, activation='relu')(inputs)
    pool2 = MaxPooling1D(pool_size=2)(conv2)

    conv3 = Conv1D(128, 5, activation='relu')(inputs)
    pool3 = MaxPooling1D(pool_size=2)(conv3)

    concatenated = concatenate([pool1, pool2, pool3], axis=1)
    flatten = Flatten()(concatenated)
    dense1 = Dense(128, activation='relu')(flatten)
    dropout = Dropout(0.5)(dense1)
    outputs = Dense(1, activation='sigmoid')(dropout)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    return model

# Create the model

In [6]:
# Create the model
input_shape = (X_train_cnn.shape[1], 1)
model = create_text_cnn_model(input_shape)


# Train the model

In [8]:
# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_cnn, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 159ms/step - accuracy: 0.9385 - loss: 0.1563 - val_accuracy: 0.9024 - val_loss: 0.2622
Epoch 2/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 149ms/step - accuracy: 0.9639 - loss: 0.0967 - val_accuracy: 0.8967 - val_loss: 0.3352
Epoch 3/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 149ms/step - accuracy: 0.9875 - loss: 0.0351 - val_accuracy: 0.8996 - val_loss: 0.3546
Epoch 4/10
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 148ms/step - accuracy: 0.9946 - loss: 0.0179 - val_accuracy: 0.8962 - val_loss: 0.4199


In [9]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_cnn, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9060 - loss: 0.2588
Test Accuracy: 0.9045


In [10]:
# Predict on the test set
y_pred_probs = model.predict(X_test_cnn)
y_pred = (y_pred_probs > 0.5).astype(int)

# Compute and display metrics
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Calculate overall accuracy
accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / conf_matrix.sum()

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
Class 0 - Precision: 0.9169314571039492, Recall: 0.891832229580574, Accuracy: 0.9044642857142857, F1-score: 0.9042076991942704, Support: 2265
Class 1 - Precision: 0.8924022837066315, Recall: 0.9173814898419864, Accuracy: 0.9044642857142857, F1-score: 0.9047195013357079, Support: 2215
