In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report
from textpreprocessor import TextPreprocessor  # Assuming the TextPreprocessor class is defined

# Initialize the Text Pre Processor class
processor = TextPreprocessor()

# Load data
df_train, df_test = processor.load_data()

# Preprocess data
df_train = processor.preprocess(df_train)
df_test = processor.preprocess(df_test)

# Split data into X and y
X_train, y_train = processor.split_data(df_train)
X_test, y_test = processor.split_data(df_test)

# Tokenization and Padding
X_train_pad, X_test_pad = processor.tokenization_and_padding(X_train, X_test)

(3600000, 3)
(10000, 3)
(400000, 3)
(10000, 3)


In [2]:
# CNN Model Construction
embedding_dim = 128  # Embedding layer dimension

model = Sequential()
model.add(Embedding(input_dim=processor.max_features, output_dim=embedding_dim, input_length=processor.max_length))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification (0 or 1)



In [3]:
# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

I0000 00:00:1726778062.642552   69552 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726778062.667746   69552 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726778062.667813   69552 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726778062.672444   69552 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726778062.672501   69552 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [4]:
# Train the model
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/5


I0000 00:00:1726778063.544998   69779 service.cc:146] XLA service 0x7fd43000ad40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726778063.545029   69779 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2024-09-19 22:34:23.559137: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-09-19 22:34:23.639739: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m109/157[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.5746 - loss: 0.6621

I0000 00:00:1726778064.314545   69779 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6074 - loss: 0.6344 - val_accuracy: 0.8231 - val_loss: 0.3934
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8931 - loss: 0.2680 - val_accuracy: 0.8498 - val_loss: 0.3440
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9765 - loss: 0.0952 - val_accuracy: 0.8485 - val_loss: 0.4016
Epoch 4/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9983 - loss: 0.0208 - val_accuracy: 0.8456 - val_loss: 0.4852
Epoch 5/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0079 - val_accuracy: 0.8483 - val_loss: 0.5488


<keras.src.callbacks.history.History at 0x7fd63b73b430>

In [5]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.8531 - loss: 0.5366
Test Accuracy: 84.83%


In [10]:
# Evaluate the model
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype("int32")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [11]:
# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 84.83%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      4972
           1       0.86      0.84      0.85      5028

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

