In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, SimpleRNN
from tensorflow.keras.layers import GlobalAveragePooling1D, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Load data
data = pd.read_csv('TP_DS.csv')  # Replace with your file path
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

# Split data
X = data['cleaned_text']
y = data['label_encoded']
X = data['cleaned_text'].astype(str)  # Convert to string to handle any float or NaN issues
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization and Padding
vocab_size = 10000
max_length = 100
embedding_dim = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

# Define a function to build models
def build_model(model_type="RNN"):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(SpatialDropout1D(0.2))

    if model_type == "RNN":
        model.add(SimpleRNN(64, return_sequences=False))
    elif model_type == "LSTM":
        model.add(LSTM(64, return_sequences=False))
    elif model_type == "BiLSTM":
        model.add(Bidirectional(LSTM(64, return_sequences=False)))
        
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    model.compile(optimizer=Adam(learning_rate=0.001), 
                  loss='binary_crossentropy', 
                  metrics=['accuracy', Precision(), Recall()])
    return model

# Training and Evaluation
def train_and_evaluate(model_type):
    model = build_model(model_type)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(X_train_pad, y_train, 
                        epochs=10, 
                        batch_size=64, 
                        validation_split=0.2,
                        callbacks=[early_stopping])
    
    # Evaluation
    y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")
    print(f"Classification Report for {model_type}:")
    print(classification_report(y_test, y_pred, target_names=['CG', 'OR']))
    
# Train and evaluate RNN, LSTM, and BiLSTM models
for model_type in ["RNN", "LSTM", "BiLSTM"]:
    train_and_evaluate(model_type)




Epoch 1/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 30ms/step - accuracy: 0.5128 - loss: 0.6944 - precision: 0.5138 - recall: 0.4306 - val_accuracy: 0.5247 - val_loss: 0.6964 - val_precision: 0.7150 - val_recall: 0.0881
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 27ms/step - accuracy: 0.5249 - loss: 0.6868 - precision: 0.5364 - recall: 0.4132 - val_accuracy: 0.5353 - val_loss: 0.6917 - val_precision: 0.6002 - val_recall: 0.2224
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.5179 - loss: 0.6890 - precision: 0.5217 - recall: 0.4416 - val_accuracy: 0.5268 - val_loss: 0.6898 - val_precision: 0.7313 - val_recall: 0.0905
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.5287 - loss: 0.6802 - precision: 0.5416 - recall: 0.4325 - val_accuracy: 0.5531 - val_loss: 0.6820 - val_precision: 0.5845 - val_recall: 0.3791
Epoch 5/10
[1m4



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 43ms/step - accuracy: 0.5312 - loss: 0.6839 - precision_1: 0.5584 - recall_1: 0.3165 - val_accuracy: 0.5747 - val_loss: 0.6572 - val_precision_1: 0.9444 - val_recall_1: 0.1623
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.6771 - loss: 0.5770 - precision_1: 0.7418 - recall_1: 0.5383 - val_accuracy: 0.8589 - val_loss: 0.3214 - val_precision_1: 0.8964 - val_recall_1: 0.8128
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.8918 - loss: 0.2580 - precision_1: 0.8939 - recall_1: 0.8891 - val_accuracy: 0.8813 - val_loss: 0.3156 - val_precision_1: 0.9063 - val_recall_1: 0.8516
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 48ms/step - accuracy: 0.9262 - loss: 0.1747 - precision_1: 0.9285 - recall_1: 0.9237 - val_accuracy: 0.9006 - val_loss: 0.2497 - val_precision_1: 0.8841 - val_recall_1: 0.



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 73ms/step - accuracy: 0.7694 - loss: 0.4364 - precision_2: 0.7748 - recall_2: 0.7666 - val_accuracy: 0.8907 - val_loss: 0.2491 - val_precision_2: 0.9370 - val_recall_2: 0.8386
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 67ms/step - accuracy: 0.9267 - loss: 0.1814 - precision_2: 0.9256 - recall_2: 0.9278 - val_accuracy: 0.9145 - val_loss: 0.2071 - val_precision_2: 0.8971 - val_recall_2: 0.9372
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 81ms/step - accuracy: 0.9470 - loss: 0.1306 - precision_2: 0.9464 - recall_2: 0.9462 - val_accuracy: 0.9080 - val_loss: 0.2136 - val_precision_2: 0.9131 - val_recall_2: 0.9027
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 80ms/step - accuracy: 0.9672 - loss: 0.0867 - precision_2: 0.9674 - recall_2: 0.9665 - val_accuracy: 0.9130 - val_loss: 0.2412 - val_precision_2: 0.9139 - val_recall_2: 0.

Transformers and PreTrained Models

In [10]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://download.pytorch.org/whl/cu124
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.20.1%2Bcu124-cp311-cp311-win_amd64.whl (6.1 MB)
     ---------------------------------------- 0.0/6.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/6.1 MB 991.0 kB/s eta 0:00:07
      --------------------------------------- 0.1/6.1 MB 1.8 MB/s eta 0:00:04
     - -------------------------------------- 0.3/6.1 MB 2.0 MB/s eta 0:00:03
     -- ------------------------------------- 0.5/6.1 MB 2.6 MB/s eta 0:00:03
     ---- ----------------------------------- 0.6/6.1 MB 2.9 MB/s eta 0:00:02
     ------- -------------------------------- 1.1/6.1 MB 4.1 MB/s eta 0:00:02
     --------- ------------------------------ 1.5/6.1 MB 4.9 MB/s eta 0:00:01
     ------------- -----------------

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print the device being used (CPU or GPU)
print(f"Using device: {device}")

# Load dataset and preprocess
data = pd.read_csv('TP_DS.csv')  # Replace with your file path
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

# Split dataset
X = data['cleaned_text'].astype(str)
y = data['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class to handle text and labels
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten().to(device),
            'attention_mask': encoding['attention_mask'].flatten().to(device),
            'labels': torch.tensor(label, dtype=torch.long).to(device)
        }

# Create dataset and data loaders
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
test_dataset = ReviewDataset(X_test, y_test, tokenizer)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Define Trainer for training BERT model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train model
trainer.train()

# Evaluate on test set
predictions = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1).numpy()

from sklearn.metrics import classification_report
print(classification_report(y_test, pred_labels, target_names=['CG', 'OR']))



Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/12132 [00:00<?, ?it/s]

{'loss': 0.5007, 'grad_norm': 21.1203670501709, 'learning_rate': 4.793933399274646e-05, 'epoch': 0.12}
{'loss': 0.4261, 'grad_norm': 190.98106384277344, 'learning_rate': 4.587866798549291e-05, 'epoch': 0.25}
{'loss': 0.3834, 'grad_norm': 19.80316162109375, 'learning_rate': 4.381800197823937e-05, 'epoch': 0.37}
{'loss': 0.3472, 'grad_norm': 3.3415114879608154, 'learning_rate': 4.175733597098582e-05, 'epoch': 0.49}
{'loss': 0.3522, 'grad_norm': 12.993058204650879, 'learning_rate': 3.969666996373228e-05, 'epoch': 0.62}
{'loss': 0.3714, 'grad_norm': 22.177139282226562, 'learning_rate': 3.763600395647874e-05, 'epoch': 0.74}
{'loss': 0.3499, 'grad_norm': 7.584499359130859, 'learning_rate': 3.557533794922519e-05, 'epoch': 0.87}
{'loss': 0.3363, 'grad_norm': 7.132500171661377, 'learning_rate': 3.3514671941971646e-05, 'epoch': 0.99}


  0%|          | 0/1011 [00:00<?, ?it/s]

{'eval_loss': 0.347614049911499, 'eval_runtime': 65.4323, 'eval_samples_per_second': 123.593, 'eval_steps_per_second': 15.451, 'epoch': 1.0}
{'loss': 0.3295, 'grad_norm': 23.84906578063965, 'learning_rate': 3.14540059347181e-05, 'epoch': 1.11}
{'loss': 0.3093, 'grad_norm': 6.431917190551758, 'learning_rate': 2.939333992746456e-05, 'epoch': 1.24}
{'loss': 0.3138, 'grad_norm': 53.90829086303711, 'learning_rate': 2.7332673920211017e-05, 'epoch': 1.36}
{'loss': 0.2794, 'grad_norm': 3.948133945465088, 'learning_rate': 2.5272007912957468e-05, 'epoch': 1.48}
{'loss': 0.2561, 'grad_norm': 0.33560460805892944, 'learning_rate': 2.3211341905703922e-05, 'epoch': 1.61}
{'loss': 0.2843, 'grad_norm': 107.04515075683594, 'learning_rate': 2.115067589845038e-05, 'epoch': 1.73}
{'loss': 0.2474, 'grad_norm': 1.2846009731292725, 'learning_rate': 1.9090009891196835e-05, 'epoch': 1.85}
{'loss': 0.2726, 'grad_norm': 10.7352876663208, 'learning_rate': 1.702934388394329e-05, 'epoch': 1.98}


  0%|          | 0/1011 [00:00<?, ?it/s]

{'eval_loss': 0.37920597195625305, 'eval_runtime': 65.4403, 'eval_samples_per_second': 123.578, 'eval_steps_per_second': 15.449, 'epoch': 2.0}
{'loss': 0.2136, 'grad_norm': 0.15450133383274078, 'learning_rate': 1.4968677876689746e-05, 'epoch': 2.1}
{'loss': 0.2118, 'grad_norm': 4.162031650543213, 'learning_rate': 1.29080118694362e-05, 'epoch': 2.23}
{'loss': 0.1979, 'grad_norm': 2.6745712757110596, 'learning_rate': 1.0847345862182658e-05, 'epoch': 2.35}
{'loss': 0.1966, 'grad_norm': 92.49822998046875, 'learning_rate': 8.786679854929115e-06, 'epoch': 2.47}
{'loss': 0.1877, 'grad_norm': 0.08781655132770538, 'learning_rate': 6.726013847675569e-06, 'epoch': 2.6}
{'loss': 0.1977, 'grad_norm': 4.686941623687744, 'learning_rate': 4.665347840422025e-06, 'epoch': 2.72}
{'loss': 0.1691, 'grad_norm': 0.3438433110713959, 'learning_rate': 2.6046818331684802e-06, 'epoch': 2.84}
{'loss': 0.1742, 'grad_norm': 0.259590744972229, 'learning_rate': 5.440158259149357e-07, 'epoch': 2.97}


  0%|          | 0/1011 [00:00<?, ?it/s]

{'eval_loss': 0.28281155228614807, 'eval_runtime': 66.3205, 'eval_samples_per_second': 121.938, 'eval_steps_per_second': 15.244, 'epoch': 3.0}
{'train_runtime': 3283.9233, 'train_samples_per_second': 29.548, 'train_steps_per_second': 3.694, 'train_loss': 0.2865584568736815, 'epoch': 3.0}


  0%|          | 0/1011 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          CG       0.90      0.95      0.93      4016
          OR       0.95      0.90      0.92      4071

    accuracy                           0.92      8087
   macro avg       0.93      0.92      0.92      8087
weighted avg       0.93      0.92      0.92      8087



In [4]:
import os
import json

# Define a directory to save the model and tokenizer
model_dir = 'BERT./saved_model'
os.makedirs(model_dir, exist_ok=True)

# Save the trained BERT model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

# Save training arguments as a JSON file
with open(os.path.join(model_dir, 'training_args.json'), 'w') as f:
    json.dump(training_args.to_dict(), f)

print(f"Model, tokenizer, and training arguments saved in {model_dir}")


Model, tokenizer, and training arguments saved in BERT./saved_model


In [1]:
import torch
print("Is CUDA available:", torch.cuda.is_available())
print("Number of GPUs available:", torch.cuda.device_count())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


Is CUDA available: True
Number of GPUs available: 1
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [None]:
'''This saved setup can be used directly for deployment, where you can reload the model and tokenizer using the from_pretrained method as follows:'''
from transformers import BertTokenizer, BertForSequenceClassification

# Load the model and tokenizer for deployment
loaded_model = BertForSequenceClassification.from_pretrained(model_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(model_dir)
