<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/bert_base_uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [2]:
!pip install accelerate -U




In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import torch
import pickle

# Download stopwords
nltk.download('stopwords')

# Load datasets
dataset1 = pd.read_csv('/content/LLM_generated_essay_PaLM.csv')
dataset2 = pd.read_csv('/content/reddit_filtered_dataset.csv')
dataset3 = pd.read_csv('/content/train_essays.csv')

# Clean and preprocess datasets
def preprocess_dataset(dataset, text_col, label_col):
    dataset_cleaned = dataset.dropna()
    dataset_cleaned = dataset_cleaned[[text_col, label_col]]
    dataset_cleaned.rename(columns={text_col: 'text', label_col: 'labels'}, inplace=True)
    dataset_cleaned['labels'] = dataset_cleaned['labels'].astype(int)
    return dataset_cleaned

dataset1_cleaned = preprocess_dataset(dataset1, 'text', 'generated')
dataset2_cleaned = preprocess_dataset(dataset2, 'Data', 'Labels')
dataset3_cleaned = preprocess_dataset(dataset3, 'text', 'generated')

# Combine datasets
combined_dataset = pd.concat([dataset1_cleaned, dataset2_cleaned, dataset3_cleaned]).reset_index(drop=True)

# Save the combined dataset
combined_dataset.to_csv('/content/combined_cleaned_dataset.csv', index=False)

# Balance the dataset
def balance_dataset(dataset):
    df_majority = dataset[dataset.labels == 0]
    df_minority = dataset[dataset.labels == 1]
    df_majority_downsampled = df_majority.sample(len(df_minority), random_state=42)
    df_balanced = pd.concat([df_majority_downsampled, df_minority]).sample(frac=1, random_state=42).reset_index(drop=True)
    return df_balanced

df_balanced = balance_dataset(combined_dataset)

# Save the balanced dataset
df_balanced.to_csv('/content/balanced_dataset.csv', index=False)

# Preprocess text data
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

df_balanced['text'] = df_balanced['text'].apply(clean_text)

# Split the dataset into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df_balanced['text'], df_balanced['labels'], test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Save the split datasets
train_df = pd.DataFrame({'text': train_texts, 'labels': train_labels})
val_df = pd.DataFrame({'text': val_texts, 'labels': val_labels})
test_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})

train_df.to_csv('/content/train_dataset.csv', index=False)
val_df.to_csv('/content/val_dataset.csv', index=False)
test_df.to_csv('/content/test_dataset.csv', index=False)

# Tokenize the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(text_list, tokenizer, max_length=128):
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_data(train_texts.tolist(), tokenizer)
val_encodings = tokenize_data(val_texts.tolist(), tokenizer)
test_encodings = tokenize_data(test_texts.tolist(), tokenizer)

# Create torch datasets
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = self.add_labels(encodings, labels)

    def add_labels(self, encodings, labels):
        encodings['labels'] = labels
        return encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = TextDataset(train_encodings, train_labels.tolist())
val_dataset = TextDataset(val_encodings, val_labels.tolist())
test_dataset = TextDataset(test_encodings, test_labels.tolist())

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',  # Save the model at the end of each epoch
)

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation results:", val_results)

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test results:", test_results)

# Predicting labels for the test set
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Calculate accuracy
test_accuracy = accuracy_score(test_labels, predicted_labels)
print("Test Accuracy:", test_accuracy)

# Classification report
print(classification_report(test_labels, predicted_labels, target_names=['Human', 'Bot']))

# Print actual vs predicted labels
label_map = {0: "Human", 1: "Bot"}
actual_labels = [label_map[label] for label in test_labels]
predicted_labels_mapped = [label_map[label] for label in predicted_labels]
print("Actual labels:", actual_labels)
print("Predicted labels:", predicted_labels_mapped)

# Save the model and tokenizer as pickle files
with open('/content/bert_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('/content/bert_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("Model and tokenizer saved as pickle files.")

# To load the model and tokenizer from pickle files
with open('/content/bert_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('/content/bert_tokenizer.pkl', 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

# Use the loaded model for prediction
loaded_model.eval()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0809,0.180625
2,0.0653,0.077875
3,0.1531,0.097388


Validation results: {'eval_loss': 0.09738809615373611, 'eval_runtime': 3.2752, 'eval_samples_per_second': 134.646, 'eval_steps_per_second': 17.098, 'epoch': 3.0}
Test results: {'eval_loss': 0.16856439411640167, 'eval_runtime': 8.2548, 'eval_samples_per_second': 133.498, 'eval_steps_per_second': 16.717, 'epoch': 3.0}
Test Accuracy: 0.9709618874773139
              precision    recall  f1-score   support

       Human       0.99      0.95      0.97       565
         Bot       0.95      0.99      0.97       537

    accuracy                           0.97      1102
   macro avg       0.97      0.97      0.97      1102
weighted avg       0.97      0.97      0.97      1102

Actual labels: ['Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Human', 'Human', 'Bot', 'Bot', 'Human', 'Bot', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Human', '

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [2]:

# Save the model and tokenizer pickle files to Google Drive
pickle_model_path = '/content/drive/My Drive/bert_model.pkl'
pickle_tokenizer_path = '/content/drive/My Drive/bert_tokenizer.pkl'

with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(pickle_tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print(f"Model and tokenizer saved to Google Drive at {pickle_model_path} and {pickle_tokenizer_path}")

Model and tokenizer saved to Google Drive at /content/drive/My Drive/bert_model.pkl and /content/drive/My Drive/bert_tokenizer.pkl
