<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/bertforsequenceclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch]
!pip install accelerate -U


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [1]:

import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import torch
import pickle

# Download stopwords
nltk.download('stopwords')

# Load datasets
dataset1 = pd.read_csv('/content/LLM_generated_essay_PaLM.csv')
dataset2 = pd.read_csv('/content/reddit_filtered_dataset.csv')
dataset3 = pd.read_csv('/content/train_essays.csv')

# Clean and preprocess datasets
def preprocess_dataset(dataset, text_col, label_col):
    dataset_cleaned = dataset.dropna()
    dataset_cleaned = dataset_cleaned[[text_col, label_col]]
    dataset_cleaned.rename(columns={text_col: 'text', label_col: 'labels'}, inplace=True)
    dataset_cleaned['labels'] = dataset_cleaned['labels'].astype(int)
    return dataset_cleaned

dataset1_cleaned = preprocess_dataset(dataset1, 'text', 'generated')
dataset2_cleaned = preprocess_dataset(dataset2, 'Data', 'Labels')
dataset3_cleaned = preprocess_dataset(dataset3, 'text', 'generated')

# Combine datasets
combined_dataset = pd.concat([dataset1_cleaned, dataset2_cleaned, dataset3_cleaned]).reset_index(drop=True)

# Save the combined dataset
combined_dataset.to_csv('/content/combined_cleaned_dataset.csv', index=False)

# Balance the dataset
def balance_dataset(dataset):
    df_majority = dataset[dataset.labels == 0]
    df_minority = dataset[dataset.labels == 1]
    df_majority_downsampled = df_majority.sample(len(df_minority), random_state=42)
    df_balanced = pd.concat([df_majority_downsampled, df_minority]).sample(frac=1, random_state=42).reset_index(drop=True)
    return df_balanced

df_balanced = balance_dataset(combined_dataset)

# Save the balanced dataset
df_balanced.to_csv('/content/balanced_dataset.csv', index=False)

# Preprocess text data
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

df_balanced['text'] = df_balanced['text'].apply(clean_text)

# Split the dataset into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df_balanced['text'], df_balanced['labels'], test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Save the split datasets
train_df = pd.DataFrame({'text': train_texts, 'labels': train_labels})
val_df = pd.DataFrame({'text': val_texts, 'labels': val_labels})
test_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})

train_df.to_csv('/content/train_dataset.csv', index=False)
val_df.to_csv('/content/val_dataset.csv', index=False)
test_df.to_csv('/content/test_dataset.csv', index=False)

# Tokenize the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(text_list, tokenizer, max_length=128):
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_data(train_texts.tolist(), tokenizer)
val_encodings = tokenize_data(val_texts.tolist(), tokenizer)
test_encodings = tokenize_data(test_texts.tolist(), tokenizer)

# Create torch datasets
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = self.add_labels(encodings, labels)

    def add_labels(self, encodings, labels):
        encodings['labels'] = labels
        return encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = TextDataset(train_encodings, train_labels.tolist())
val_dataset = TextDataset(val_encodings, val_labels.tolist())
test_dataset = TextDataset(test_encodings, test_labels.tolist())

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',  # Save the model at the end of each epoch
)

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
train_results = trainer.train()

# Print training metrics
print("Training metrics:")
for key, value in train_results.metrics.items():
    print(f"{key}: {value}")

# Evaluate the model on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("\nValidation results:")
for key, value in val_results.items():
    print(f"{key}: {value}")

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("\nTest results:")
for key, value in test_results.items():
    print(f"{key}: {value}")

# Predicting labels for the test set
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Calculate accuracy
test_accuracy = accuracy_score(test_labels, predicted_labels)
print("\nTest Accuracy:", test_accuracy)

# Classification report
print(classification_report(test_labels, predicted_labels, target_names=['Human', 'Bot']))

# Print actual vs predicted labels
label_map = {0: "Human", 1: "Bot"}
actual_labels = [label_map[label] for label in test_labels]
predicted_labels_mapped = [label_map[label] for label in predicted_labels]
print("\nActual labels:", actual_labels)
print("Predicted labels:", predicted_labels_mapped)

# Save the model and tokenizer as pickle files
with open('/content/bert_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('/content/bert_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("\nModel and tokenizer saved as pickle files.")

# Save the model and tokenizer pickle files to Google Drive
pickle_model_path = 'bert_model.pkl'
pickle_tokenizer_path = 'bert_tokenizer.pkl'

with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(pickle_tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print(f"Model and tokenizer saved  at {pickle_model_path} and {pickle_tokenizer_path}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2699,0.241434
2,0.0006,0.133271
3,0.0001,0.142549
4,0.0,0.109777
5,0.0,0.206536
6,0.0001,0.138209
7,0.0,0.143825
8,0.0,0.148287
9,0.0,0.150439
10,0.0,0.150733


Training metrics:
train_runtime: 1262.9212
train_samples_per_second: 31.411
train_steps_per_second: 3.927
total_flos: 2609403891532800.0
train_loss: 0.04895027032133429
epoch: 10.0



Validation results:
eval_loss: 0.15073302388191223
eval_runtime: 3.3289
eval_samples_per_second: 132.478
eval_steps_per_second: 16.823
epoch: 10.0

Test results:
eval_loss: 0.20370440185070038
eval_runtime: 8.3678
eval_samples_per_second: 131.695
eval_steps_per_second: 16.492
epoch: 10.0

Test Accuracy: 0.9764065335753176
              precision    recall  f1-score   support

       Human       0.99      0.96      0.98       565
         Bot       0.96      0.99      0.98       537

    accuracy                           0.98      1102
   macro avg       0.98      0.98      0.98      1102
weighted avg       0.98      0.98      0.98      1102


Actual labels: ['Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Human', 'Human', 'Bot', 'Bot', 'Human', 'Bot', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Human', 'Bot', 'Bot', 'Bot', 'Human

In [3]:
!pip install transformers[torch]
!pip install accelerate -U

import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from sklearn.metrics import accuracy_score, classification_report
import torch
import pickle

# Download stopwords
nltk.download('stopwords')

# Load datasets
dataset1 = pd.read_csv('/content/LLM_generated_essay_PaLM.csv')
dataset2 = pd.read_csv('/content/reddit_filtered_dataset.csv')
dataset3 = pd.read_csv('/content/train_essays.csv')

# Clean and preprocess datasets
def preprocess_dataset(dataset, text_col, label_col):
    dataset_cleaned = dataset.dropna()
    dataset_cleaned = dataset_cleaned[[text_col, label_col]]
    dataset_cleaned.rename(columns={text_col: 'text', label_col: 'labels'}, inplace=True)
    dataset_cleaned['labels'] = dataset_cleaned['labels'].astype(int)
    return dataset_cleaned

dataset1_cleaned = preprocess_dataset(dataset1, 'text', 'generated')
dataset2_cleaned = preprocess_dataset(dataset2, 'Data', 'Labels')
dataset3_cleaned = preprocess_dataset(dataset3, 'text', 'generated')

# Combine datasets
combined_dataset = pd.concat([dataset1_cleaned, dataset2_cleaned, dataset3_cleaned]).reset_index(drop=True)

# Save the combined dataset
combined_dataset.to_csv('/content/combined_cleaned_dataset.csv', index=False)

# Balance the dataset
def balance_dataset(dataset):
    df_majority = dataset[dataset.labels == 0]
    df_minority = dataset[dataset.labels == 1]
    df_majority_downsampled = df_majority.sample(len(df_minority), random_state=42)
    df_balanced = pd.concat([df_majority_downsampled, df_minority]).sample(frac=1, random_state=42).reset_index(drop=True)
    return df_balanced

df_balanced = balance_dataset(combined_dataset)

# Save the balanced dataset
df_balanced.to_csv('/content/balanced_dataset.csv', index=False)

# Preprocess text data
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

df_balanced['text'] = df_balanced['text'].apply(clean_text)

# Split the dataset into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df_balanced['text'], df_balanced['labels'], test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Save the split datasets
train_df = pd.DataFrame({'text': train_texts, 'labels': train_labels})
val_df = pd.DataFrame({'text': val_texts, 'labels': val_labels})
test_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})

train_df.to_csv('/content/train_dataset.csv', index=False)
val_df.to_csv('/content/val_dataset.csv', index=False)
test_df.to_csv('/content/test_dataset.csv', index=False)

# Tokenize the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(text_list, tokenizer, max_length=128):
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_data(train_texts.tolist(), tokenizer)
val_encodings = tokenize_data(val_texts.tolist(), tokenizer)
test_encodings = tokenize_data(test_texts.tolist(), tokenizer)

# Create torch datasets
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = self.add_labels(encodings, labels)

    def add_labels(self, encodings, labels):
        encodings['labels'] = labels
        return encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = TextDataset(train_encodings, train_labels.tolist())
val_dataset = TextDataset(val_encodings, val_labels.tolist())
test_dataset = TextDataset(test_encodings, test_labels.tolist())

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',  # Save the model at the end of each epoch
)

# Custom callback to compute training accuracy
class ComputeMetricsCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        val_results = kwargs['metrics']
        print(f"\nEpoch {state.epoch} Validation Accuracy: {val_results['eval_accuracy']}")
        print(f"Epoch {state.epoch} Validation Loss: {val_results['eval_loss']}")

    def on_log(self, args, state, control, **kwargs):
        logs = kwargs['logs']
        if 'loss' in logs:
            print(f"Epoch {state.epoch} Training Loss: {logs['loss']}")

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds)
    }

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[ComputeMetricsCallback]
)

# Train the model
train_results = trainer.train()

# Print training metrics
print("Training metrics:")
for key, value in train_results.metrics.items():
    print(f"{key}: {value}")

# Evaluate the model on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("\nValidation results:")
for key, value in val_results.items():
    print(f"{key}: {value}")

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("\nTest results:")
for key, value in test_results.items():
    print(f"{key}: {value}")

# Predicting labels for the test set
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Calculate accuracy
test_accuracy = accuracy_score(test_labels, predicted_labels)
print("\nTest Accuracy:", test_accuracy)

# Classification report
print(classification_report(test_labels, predicted_labels, target_names=['Human', 'Bot']))

# Print actual vs predicted labels
label_map = {0: "Human", 1: "Bot"}
actual_labels = [label_map[label] for label in test_labels]
predicted_labels_mapped = [label_map[label] for label in predicted_labels]
print("\nActual labels:", actual_labels)
print("Predicted labels:", predicted_labels_mapped)

# Save the model and tokenizer as pickle files
with open('/content/bert_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('/content/bert_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("\nModel and tokenizer saved as pickle files.")

# Save the model and tokenizer pickle files to Google Drive
pickle_model_path = 'bert_model.pkl'
pickle_tokenizer_path = 'bert_tokenizer.pkl'

with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(pickle_tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print(f"Model and tokenizer saved to Google Drive at {pickle_model_path} and {pickle_tokenizer_path}")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0814,0.555061,0.891156
2,0.008,0.099912,0.981859
3,0.0004,0.0977,0.984127
4,0.0002,0.219273,0.963719
5,0.0,0.121187,0.984127
6,0.0,0.172427,0.975057
7,0.0,0.140259,0.981859
8,0.0,0.144021,0.981859
9,0.0,0.145384,0.984127
10,0.0,0.145926,0.984127


Epoch 0.020161290322580645 Training Loss: 0.6892
Epoch 0.04032258064516129 Training Loss: 0.6905
Epoch 0.06048387096774194 Training Loss: 0.675
Epoch 0.08064516129032258 Training Loss: 0.6649
Epoch 0.10080645161290322 Training Loss: 0.6563
Epoch 0.12096774193548387 Training Loss: 0.5989
Epoch 0.14112903225806453 Training Loss: 0.6274
Epoch 0.16129032258064516 Training Loss: 0.5418
Epoch 0.1814516129032258 Training Loss: 0.476
Epoch 0.20161290322580644 Training Loss: 0.4273
Epoch 0.2217741935483871 Training Loss: 0.3753
Epoch 0.24193548387096775 Training Loss: 0.3603
Epoch 0.2620967741935484 Training Loss: 0.2615
Epoch 0.28225806451612906 Training Loss: 0.2431
Epoch 0.3024193548387097 Training Loss: 0.177
Epoch 0.3225806451612903 Training Loss: 0.321
Epoch 0.34274193548387094 Training Loss: 0.3091
Epoch 0.3629032258064516 Training Loss: 0.1823
Epoch 0.38306451612903225 Training Loss: 0.1688
Epoch 0.4032258064516129 Training Loss: 0.2602
Epoch 0.42338709677419356 Training Loss: 0.3036
Ep


Epoch 10.0 Validation Accuracy: 0.9841269841269841
Epoch 10.0 Validation Loss: 0.14592595398426056

Validation results:
eval_loss: 0.14592595398426056
eval_accuracy: 0.9841269841269841
eval_runtime: 3.3947
eval_samples_per_second: 129.908
eval_steps_per_second: 16.496
epoch: 10.0

Epoch 10.0 Validation Accuracy: 0.9791288566243194
Epoch 10.0 Validation Loss: 0.19395247101783752

Test results:
eval_loss: 0.19395247101783752
eval_accuracy: 0.9791288566243194
eval_runtime: 8.3705
eval_samples_per_second: 131.653
eval_steps_per_second: 16.486
epoch: 10.0

Test Accuracy: 0.9791288566243194
              precision    recall  f1-score   support

       Human       0.99      0.97      0.98       565
         Bot       0.97      0.99      0.98       537

    accuracy                           0.98      1102
   macro avg       0.98      0.98      0.98      1102
weighted avg       0.98      0.98      0.98      1102


Actual labels: ['Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Human', 'Human

In [7]:
import pandas as pd
from transformers import BertForSequenceClassification, Trainer
import torch
import pickle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the new dataset
new_dataset = pd.read_csv('/content/sentence_level_data.csv')

# Rename columns to 'text' and 'labels' if necessary
new_dataset.rename(columns={'sentence': 'text', 'class': 'labels'}, inplace=True)

# Balance the dataset if necessary
def balance_dataset(dataset):
    df_majority = dataset[dataset.labels == 0]
    df_minority = dataset[dataset.labels == 1]

    # Check if balancing is possible
    if len(df_minority) > len(df_majority):
        print("Minority class larger than majority class, oversampling minority class.")
        df_minority_upsampled = df_minority.sample(len(df_majority), replace=True, random_state=42)
        df_balanced = pd.concat([df_majority, df_minority_upsampled])
    else:
        df_majority_downsampled = df_majority.sample(len(df_minority), random_state=42)
        df_balanced = pd.concat([df_majority_downsampled, df_minority])

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    return df_balanced

df_balanced = balance_dataset(new_dataset)

# Load the tokenizer from pickle
tokenizer_path = '/content/bert_tokenizer.pkl'
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

max_length = 128

def tokenize_data(text_list, tokenizer, max_length=max_length):
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

new_encodings = tokenize_data(df_balanced['text'].tolist(), tokenizer)

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

new_dataset = TextDataset(new_encodings)

# Load the BERT model from pickle
model_path = '/content/bert_model.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Predict labels for the new dataset
trainer = Trainer(model=model)
predictions = trainer.predict(new_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Map labels if necessary
label_map = {0: "Human", 1: "Bot"}
predicted_labels_mapped = [label_map[label] for label in predicted_labels]

# Print predicted labels
print("Predicted labels:", predicted_labels_mapped)

# Compute metrics
true_labels = df_balanced['labels'].tolist()

accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")


Minority class larger than majority class, oversampling minority class.


Predicted labels: ['Human', 'Bot', 'Bot', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Bot', 'Bot', 'Human', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Human', 'Human', 'Human', 'Human', 'Bot', 'Human', 'Human', 'Human', 'Bot', 'Bot', 

In [10]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer
import torch
import pickle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# Load the new dataset
new_dataset = pd.read_csv('/content/sentence_level_data.csv')

# Rename columns to 'text' and 'labels'
new_dataset.rename(columns={'sentence': 'text', 'class': 'labels'}, inplace=True)

# Load the tokenizer from pickle
with open('/content/bert_tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Tokenize the text data
max_length = 128

def tokenize_data(text_list, tokenizer, max_length=max_length):
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

new_encodings = tokenize_data(new_dataset['text'].tolist(), tokenizer)

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.encodings['labels'] = labels

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

new_dataset = TextDataset(new_encodings, new_dataset['labels'].tolist())

# Load the BERT model
model_path = '/content/bert_model.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Predict labels for the new dataset
trainer = Trainer(model=model)
predictions = trainer.predict(new_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Map labels
label_map = {0: "Human", 1: "Bot"}
predicted_labels_mapped = [label_map[label] for label in predicted_labels]
actual_labels_mapped = [label_map[label] for label in new_dataset.encodings['labels']]

# Print actual vs predicted labels
print("Actual vs Predicted Labels:")
for actual, predicted in zip(actual_labels_mapped, predicted_labels_mapped):
    print(f"Actual: {actual}, Predicted: {predicted}")

# Calculate and print evaluation metrics
accuracy = accuracy_score(new_dataset.encodings['labels'], predicted_labels)
f1 = f1_score(new_dataset.encodings['labels'], predicted_labels, average='weighted')
precision = precision_score(new_dataset.encodings['labels'], predicted_labels, average='weighted')
recall = recall_score(new_dataset.encodings['labels'], predicted_labels, average='weighted')

print(f"\nAccuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Print classification report
print("\nClassification Report:")
print(classification_report(new_dataset.encodings['labels'], predicted_labels, target_names=['Human', 'Bot']))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Actual: Human, Predicted: Human
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Human
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Human
Actual: Human, Predicted: Bot
Actual: Human, Predicted: Human
Actual: Human, Predicte