In [2]:
#Updates to keyboard shortcuts … On Thursday, August 1, 2024, Drive keyboard shortcuts will be updated to give you first-letters navigation.Learn more
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import csv
import plotly.graph_objects as go

In [3]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 256
batch_size = 16
num_epochs = 3
learning_rate = 2e-5

In [4]:
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    texts = df['text'].tolist()
    labels = df['generated'].tolist()  # Assuming 'label' column contains the labels directly
    return texts, labels

In [5]:
data_file = "C:/Users/HP VICTUS/Desktop/Book.csv"
texts, labels = load_imdb_data(data_file)

In [6]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):  # Correct indentation here
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [7]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        # Include attention dropout in the configuration
        config = BertConfig.from_pretrained(bert_model_name)
        self.bert = BertModel.from_pretrained(bert_model_name, config=config)
        # self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        # x = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

In [8]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()  # Set the model to training mode
    total_loss = 0
    correct_predictions = 0
    total_examples = 0

    for batch_idx, batch in enumerate(data_loader):
        optimizer.zero_grad()  # Clear gradients before each optimization step
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Get model outputs, which are the logits in this case
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Calculate loss using the logits and actual labels
        loss = nn.CrossEntropyLoss()(logits, labels)



        # l2_reg_loss = sum(torch.norm(param) ** 2 for param in model.parameters())
        # loss += 0.5 * 0.05 * l2_reg_loss
        total_loss += loss.item()

        # Calculate accuracy
        _, predicted_labels = torch.max(logits, dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_examples += labels.size(0)
        loss.backward()  # Backpropagate the error
        optimizer.step()  # Update parameters
        scheduler.step()  # Update learning rate

        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}/{len(data_loader)}: Loss {loss.item()}")

    average_loss = total_loss / len(data_loader)
    train_accuracy = correct_predictions / total_examples

    return average_loss, train_accuracy

In [9]:
def evaluate(model, data_loader, device):
    model.eval()
    total_loss = 0
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            # l2_reg_loss = sum(torch.norm(param) ** 2 for param in model.parameters())
            # loss += 0.5 * 0.05 * l2_reg_loss
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
            
    accuracy = accuracy_score(actual_labels, predictions)
    average_loss = total_loss / len(data_loader)
    return accuracy, average_loss,classification_report(actual_labels, predictions)

In [10]:
def predict_text_source(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "AI-generated" if preds.item() == 1 else "Human-written"

In [11]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)

In [12]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on : ",device)
model = BERTClassifier(bert_model_name, num_classes).to(device)
#model.load_state_dict(torch.load("bert_classifier.pth"))

Running on :  cpu


In [14]:
optimizer = AdamW(model.parameters(), lr=learning_rate,weight_decay=0.05)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [15]:
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

In [16]:
best_val_metric = float('-inf')  # Initialize best validation metric (can be accuracy or loss)
patience = 3  # Number of epochs to wait for improvement

In [17]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss,train_accuracy = train(model, train_dataloader, optimizer, scheduler, device)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    val_accuracy, val_loss,report = evaluate(model, val_dataloader, device)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    
    print(report)
    # if val_accuracy > best_val_metric:  # Replace 'val_accuracy' with loss if needed
    #     best_val_metric = val_accuracy
    #     patience_counter = 0  # Reset patience counter
    # else:
    #     patience_counter += 1

    # if patience_counter >= patience:
    #     print(f"Early stopping triggered after {epoch + 1} epochs")
    #     break

Epoch 1/3
Batch 0/23: Loss 0.68047696352005
Validation Accuracy: 0.9430
Training Accuracy: 0.9266
Training Loss: 0.3440
Validation Loss: 0.2165
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        81
           1       1.00      0.88      0.94        77

    accuracy                           0.94       158
   macro avg       0.95      0.94      0.94       158
weighted avg       0.95      0.94      0.94       158

Epoch 2/3
Batch 0/23: Loss 0.12752029299736023
Validation Accuracy: 0.9430
Training Accuracy: 0.9592
Training Loss: 0.1532
Validation Loss: 0.1486
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        81
           1       1.00      0.88      0.94        77

    accuracy                           0.94       158
   macro avg       0.95      0.94      0.94       158
weighted avg       0.95      0.94      0.94       158

Epoch 3/3
Batch 0/23: Loss 0.05893878638744354
Validati

In [40]:
import torch
import os
import json

# Ensure the model directory exists
model_directory = "model"
os.makedirs(model_directory, exist_ok=True)  # Create the directory if it doesn't exist

# Save the model weights to 'pytorch_model.bin'
torch.save(model.state_dict(), os.path.join(model_directory, "pytorch_model.bin"))

# Optionally save the configuration as a JSON file
config = {
    "num_labels": 2,  # Example: number of output labels
    "model_name": "bert-base-uncased",  # Base model used
}

# Save configuration to 'config.json'
with open(os.path.join(model_directory, "config.json"), "w") as f:
    json.dump(config, f)

print("Model and configuration have been saved successfully.")


Model and configuration have been saved successfully.


In [41]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        # Include attention dropout in the configuration
        config = BertConfig.from_pretrained(bert_model_name)
        self.bert = BertModel.from_pretrained(bert_model_name, config=config)
        # self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        # x = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits
with open("model/config.json", "r") as f:
    config = json.load(f)

# Reconstruct the model
model = BERTClassifier(config["model_name"], config["num_labels"])

# Load the saved weights
model.load_state_dict(torch.load("model/bert_classifier.pth", map_location="cpu"))

# Set the model to evaluation mode
model.eval()


  model.load_state_dict(torch.load("model/bert_classifier.pth", map_location="cpu"))


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [32]:
essay="dogs ar very friendly animals ,they are used to gaurd the house and can safegaurd owners belonging"

In [33]:
result=predict_text_source(essay, model, tokenizer, device)
print(result)

Human-written


In [None]:
epochs = range(1, num_epochs + 1)
# Create traces for training and validation loss
trace1 = go.Scatter(
    x=list(epochs),
    y=train_losses,
    mode='lines+markers',
    name='Training Loss',
    marker=dict(color='blue')
)

In [None]:
trace2 = go.Scatter(
    x=list(epochs),
    y=val_losses,
    mode='lines+markers',
    name='Validation Loss',
    marker=dict(color='red')
)

In [None]:
# Create traces for training and validation accuracy
trace3 = go.Scatter(
    x=list(epochs),
    y=train_accuracies,
    mode='lines+markers',
    name='Training Accuracy',
    marker=dict(color='purple')
)

In [None]:
trace4 = go.Scatter(
    x=list(epochs),
    y=val_accuracies,
    mode='lines+markers',
    name='Validation Accuracy',
    marker=dict(color='green')
)

In [None]:
# Create the figure and add traces for loss
fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)

In [None]:
# Set layout for loss plot
fig.update_layout(
    title='Training and Validation Loss',
    xaxis_title='Epoch',
    yaxis_title='Loss',
    legend_title='Legend',
    width=600,  # adjust size as needed
    height=400
)

In [None]:
# Show the figure
fig.show()

In [None]:
# Create a new figure for accuracy
fig2_accuracy = go.Figure()
fig2_accuracy.add_trace(trace3)
fig2_accuracy.add_trace(trace4)

In [None]:
# Set layout for accuracy plot
fig2_accuracy.update_layout(
    title='Training and Validation Accuracy',
    xaxis_title='Epoch',
    yaxis_title='Accuracy',
    legend_title='Legend',
    width=600,  # adjust size as needed
    height=400
)

In [None]:
# Show the figure
fig2_accuracy.show()