In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
import re
import string
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
max_len = 512 # Max lenght of the text for input
batch_size = 4
epochs = 6
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
labels_ids = {'no': 0, 'yes': 1}
n_labels = len(labels_ids)

In [6]:
encoding = 'ISO-8859-1'

# Read Data

In [7]:
df1 = pd.read_csv("/content/drive/MyDrive/DataSet/csv dataset/enemyAI/genAI.csv", encoding=encoding)

In [8]:
df1.head()

Unnamed: 0,Content,Label
0,"""Data"" refers to raw facts, information, or st...",yes
1,Data redundancy refers to the duplication of d...,yes
2,Increased Storage Requirements: Storing the sa...,yes
3,To mitigate data redundancy and its associated...,yes
4,"In some cases, a certain degree of redundancy ...",yes


In [9]:
df2 = pd.read_csv('/content/drive/MyDrive/DataSet/csv dataset/enemyAI/human.csv', encoding=encoding)

In [10]:
df2['Label'] = df2['label'].fillna('No')
df2.head()

Unnamed: 0,Content,label,Label
0,"Natural language processing, in its simplest f...",no,no
1,"In a normal conversation between humans, thing...",no,no
2,"Up until now, we?ve been discussing artificial...",no,no
3,Feedforward neural networks constitute the bas...,no,no
4,Convolutional neural networks are well adapted...,no,no


In [11]:
df2 = df2.drop(columns=['label'], axis=1)

In [12]:
df1.shape, df2.shape

((2053, 2), (2254, 2))

In [13]:
df = pd.concat([df1, df2], ignore_index=False)

In [14]:
df.shape

(4307, 2)

In [15]:
# Shuffle the DataFrame
df = df.reset_index(drop=True)
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0,Content,Label
151,"Overall, CNNs have become an essential tool fo...",yes
3866,when un fill a space like name or email messag...,no
3164,If user try to LOGIN with incorrect email id a...,no
2755,"We not only focus on verify the vaccination, w...",no
3450,Usability requirements make the products meet ...,no
...,...,...
3444,Documenting customer data. All information a c...,no
466,Accurate Translation of Named Entities: Named ...,yes
3092,the current pandemic restrictions by using dif...,no
3772,In Traditional methodology testing is done onc...,no


# Helper Function

In [16]:
class DatasetCreator(Dataset):
    def __init__(self, processed_data, train):
        self.data = processed_data
        self.train = train

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        line = self.data.iloc[index]
        if self.train:
            return {'text': line['Content'], 'label': line['Label']}
        else:
            return {'text': line['Content'], 'label': 0}

# Class to tokenize and process the text for input to the dataloader
class GPT2_collator(object):
    def __init__(self, tokenizer, max_seq_len=512):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        return

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [int(sequence['label']) for sequence in sequences]
        inputs = self.tokenizer(text=texts,
                                return_tensors='pt',
                                padding=True,
                                truncation=True,
                                max_length=self.max_seq_len)
        inputs.update({'labels': torch.tensor(labels)})
        return inputs

In [17]:
# Function for training
def train(dataloader, optimizer, scheduler, device):
    global model
    model.train()
    predictions_labels = []
    true_labels = []
    total_loss = 0

    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()
    avg_epoch_loss = total_loss / len(dataloader)
    return predictions_labels, true_labels, avg_epoch_loss

In [18]:
# Function for validation
def validate(dataloader, device):
    global model
    model.eval()
    predictions_labels = []
    true_labels = []
    total_loss = 0

    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            loss, logits = outputs[:2]
            total_loss += loss.item()
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()
    avg_epoch_loss = total_loss / len(dataloader)
    return predictions_labels, true_labels, avg_epoch_loss

In [19]:
def predict(dataloader, device):
    global model
    model.eval()
    predictions_labels = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            _, logits = outputs[:2]
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()
    return predictions_labels

In [20]:
# Remove unwanted tags
def clean_text(text):
    text = text.lower()
    # Remove commas and forward slashes
    text = re.sub(r'[,/]', '', text)
    text = re.sub(r'[\']', '', text)
    # Remove HTML tags like '<br />'
    text = re.sub(r'<.*?>', '', text)
    # Remove other unwanted symbols or characters
    text = re.sub(r'[!@#$%^&*()]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text

# Load Model and Tokenizer

In [21]:
print('Loading gpt-2 model')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path='gpt2', num_labels=2)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path='gpt2', config=model_config)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

Loading gpt-2 model


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading tokenizer...


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading model...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [22]:
gpt2_collator = GPT2_collator(tokenizer=tokenizer, max_seq_len=max_len)

df['Content'] = df['Content'].apply(clean_text)
df['Label'] = df['Label'].map(labels_ids)

In [23]:
df_val = df[0:600]
df_val.shape

(600, 2)

In [24]:
train_data = DatasetCreator(df, train=True)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)


val_data = DatasetCreator(df_val, train=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)

In [25]:
optimizer = AdamW(model.parameters(), lr = 5e-5, eps = 1e-8, weight_decay=0.01)



# Training

In [26]:
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
loss = []
accuracy = []
val_loss_list = []
val_accuracy_list = []

for epoch in tqdm(range(epochs)):
    train_labels, true_labels, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(true_labels, train_labels)
    print('epoch: %.2f train accuracy %.2f' % (epoch, train_acc))
    loss.append(train_loss)
    accuracy.append(train_acc)

    val_labels, val_true_labels, val_loss = validate(val_dataloader, device)
    val_acc= accuracy_score(val_true_labels, val_labels)
    print('epoch: %.2f validation accuracy %.2f' % (epoch, val_acc))
    val_loss_list.append(val_loss)
    val_accuracy_list.append(val_acc)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1077 [00:00<?, ?it/s]

epoch: 0.00 train accuracy 0.89


  0%|          | 0/150 [00:00<?, ?it/s]

epoch: 0.00 validation accuracy 0.97


  0%|          | 0/1077 [00:00<?, ?it/s]

epoch: 1.00 train accuracy 0.96


  0%|          | 0/150 [00:00<?, ?it/s]

epoch: 1.00 validation accuracy 0.97


  0%|          | 0/1077 [00:00<?, ?it/s]

epoch: 2.00 train accuracy 0.98


  0%|          | 0/150 [00:00<?, ?it/s]

epoch: 2.00 validation accuracy 0.99


  0%|          | 0/1077 [00:00<?, ?it/s]

epoch: 3.00 train accuracy 0.99


  0%|          | 0/150 [00:00<?, ?it/s]

epoch: 3.00 validation accuracy 0.99


  0%|          | 0/1077 [00:00<?, ?it/s]

epoch: 4.00 train accuracy 0.99


  0%|          | 0/150 [00:00<?, ?it/s]

epoch: 4.00 validation accuracy 1.00


  0%|          | 0/1077 [00:00<?, ?it/s]

epoch: 5.00 train accuracy 1.00


  0%|          | 0/150 [00:00<?, ?it/s]

epoch: 5.00 validation accuracy 1.00


# Testing

In [29]:
from sklearn.metrics import confusion_matrix
import numpy as np

def predict_and_get_true_labels(dataloader, device):
    global model
    model.eval()
    predictions_labels = []
    true_labels = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Assuming the batch contains 'labels' for true labels
        true_labels += batch['labels'].flatten().tolist()

        batch = {k: v.type(torch.long).to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            _, logits = outputs[:2]
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    return predictions_labels, true_labels

# Assuming dataloader contains true labels under the key 'labels'
predictions, true_labels = predict_and_get_true_labels(val_dataloader, device)

# Create the confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions)

print("Confusion Matrix:")
print(conf_matrix)

  0%|          | 0/150 [00:00<?, ?it/s]

Confusion Matrix:
[[295   1]
 [  0 304]]


In [30]:
print("Classification Report:")
print(classification_report(true_labels, predictions))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       296
           1       1.00      1.00      1.00       304

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



In [31]:
model.push_to_hub("ErnestBeckham/gpt-2-finetuned-ai-content")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ErnestBeckham/gpt-2-finetuned-ai-content/commit/5366f44f7e415e9eb518461002a1adc5e7285606', commit_message='Upload GPT2ForSequenceClassification', commit_description='', oid='5366f44f7e415e9eb518461002a1adc5e7285606', pr_url=None, pr_revision=None, pr_num=None)

In [33]:
tokenizer.push_to_hub("ErnestBeckham/gpt2-tokenizer-ai-content")

CommitInfo(commit_url='https://huggingface.co/ErnestBeckham/gpt2-tokenizer-ai-content/commit/d13575b2e4c9f202c2a4129d5b1980a0f28bfafc', commit_message='Upload tokenizer', commit_description='', oid='d13575b2e4c9f202c2a4129d5b1980a0f28bfafc', pr_url=None, pr_revision=None, pr_num=None)

In [40]:
sentence = """Abraham Lincoln, the 16th President of the United States, played a pivotal role in American history,
particularly during one of its most tumultuous periods, the Civil War.
Here is a concise summary of Abraham Lincoln's life: Birth and Childhood:
Abraham Lincoln was born on February 12, 1809, in a log cabin in Hardin County (now LaRue County), Kentucky, USA.
His family later moved to Indiana and settled in Illinois. Limited Formal Education: Lincoln had only about one year of formal education,
but his insatiable curiosity and love for reading led him to self-educate through books and newspapers.
"""

In [41]:
tokenized_input = tokenizer(sentence, return_tensors='pt')
tokenized_input = {key: value.to(model.device) for key, value in tokenized_input.items()}

In [42]:
with torch.no_grad():
    outputs = model(**tokenized_input)

# Extract the predicted logits
logits = outputs.logits

# Use argmax along the appropriate dimension to get the predicted class
predicted_class = torch.argmax(logits, dim=-1).item()

# Print the predicted class
print("Predicted Class:", predicted_class)

Predicted Class: 1
