In [1]:
!pip install datasets -q
from datasets import load_dataset
import pandas as pd

print("Import Success, Downloading Data")
dataset = load_dataset("shahxeebhassan/human_vs_ai_sentences")

df = pd.DataFrame(dataset['train']) # Puts data into panda frames

display(df.head())

Import Success, Downloading Data


README.md: 0.00B [00:00, ?B/s]



complete_dataset.csv:   0%|          | 0.00/12.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/105000 [00:00<?, ? examples/s]

Unnamed: 0,text,label
0,Another reason why all students should have to...,0
1,Also the Electoral College consists of 538 ele...,0
2,Many countries have made changes in there town...,0
3,I believe the process of choosing a president ...,0
4,A thick cloud of carbon dioxide and heats to h...,0


In [2]:
!pip install transformers -q
from transformers import AutoTokenizer

print("Import success, downloading tokenizer")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

test_sentence = "Learning to train AI is incredibly fun."

translated_math = tokenizer(test_sentence)

print("Original English: ", test_sentence)
print("Computer Math (Token IDs): ", translated_math["input_ids"])

Import success, downloading tokenizer


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Original English:  Learning to train AI is incredibly fun.
Computer Math (Token IDs):  [101, 4083, 2000, 3345, 9932, 2003, 11757, 4569, 1012, 102]


In [3]:
# Used to define translation rules
def tokenize_function(examples):
    # max_length=128 means every sentence will be exactly 128 tokens long
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print("Translating")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. Cleans up the dataset for the AI
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

tokenized_datasets.set_format("torch")#Format using PyTorch sensors

print("Dataset successfully translated")

Translating


Map:   0%|          | 0/105000 [00:00<?, ? examples/s]

Dataset successfully translated


In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# 1. Open the "train" split first, shuffle and select
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100000))
eval_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(90000,100000))

print("Loading the DistilBERT model")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=2 # 0 for Human, 1 for AI
)

# Rules for Training
training_args = TrainingArguments(
    output_dir="./ai_detector_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3, 
    weight_decay=0.01,
    report_to="none" 
)

# Creates the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Initiating the training loop...")

# Trainer starts training
trainer.train()

Loading the DistilBERT model


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Initiating the training loop...




Step,Training Loss
500,0.542412
1000,0.377539
1500,0.346446
2000,0.232042
2500,0.214777
3000,0.215187
3500,0.160118
4000,0.138265
4500,0.133891


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=4689, training_loss=0.2571945556523475, metrics={'train_runtime': 1787.0256, 'train_samples_per_second': 167.877, 'train_steps_per_second': 2.624, 'total_flos': 9935054899200000.0, 'train_loss': 0.2571945556523475, 'epoch': 3.0})

In [7]:
import numpy as np
from sklearn.metrics import accuracy_score

# How did we do 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # The AI outputs probabilities
    final_guesses = np.argmax(predictions, axis=1) # np.argmax grabs the highest probability as its final "guess"
    return {"accuracy": accuracy_score(labels, final_guesses)} # Compares guesses to the actual answers

trainer.compute_metrics = compute_metrics

print("Running evaluation")
results = trainer.evaluate()

# 4. Prints the final %
print(f"Accuracy Score: {results['eval_accuracy'] * 100:.2f}%")

Running evaluation




Accuracy Score: 98.01%


In [8]:
!pip install huggingface_hub -q
from huggingface_hub import notebook_login
notebook_login() #To auto connect this to hugging face

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [9]:
repo_name = "toothsocket/ai-detector-50k"

print("Uploading the model...")
model.push_to_hub(repo_name)# 1. Push the trained model
tokenizer.push_to_hub(repo_name) # 2. Push the tokenizer

print("Upload complete")

Uploading the model...


README.md: 0.00B [00:00, ?B/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


Upload complete
