In [1]:
# Remove conflicting installs
!pip uninstall -y torch torchvision torchaudio

# Install exact versions known to work with transformers + T4 GPU
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118

# Upgrade transformers again
!pip install --upgrade transformers


Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.0.1
  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp311-cp311-linux_x86_64.whl (2267.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m419.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.15.2
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.15.2%2Bcu118-cp311-cp311-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m


In [2]:
!pip install -q --upgrade torch transformers pandas scikit-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m123.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install -q transformers datasets scikit-learn pandas

In [4]:
import zipfile
import os

zip_path = "/content/AI_Human.csv.zip"  # ✅ Corrected file name
extract_path = "/content/ai_human_text"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Files extracted to:", extract_path)



Files extracted to: /content/ai_human_text


In [5]:
import os

folder_path = "/content/ai_human_text"
files = os.listdir(folder_path)

print("Files in extracted folder:")
for f in files:
    print(f)


Files in extracted folder:
AI_Human.csv


In [6]:
import pandas as pd

# Load the single CSV
df = pd.read_csv("/content/ai_human_text/AI_Human.csv")

# Show sample rows
print("✅ Loaded dataset with shape:", df.shape)
print("Columns:", df.columns)
df.head()

✅ Loaded dataset with shape: (487235, 2)
Columns: Index(['text', 'generated'], dtype='object')


Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [7]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()  # Evaluation mode (no training)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Function to get DistilBERT embedding for a single text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the [CLS] token representation (first token) from last hidden state
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu()
    return cls_embedding.numpy()

# Generate embeddings for all texts in your dataset
embeddings = []

print("⏳ Generating embeddings...")
for text in tqdm(df['text'].tolist()):
    emb = get_embedding(str(text))
    embeddings.append(emb)

print("✅ All embeddings generated.")



  warn(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

⏳ Generating embeddings...


100%|██████████| 487235/487235 [2:26:45<00:00, 55.33it/s]


✅ All embeddings generated.


In [9]:
print(df.columns)
df.head()


Index(['text', 'generated'], dtype='object')


Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [10]:
import numpy as np
from sklearn.model_selection import train_test_split

# Convert embeddings list to array
X = np.array(embeddings)
y = df['generated'].astype(int).values  # Corrected label column

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Data ready for training")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


✅ Data ready for training
X_train shape: (389788, 768)
X_test shape: (97447, 768)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train a better classifier
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight='balanced'  # Automatically adjust for label imbalance
)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("📊 Updated Classification Results (Random Forest):")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nDetailed report:\n", classification_report(y_test, y_pred, target_names=["Human", "AI"]))


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train the classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("📊 Classification Results:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nDetailed report:\n", classification_report(y_test, y_pred, target_names=["Human", "AI"]))


📊 Classification Results:
Accuracy:  0.9898
Precision: 0.9885
Recall:    0.9842
F1 Score:  0.9863

Detailed report:
               precision    recall  f1-score   support

       Human       0.99      0.99      0.99     61159
          AI       0.99      0.98      0.99     36288

    accuracy                           0.99     97447
   macro avg       0.99      0.99      0.99     97447
weighted avg       0.99      0.99      0.99     97447



In [None]:
from IPython.display import display
import ipywidgets as widgets

# Input text area
input_box = widgets.Textarea(
    value='',
    placeholder='Paste your text here...',
    description='Enter Text:',
    layout=widgets.Layout(width='100%', height='150px'),
    style={'description_width': 'initial'}
)

# Output display
output_area = widgets.Output()

# Button to trigger prediction
button = widgets.Button(description="Detect Author", button_style='success')

# Prediction function
def on_button_click(b):
    output_area.clear_output()
    with output_area:
        text = input_box.value.strip()
        if not text:
            print("⚠️ Please enter some text.")
            return

        # Tokenize and embed
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy().reshape(1, -1)

        # Predict
        pred = clf.predict(cls_emb)[0]
        prob = clf.predict_proba(cls_emb)[0][pred]

        # Smart confidence-aware output
        if pred == 1 and prob > 0.6:
            label = "🤖 AI-Generated"
        elif pred == 0 and prob > 0.6:
            label = "✍️ Human-Written"
        else:
            label = "⚠️ Not confident enough to decide clearly"

        print(f"Prediction: {label} (Confidence: {prob:.2f})")

# Connect the button to the prediction logic
button.on_click(on_button_click)

# Display UI
display(input_box, button, output_area)


In [12]:
def predict_text(text):
    # Convert to embedding
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    cls_emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy().reshape(1, -1)

    # Predict
    pred = clf.predict(cls_emb)[0]
    prob = clf.predict_proba(cls_emb)[0][pred]

    label = "🤖 AI-Generated" if pred == 1 else "✍️ Human-Written"
    print(f"\nPrediction: {label} (Confidence: {prob:.2f})")

# 🔍 Example usage
predict_text("The revolution of artificial intelligence is transforming industries worldwide.")



Prediction: 🤖 AI-Generated (Confidence: 1.00)


In [13]:
from IPython.display import display
import ipywidgets as widgets

# Input text box
input_box = widgets.Textarea(
    value='',
    placeholder='Paste your text here...',
    description='Text:',
    layout=widgets.Layout(width='100%', height='150px'),
    style={'description_width': 'initial'}
)

# Output display
output_area = widgets.Output()

# Button to trigger prediction
button = widgets.Button(description="Detect Author", button_style='success')

# Function to run prediction
def on_button_click(b):
    output_area.clear_output()
    with output_area:
        text = input_box.value.strip()
        if not text:
            print("⚠️ Please enter some text.")
            return
        # Generate embedding
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy().reshape(1, -1)

        pred = clf.predict(cls_emb)[0]
        prob = clf.predict_proba(cls_emb)[0][pred]

        label = "🤖 AI-Generated" if pred == 1 else "✍️ Human-Written"
        print(f"Prediction: {label} (Confidence: {prob:.2f})")

# Connect button to function
button.on_click(on_button_click)

# Display UI
display(input_box, button, output_area)


Textarea(value='', description='Text:', layout=Layout(height='150px', width='100%'), placeholder='Paste your t…

Button(button_style='success', description='Detect Author', style=ButtonStyle())

Output()

In [None]:
# app.py

import torch
import numpy as np
import gradio as gr
from transformers import DistilBertTokenizer, DistilBertModel
import joblib  # for loading the trained classifier

# Load tokenizer and model
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load trained RandomForestClassifier
clf = joblib.load("plagiarism_model.pkl")  # We'll load this later after saving


def detect_plagiarism(text):
    if not text.strip():
        return "❌ No text entered", "0%"

    # Embed input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy().reshape(1, -1)

    # Predict using trained classifier
    pred = clf.predict(emb)[0]
    prob = clf.predict_proba(emb)[0][pred]

    label = "🤖 AI-Generated (Plagiarized)" if pred == 1 else "✍️ Human-Written"
    plagiarism_percent = f"{int(prob * 100)}%" if pred == 1 else "0%"

    return label, plagiarism_percent

# Gradio UI
with gr.Blocks(css=".gradio-container {background-image: url('https://i.imgur.com/gzG9CHz.jpg'); background-size: cover; background-position: center;}\ntextarea, button, label, h1, p {color: white !important;}") as demo:
    gr.Markdown(
        """
        <h1>📚 Vintage AI Plagiarism Detector</h1>
        <p>Paste your essay, paragraph, or story to check if it is AI-generated or human-written, with an estimated plagiarism percentage.</p>
        """
    )

    input_text = gr.Textbox(lines=10, placeholder="Paste your text here...", label="📝 Your Text")
    output_label = gr.Text(label="📌 Result")
    output_percent = gr.Text(label="📊 Plagiarism Percentage")

    btn = gr.Button("🔍 Detect")
    btn.click(fn=detect_plagiarism, inputs=input_text, outputs=[output_label, output_percent])

demo.launch()


In [None]:
import joblib
joblib.dump(clf, "plagiarism_model.pkl")


In [None]:
pip install -r requirements.txt
python app.py


In [None]:
clf = joblib.load("/content/plagiarism_model.pkl")
