# CNN

In [None]:
# ============================================
# 1️⃣ Import Libraries
# ============================================
import pandas as pd
import pandas as pd
import glob
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

In [None]:
# ============================================
# 2️⃣ Load Dataset
# ============================================
# Location of the combined CSV. Kept in one named constant so the path can be
# changed without touching the loading logic.
DATA_PATH = "/content/combined_dataset.csv"  # Replace with your path

df = pd.read_csv(DATA_PATH)

# Fail fast if the columns the later cells read (`text`, `label`) are absent,
# instead of surfacing a KeyError deep inside cleaning or training.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )

print("✅ Dataset Loaded:", df.shape)
print(df.head())

✅ Dataset Loaded: (49268, 2)
                                                text  label
0  People can learn without making mistakes. Peop...      0
1  PHONES AND DRIVING\n\nIn this world in which w...      0
2  Okay, here's my essay:\n\nMaking Your Own Deci...      1
3   Dear : Principal\n\nI believe that allowing s...      1
4  Well for one if you seek more then one person ...      0


In [None]:
# ===  Basic Text Cleaning Function ===
def clean_text(text):
    """Normalise one document into lowercase, space-separated alphanumerics.

    The input is coerced with ``str()`` and lowercased, then three
    substitutions run in order, each replacing its match with a single space:
    URLs starting with ``http``, any character that is not a letter, digit or
    whitespace, and finally runs of whitespace. Leading and trailing spaces
    are stripped from the result.

    Args:
        text: Any object; non-string values are converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = str(text).lower()
    # Order matters: URLs must go before punctuation is stripped, and
    # whitespace is collapsed last so earlier replacements leave no gaps.
    for pattern in (r"http\S+", r"[^a-zA-Z0-9\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

# Run every raw document through clean_text and keep the result next to the
# original in a new `clean_text` column.
df['clean_text'] = df['text'].map(clean_text)

print("\n Text cleaning completed.")
print(df[['text', 'clean_text']].head(3))

# ===  Remove very short or empty texts ===
# Keep only rows whose cleaned text is longer than 10 characters, then
# renumber the rows from 0.
is_long_enough = df['clean_text'].str.len() > 10
df = df[is_long_enough].reset_index(drop=True)

print("\nAfter cleaning:")
print("Shape:", df.shape)


 Text cleaning completed.
                                                text  \
0  People can learn without making mistakes. Peop...   
1  PHONES AND DRIVING\n\nIn this world in which w...   
2  Okay, here's my essay:\n\nMaking Your Own Deci...   

                                          clean_text  
0  people can learn without making mistakes peopl...  
1  phones and driving in this world in which we l...  
2  okay here s my essay making your own decisions...  

After cleaning:
Shape: (49268, 3)


In [None]:


# ============================================
# CNN pipeline: split -> tokenize -> train -> evaluate -> save -> predict
# ============================================
# Requires `df` (with `clean_text` and `label` columns) and `clean_text()`
# from the earlier loading / cleaning cells.
import joblib
from tensorflow.keras.layers import Input

# ============================================
# 3️⃣ Prepare Text and Labels
# ============================================
X = df["clean_text"].astype(str)
y = df["label"]

# Hold out 20% as the final test set; it is used only for the evaluation
# in step 7.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Carve a validation set out of the training data. Early stopping picks the
# best weights on this split, so the test set never influences model
# selection and the reported test metrics stay unbiased.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# ============================================
# 4️⃣ Tokenization and Padding
# ============================================
max_words = 20000     # Vocabulary size
max_len = 300         # Max tokens per text

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Every sequence is padded / truncated at the end to exactly `max_len` tokens.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post", truncating="post")
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post", truncating="post")

print("✅ Padded Sequences Shape:", X_train_pad.shape)

# ============================================
# 5️⃣ Build CNN Model
# ============================================
embedding_dim = 100

model = Sequential([
    # An explicit Input layer replaces the deprecated `input_length` argument
    # of Embedding and lets `model.summary()` show concrete shapes.
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary output: Human vs AI
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ============================================
# 6️⃣ Train the Model
# ============================================
# Stop once validation loss has not improved for 2 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train_pad,
    y_train,
    validation_data=(X_val_pad, y_val),
    epochs=5,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# ============================================
# 7️⃣ Evaluate Model
# ============================================
y_pred_prob = model.predict(X_test_pad)
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column so the
# predictions have the same 1-D shape as `y_test`.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ============================================
# 8️⃣ Save Model and Tokenizer
# ============================================
model.save("cnn_ai_human_model.h5")
joblib.dump(tokenizer, "cnn_tokenizer.pkl")

print("\n✅ CNN Model and Tokenizer Saved Successfully!")

# ============================================
# 9️⃣ Test on Custom Text
# ============================================
custom_text = ["Cars have been around since the 1900s when Henry Ford created and built..."]

# Apply the same cleaning the training corpus went through; otherwise the
# tokenizer sees text in a form it was never fitted on.
custom_clean = [clean_text(sample) for sample in custom_text]

custom_seq = tokenizer.texts_to_sequences(custom_clean)
custom_pad = pad_sequences(custom_seq, maxlen=max_len, padding="post", truncating="post")

pred = model.predict(custom_pad)[0][0]
print("\n🔍 Prediction:", "🤖 AI" if pred > 0.5 else "👤 Human")


✅ Padded Sequences Shape: (39414, 300)




Epoch 1/5
[1m616/616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.8644 - loss: 0.2705 - val_accuracy: 0.9893 - val_loss: 0.0382
Epoch 2/5
[1m616/616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9947 - loss: 0.0191 - val_accuracy: 0.9890 - val_loss: 0.0422
Epoch 3/5
[1m616/616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9976 - loss: 0.0104 - val_accuracy: 0.9883 - val_loss: 0.0447
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step





📊 Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6163
           1       0.99      0.98      0.99      3691

    accuracy                           0.99      9854
   macro avg       0.99      0.99      0.99      9854
weighted avg       0.99      0.99      0.99      9854

✅ Accuracy: 0.9893444286584129

Confusion Matrix:
 [[6134   29]
 [  76 3615]]

✅ CNN Model and Tokenizer Saved Successfully!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346ms/step

🔍 Prediction: 👤 Human


# ANN

In [None]:
# ============================================
# 1️⃣ Import Libraries
# ============================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
import joblib


# ============================================
# 3️⃣ Split Data
# ============================================
# Requires `df` (with `clean_text` and `label` columns) from the earlier
# loading / cleaning cells.
X = df["clean_text"]
y = df["label"]

# Hold out 20% as the final test set; it is used only for the evaluation
# in step 7.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Carve a validation set out of the training texts. Early stopping picks the
# best weights on this split, so the test set never influences model
# selection. Splitting the texts (before vectorizing) also avoids copying the
# large dense TF-IDF matrix.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# ============================================
# 4️⃣ TF-IDF Vectorization
# ============================================
vectorizer = TfidfVectorizer(
    max_features=8000,   # You can tune this
    ngram_range=(1, 2),
    stop_words="english"
)

# The vocabulary and IDF weights are fitted on the training texts only; the
# validation and test texts are transformed with that fitted instance.
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_val_tfidf = vectorizer.transform(X_val).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

print("✅ TF-IDF shape:", X_train_tfidf.shape)

# ============================================
# 5️⃣ Build ANN Model
# ============================================
model = Sequential([
    # An explicit Input layer replaces the `input_shape` argument on the first
    # Dense layer, which newer Keras versions warn about.
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Binary output: AI vs Human
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ============================================
# 6️⃣ Train Model
# ============================================
# Create a fresh callback in this cell. Previously this line was commented
# out while `callbacks=[early_stop]` was still used below, so the cell only
# ran when an `early_stop` object was left over from the CNN cell.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train_tfidf,
    y_train,
    validation_data=(X_val_tfidf, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# ============================================
# 7️⃣ Evaluate Model
# ============================================
y_pred_prob = model.predict(X_test_tfidf)
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column so the
# predictions have the same 1-D shape as `y_test`.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ============================================
# 8️⃣ Save Model and Vectorizer
# ============================================
model.save("ann_ai_human_model.h5")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("\n✅ Model and Vectorizer Saved Successfully!")

# ============================================
# 9️⃣ Test on Custom Text (see the next cell)
# ============================================



✅ TF-IDF shape: (39414, 8000)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.9491 - loss: 0.1104 - val_accuracy: 0.9905 - val_loss: 0.0275
Epoch 2/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9968 - loss: 0.0098 - val_accuracy: 0.9901 - val_loss: 0.0309
Epoch 3/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9978 - loss: 0.0066 - val_accuracy: 0.9892 - val_loss: 0.0368
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step





📊 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6163
           1       0.99      0.98      0.99      3691

    accuracy                           0.99      9854
   macro avg       0.99      0.99      0.99      9854
weighted avg       0.99      0.99      0.99      9854

✅ Accuracy: 0.9904607266084838

Confusion Matrix:
 [[6130   33]
 [  61 3630]]

✅ Model and Vectorizer Saved Successfully!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step

🔍 Prediction: 👤 Human


In [None]:
# Classify one hand-written sample with the ANN trained in the previous cell.
# Requires `vectorizer` and `model` from that cell and `clean_text()` from the
# cleaning cell.
custom_text = ["The number of features (vocabulary size) generated by your TfidfVectorizer for new predictions must match the number of input features expected by the first layer of your Keras model. This is achieved by ensuring the TfidfVectorizer is fitted on the entire training corpus and then only transforming new data with that same fitted instance"]

# The vectorizer was fitted on `clean_text` output, so new text must go
# through the same cleaning before it is transformed.
custom_clean = [clean_text(sample) for sample in custom_text]
custom_features = vectorizer.transform(custom_clean).toarray()
pred = model.predict(custom_features)[0][0]

print("\n🔍 Prediction:", "🤖 AI" if pred > 0.5 else "👤 Human")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step

🔍 Prediction: 🤖 AI


# Transfer Learning

In [None]:
# ===============================
# 📘 TRANSFER LEARNING CODE (RoBERTa)
# ===============================
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd


# Requires `df` (with `clean_text` and `label` columns) and `clean_text()`
# from the earlier loading / cleaning cells.
X = df["clean_text"]
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create Hugging Face Datasets
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# `preserve_index=False` keeps the shuffled pandas index from being added to
# the datasets as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# ======================================
# 2️⃣ TOKENIZE DATA
# ======================================
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def preprocess_function(examples):
    """Tokenize a batch of examples, padding / truncating each text to 128 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

train_tokenized = train_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)

# ======================================
# 3️⃣ LOAD PRETRAINED MODEL (TRANSFER LEARNING)
# ======================================
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# ======================================
# 4️⃣ TRAINING CONFIGURATION
# ======================================
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none" # Disable reporting to wandb
)

# ======================================
# 5️⃣ TRAIN MODEL
# ======================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    # `processing_class` supersedes the deprecated `tokenizer` argument, which
    # emitted a FutureWarning when the Trainer was constructed.
    processing_class=tokenizer,
)

trainer.train()

# ======================================
# 6️⃣ SAVE MODEL
# ======================================
model.save_pretrained("./fine_tuned_roberta")
tokenizer.save_pretrained("./fine_tuned_roberta")

# ======================================
# 7️⃣ TEST PREDICTION
# ======================================
test_text = "Cars have been around since the 1900s when Henry Ford created them."

# The model was fine-tuned on `clean_text` output (lowercased, punctuation
# removed), so the sample gets the same cleaning before tokenization.
inputs = tokenizer(
    clean_text(test_text),
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=128,
)

# Put the model and the inputs on the same device (GPU when available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Switch to inference mode so dropout is disabled for the prediction.
model.eval()

with torch.no_grad():
    outputs = model(**inputs)

prediction = torch.argmax(outputs.logits, dim=1).item()

if prediction == 0:
    print(" Predicted: Human-written text")
else:
    print(" Predicted: AI-generated text")

Map:   0%|          | 0/39414 [00:00<?, ? examples/s]

Map:   0%|          | 0/9854 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
