In [1]:
# Basic Libraries
import pandas as pd
import numpy as np
import re

# Visualization
import plotly.express as px
import plotly.graph_objects as go

# ML
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    roc_curve,
    auc
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Handle imbalance
from sklearn.utils.class_weight import compute_class_weight

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("ai_human_detection_v1.csv")

df.head()

Unnamed: 0,id,text,human_or_ai,source_model,prompt,domain,language,edit_level,word_count,generation_date,version
0,e37e31c0-35d0-486d-9760-efadd4e0e289,Aprender programación en Python es fácil . Pri...,human,Human,,Technical Blog,es,none,34,2026-01-29T08:38:18.272755,v1.0
1,68e294cf-a499-4fbf-8616-2d6324570be9,Error: 400 Client Error: Bad Request for url: ...,ai,gemma2-9b-it,Discute el papel de la tecnología en el aprend...,Education,es,none,9,2026-01-29T08:58:20.717225,v1.0
2,b5e06ef0-9758-471f-81dc-182c6117ee81,Error: 404 Client Error: Not Found for url: ht...,ai,gemma2-9b-itllama-3.3-70b-versatile,Create a cold outreach email for business part...,Email,en,none,9,2026-01-29T09:13:20.114765,v1.0
3,6177fc40-1567-445b-af15-f3250ddd3403,**Global Economic Trends: A Shifting Landscape...,ai,llama-3.1-8b-instant,Write about economic trends affecting global m...,News,en,none,558,2026-01-29T08:39:55.917663,v1.0
4,192a9615-ed51-4750-b649-d18f8882f555,"""Recuerda que tú eres fuerte, capaz y única. N...",ai,llama-3.1-8b-instant,Escribe una publicación inspiradora para redes...,Social Media,es,none,147,2026-01-29T08:42:52.451370,v1.0


In [3]:
df.info()
df.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               686 non-null    object
 1   text             686 non-null    object
 2   human_or_ai      686 non-null    object
 3   source_model     686 non-null    object
 4   prompt           505 non-null    object
 5   domain           686 non-null    object
 6   language         686 non-null    object
 7   edit_level       686 non-null    object
 8   word_count       686 non-null    int64 
 9   generation_date  686 non-null    object
 10  version          686 non-null    object
dtypes: int64(1), object(10)
memory usage: 59.1+ KB


Unnamed: 0,id,text,human_or_ai,source_model,prompt,domain,language,edit_level,word_count,generation_date,version
count,686,686,686,686,505,686,686,686,686.0,686,686
unique,686,686,3,4,163,6,7,3,,686,1
top,e37e31c0-35d0-486d-9760-efadd4e0e289,Aprender programación en Python es fácil . Pri...,ai,llama-3.1-8b-instant,Write product description for innovative tech ...,Social Media,en,none,,2026-01-29T08:38:18.272755,v1.0
freq,1,1,335,499,15,121,246,516,,1,686
mean,,,,,,,,,262.300292,,
std,,,,,,,,,203.517607,,
min,,,,,,,,,9.0,,
25%,,,,,,,,,57.0,,
50%,,,,,,,,,245.0,,
75%,,,,,,,,,393.5,,


In [4]:
df.isnull().sum()

id                   0
text                 0
human_or_ai          0
source_model         0
prompt             181
domain               0
language             0
edit_level           0
word_count           0
generation_date      0
version              0
dtype: int64

In [7]:
fig = px.histogram(
    df,
    x="human_or_ai",
    color="human_or_ai",
    title="Class Distribution",
    text_auto=True
)

fig.update_layout(bargap=0.2)
fig.show()

In [9]:
df["text_length"] = df["text"].apply(len)

fig = px.box(
    df,
    x="human_or_ai",   # ✅ corrected
    y="text_length",
    color="human_or_ai",   # ✅ corrected
    title="Text Length Distribution per Class"
)

fig.show()

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_text"] = df["text"].apply(clean_text)

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df["encoded_label"] = le.fit_transform(df["human_or_ai"])

df[["human_or_ai", "encoded_label"]].drop_duplicates()

Unnamed: 0,human_or_ai,encoded_label
0,human,1
1,ai,0
10,post_edited_ai,2


In [13]:
X = df["clean_text"]
y = df["encoded_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [14]:
classes = np.unique(y_train)
weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

class_weights

{np.int64(0): np.float64(0.681592039800995),
 np.int64(1): np.float64(1.2685185185185186),
 np.int64(2): np.float64(1.3431372549019607)}

In [15]:
log_model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=1000, class_weight=class_weights))
])

log_model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{np.int64(0): np.float64(0.681592039800995), np.int64(1): np.float64(1.2685185185185186), np.int64(2): np.float64(1.3431372549019607)}"
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [16]:
rf_model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=10000)),
    ("clf", RandomForestClassifier(
        n_estimators=200,
        class_weight=class_weights,
        random_state=42
    ))
])

rf_model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
log_cv = cross_val_score(log_model, X_train, y_train, cv=5, scoring="f1_macro")
rf_cv = cross_val_score(rf_model, X_train, y_train, cv=5, scoring="f1_macro")

print("Logistic Regression CV F1:", log_cv.mean())
print("Random Forest CV F1:", rf_cv.mean())

Logistic Regression CV F1: 0.6535107178803641
Random Forest CV F1: 0.641105717559051


In [18]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    
    print(f"\n{model_name} Classification Report")
    print(classification_report(y_test, y_pred))
    
    f1 = f1_score(y_test, y_pred, average="macro")
    print("F1 Score (Macro):", f1)
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    
    fig = px.imshow(
        cm,
        text_auto=True,
        title=f"{model_name} Confusion Matrix"
    )
    fig.show()
    
    # ROC AUC (One-vs-Rest)
    roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
    print("ROC-AUC Score:", roc_auc)
    
    return f1, roc_auc

In [19]:
log_f1, log_auc = evaluate_model(log_model, X_test, y_test, "Logistic Regression")
rf_f1, rf_auc = evaluate_model(rf_model, X_test, y_test, "Random Forest")


Logistic Regression Classification Report
              precision    recall  f1-score   support

           0       0.70      0.78      0.74        67
           1       0.94      0.81      0.87        37
           2       0.56      0.53      0.55        34

    accuracy                           0.72       138
   macro avg       0.73      0.71      0.72       138
weighted avg       0.73      0.72      0.73       138

F1 Score (Macro): 0.7175361384427065


ROC-AUC Score: 0.8358534595483524

Random Forest Classification Report
              precision    recall  f1-score   support

           0       0.68      0.94      0.79        67
           1       1.00      0.84      0.91        37
           2       0.71      0.29      0.42        34

    accuracy                           0.75       138
   macro avg       0.80      0.69      0.71       138
weighted avg       0.77      0.75      0.73       138

F1 Score (Macro): 0.7053104575163398


ROC-AUC Score: 0.8529354530556402


In [20]:
from sklearn.preprocessing import label_binarize

y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
n_classes = y_test_bin.shape[1]

model = log_model
y_score = model.predict_proba(X_test)

fig = go.Figure()

for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    
    fig.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
        mode="lines",
        name=f"Class {i} (AUC = {roc_auc:.2f})"
    ))

fig.update_layout(
    title="ROC Curve (One-vs-Rest)",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate"
)

fig.show()

In [21]:
param_grid = {
    "clf__C": [0.1, 1, 5],
    "clf__penalty": ["l2"]
}

grid = GridSearchCV(
    log_model,
    param_grid,
    cv=3,
    scoring="f1_macro",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)

Best Parameters: {'clf__C': 5, 'clf__penalty': 'l2'}


In [22]:
comparison_df = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "F1 Score": [log_f1, rf_f1],
    "ROC-AUC": [log_auc, rf_auc]
})

fig = px.bar(
    comparison_df,
    x="Model",
    y=["F1 Score", "ROC-AUC"],
    barmode="group",
    title="Model Performance Comparison"
)

fig.show()

In [23]:
import joblib

In [24]:
joblib.dump(log_model, "logistic_model.pkl")

['logistic_model.pkl']

In [25]:
joblib.dump(rf_model, "random_forest_model.pkl")

['random_forest_model.pkl']

In [26]:
joblib.dump(grid.best_estimator_, "optimized_logistic_model.pkl")

['optimized_logistic_model.pkl']

In [27]:
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']