In [1]:
import pandas as pd
import os
# Location of the dataset: AI_Human.csv inside the current user's Downloads
# folder. The home directory is resolved at runtime, so nothing needs to be
# edited per user as long as the file lives there.
file_path = os.path.join(os.path.expanduser('~'), 'Downloads', 'AI_Human.csv')


In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the full dataset from disk into memory.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows to sanity-check the columns that were parsed.
df.head(n=5)


Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

# Model input is the raw text; the target is the `generated` column
# (shown as 0.0 / 1.0 in the preview of the frame).
X, y = df['text'], df['generated']


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 5000 terms, with English stop words removed.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vectorizer is fitted on the training texts only; the held-out texts are
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [8]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default hyper-parameters on the
# TF-IDF training matrix (`fit` returns the estimator itself).
model = LogisticRegression().fit(X_train_vec, y_train)

# Leave the fitted estimator as the cell's last expression so its parameter
# summary is displayed.
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [9]:
from sklearn.metrics import classification_report, accuracy_score

# Predict labels for the held-out split, then compute both summaries before
# printing them.
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.9923856044824366

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     61112
         1.0       0.99      0.98      0.99     36335

    accuracy                           0.99     97447
   macro avg       0.99      0.99      0.99     97447
weighted avg       0.99      0.99      0.99     97447



In [16]:
import joblib

# Persist the trained classifier and the fitted vectorizer next to the
# notebook; both files are needed later to score new text.
for artifact, filename in (
    (model, 'ai_detector_model.pkl'),
    (vectorizer, 'text_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!


In [29]:
def predict_text_origin_with_score(text):
    """Classify one piece of text and report the model's AI probability.

    Uses the module-level ``vectorizer`` and ``model``.

    Args:
        text: The text to score.

    Returns:
        A string of the form ``"<label> (<pp.pp>% AI Confidence)"`` where the
        label is ``"AI-Generated"`` when the AI probability is above 0.5 and
        ``"Human-Written"`` otherwise. The percentage is always the AI
        probability, including when the label is ``"Human-Written"``.
    """
    features = vectorizer.transform([text])
    # Second column of the first (only) row is taken as the AI probability;
    # this assumes column 1 corresponds to the `generated == 1.0` class.
    ai_probability = model.predict_proba(features)[0][1]
    label = "AI-Generated" if ai_probability > 0.5 else "Human-Written"
    return f"{label} ({ai_probability * 100:.2f}% AI Confidence)"

# Smoke test: score a short first-person passage about studying computer
# applications and data internships, and print the formatted verdict.
sample_text = """I’ve always been interested in how technology and data can be used to tackle real-life challenges. This curiosity naturally led me to peruse a Bachelors of Computer Application where I built a solid foundation in computer science, programing and data manipulation.
In addition to my coursework, I completed a number of different internships in Unified Mentor ,Miso and many more, taking on roles as Data Science or Data Analysis Intern. These experiences allowed me tp get hands-on with Machine learning models ,run customer segmentation using K-means clustering , and design interactive dashboards with Power BI and Tableau . Working on these Projects gave me practical insights into data-driven problem solving and further fueled my interest in advance analytics. I hold numerous certification,  primarily from Google as a Data Analyst.
"""

print(predict_text_origin_with_score(sample_text))


AI-Generated (98.33% AI Confidence)


In [25]:
# Second smoke test: score an expository passage describing heart attacks
# and print the formatted verdict.
sample_text = """A heart attack occurs when the flow of blood to the heart is severely reduced or blocked. The blockage is usually due to a buildup of fat, cholesterol and other substances in the heart (coronary) arteries. The fatty, cholesterol-containing deposits are called plaques. The process of plaque buildup is called atherosclerosis.Sometimes, a plaque can rupture and form a clot that blocks blood flow. A lack of blood flow can damage or destroy part of the heart muscle.A heart attack occurs when an artery that sends blood and oxygen to the heart is blocked. Fatty, cholesterol-containing deposits build up over time, forming plaques in the heart's arteries. If a plaque ruptures, a blood clot can form. The clot can block arteries, causing a heart attack. During a heart attack, a lack of blood flow causes the tissue in the heart muscle to die.Prompt treatment is needed for a heart attack to prevent death. Call 911 or emergency medical help if you think you might be having a heart."""
print(predict_text_origin_with_score(sample_text))

Human-Written (46.95% AI Confidence)


In [23]:
import tkinter as tk
from tkinter import filedialog, messagebox
import joblib
import pandas as pd
import os

# Optional if using .docx
from docx import Document

# Restore the classifier and the vectorizer from their joblib files, in that
# order. Replace the file names if your artifacts are stored elsewhere.
# NOTE: joblib files are pickles; only load files you created or trust.
model, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ("ai_detector_model.pkl", "text_vectorizer.pkl")
)

# Function to extract text from .txt or .docx
def extract_text_from_file(file_path):
    """Return the plain text stored in a ``.txt`` or ``.docx`` file.

    Args:
        file_path: Path of the file to read. The file type is chosen from the
            extension, compared case-insensitively.

    Returns:
        The file's text as one string (``.docx`` paragraphs are joined with
        newlines), or ``None`` when the extension is not supported.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.txt':
        # "utf-8-sig" strips a leading byte-order mark (common in files saved
        # by Windows editors) so it does not end up in the text, and
        # errors='replace' keeps a file in another encoding readable instead
        # of raising UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
            return file.read()
    if ext == '.docx':
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    # Any other extension is reported to the caller as "unsupported".
    return None

# Function to process the file and show prediction
def analyze_file():
    """Ask the user for a file, score its text and show the result in a dialog.

    Button callback: takes no arguments and returns ``None``. Every outcome
    (result, unsupported type, unreadable file, empty file) is reported
    through a message box, so a failure is visible to the user instead of
    only appearing as a traceback on the console.
    """
    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt *.docx")])
    if not file_path:
        # Dialog was cancelled.
        return

    try:
        text = extract_text_from_file(file_path)
    except Exception as exc:
        # Broad on purpose: this is the UI boundary, and any read/parse
        # failure should surface as a dialog rather than be lost.
        messagebox.showerror("Error", f"Could not read file:\n{exc}")
        return

    if text is None:
        messagebox.showerror("Error", "Unsupported file type.")
        return

    if not text.strip():
        # Scoring an empty document would only report the model's baseline
        # probability, which says nothing about the file.
        messagebox.showerror("Error", "The selected file contains no text to analyze.")
        return

    # Transform and predict
    transformed = vectorizer.transform([text])
    prob = model.predict_proba(transformed)[0][1]  # Assuming 1 = AI-generated
    percentage = round(prob * 100, 2)

    result = f"Estimated AI-generated content: {percentage}%"
    messagebox.showinfo("Result", result)

# Main window: fixed 400x200 with a title.
root = tk.Tk()
root.title("AI Content Detector")
root.geometry("400x200")

# Instruction text, wrapped so it fits inside the window.
label = tk.Label(
    root,
    text="Upload a text or docx file to check for AI-generated content",
    wraplength=350,
    justify="center",
)
label.pack(pady=20)

# The upload button runs the file analysis when clicked.
button = tk.Button(
    root,
    text="Upload File",
    command=analyze_file,
    bg="blue",
    fg="white",
    font=("Arial", 12),
)
button.pack(pady=10)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()


In [22]:
!pip install python-docx textract joblib scikit-learn


Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
  Downloading textract-1.6.4.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


Requested textract from https://files.pythonhosted.org/packages/6b/3e/ac16b6bf28edf78296aea7d0cb416b49ed30282ac8c711662541015ee6f3/textract-1.6.5-py3-none-any.whl has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    extract-msg (<=0.29.*)
                 ~~~~~~~^
Please use pip<24.1 if you need to use this version.
  error: subprocess-exited-with-error
  
  Getting requirements to build wheel did not run successfully.
  exit code: 1
  
  [3 lines of output]
  error in textract setup command: 'install_requires' must be a string or iterable of strings containing valid project/version requirement specifiers; .* suffix can only be used with `==` or `!=` operators
      extract-msg<=0.29.*
                 ~~~~~~~^
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: subprocess-exited-with-error

Getting requirements to build wheel did not run successfully.
exit code: 1

See above for output.

note: T