In [1]:
# Import necessary libraries
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import nltk
import re
import unicodedata
import contractions
import spacy
from collections import Counter
from textstat import flesch_reading_ease, syllable_count
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [2]:
#get the dataset from Hello-SimpleAI/HC3
dataset = load_dataset("Hello-SimpleAI/HC3", "all")
df = pd.DataFrame(dataset["train"])

#take a look at the dataset
df.head()

Unnamed: 0,id,question,human_answers,chatgpt_answers,source
0,0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,reddit_eli5
1,1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,reddit_eli5
2,2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,reddit_eli5
3,3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,reddit_eli5
4,4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,reddit_eli5


In [3]:
#Combine the human answers and chatgpt answers into one column
df = df.melt(value_vars=["human_answers", "chatgpt_answers"], var_name="original_column", value_name="text")

#classify the human answers as 1 and the chatgpt answers as 0
df['classification'] = df['original_column'].apply(lambda x: 1 if x == 'human_answers' else 0)

#drop the original column
df = df.drop(columns=['original_column']).reset_index(drop=True)

# Use only a sample for debugging
df = df.sample(500)

#take a look at the dataset
df.head()

Unnamed: 0,text,classification
10802,[Photons are smaller than the air molecules . ...,1
18118,[Discs using the DVD-Video specification requi...,1
17763,[Over one million Jewish children were killed ...,1
37572,[Your voice may sound deeper in the morning be...,0
47333,[It can be helpful to hire a certified public ...,0


In [4]:
#get the summary of the dataset
df.shape

(500, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 10802 to 36854
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            500 non-null    object
 1   classification  500 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.7+ KB


In [6]:
# Convert lists to strings
df['text'] = df['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df['word_count'] = df['text'].apply(lambda x: len(str(x)) if isinstance(x, str) else 0)

print(df.head())

total_words = df['word_count'].sum()

print(f"Total number of words in the text column: {total_words}")

                                                    text  classification  \
10802  Photons are smaller than the air molecules . T...               1   
18118  Discs using the DVD-Video specification requir...               1   
17763  Over one million Jewish children were killed i...               1   
37572  Your voice may sound deeper in the morning bec...               0   
47333  It can be helpful to hire a certified public a...               0   

       word_count  
10802         689  
18118         168  
17763         139  
37572         856  
47333        1295  
Total number of words in the text column: 658163


**Text Cleaning**

In [7]:
# Remove extra whitespaces
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x.strip()) if isinstance(x, str) else x)

# Normalize Unicode characters
df['text'] = df['text'].apply(lambda x: unicodedata.normalize('NFKD', x) if isinstance(x, str) else x)

# Expand contractions (e.g., "don't" → "do not")
df['text'] = df['text'].apply(lambda x: contractions.fix(x) if isinstance(x, str) else x)

# Remove digits and special characters (keep punctuation)
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z.,!?; ]+', '', x) if isinstance(x, str) else x)

In [8]:
# Load NLP model
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Remove stopwords and apply lemmatization
stop_words = set(stopwords.words('english')) # set the language 

def preprocess_text(text):
    doc = nlp(text.lower())  # Convert to lowercase here
    tokens = [token.lemma_ for token in doc if token.is_alpha and token.text.lower() not in stop_words]
    return " ".join(tokens)
    
df["text"] = df["text"].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)

**Feature Extraction**

In [9]:
# Extract word count
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()) if isinstance(x, str) else 0)

# Extract sentence count
df["sentence_count"] = df["text"].apply(lambda x: len(nltk.sent_tokenize(x)) if isinstance(x, str) else 0)

# Compute average sentence length
df["avg_sentence_length"] = df["word_count"] / (df["sentence_count"] + 1)  # Avoid division by zero

# Compute punctuation count
df["punctuation_count"] = df["text"].apply(lambda x: len(re.findall(r'[.,!?;]', x)) if isinstance(x, str) else 0)

# Compute readability score
df["readability_score"] = df["text"].apply(lambda x: flesch_reading_ease(x) if isinstance(x, str) else 0)

# Compute unique word ratio
df["unique_word_ratio"] = df["text"].apply(lambda x: len(set(x.split())) / len(x.split()) if isinstance(x, str) and len(x.split()) > 0 else 0)

# Compute text complexity metrics
df["avg_word_length"] = df["text"].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if isinstance(x, str) and len(x.split()) > 0 else 0)
df["avg_syllables_per_word"] = df["text"].apply(lambda x: syllable_count(x) / len(x.split()) if isinstance(x, str) and len(x.split()) > 0 else 0)

In [10]:
# Extract n-grams (bigrams & trigrams)
def get_ngram_frequencies(text, n=2):
    words = text.split()
    ngrams = zip(*[words[i:] for i in range(n)])
    return Counter(ngrams).most_common(10)

df["bigrams"] = df["text"].apply(lambda x: get_ngram_frequencies(x, 2))
df["trigrams"] = df["text"].apply(lambda x: get_ngram_frequencies(x, 3))

In [11]:
# Compute POS (Parts of Speech) tagging
def get_pos_tags(text):
    doc = nlp(text)
    pos_counts = Counter(token.pos_ for token in doc)
    return pos_counts

df["pos_tags"] = df["text"].apply(lambda x: get_pos_tags(x) if isinstance(x, str) else {})

In [12]:
# Compute sentiment analysis
df["sentiment_polarity"] = df["text"].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
df["sentiment_subjectivity"] = df["text"].apply(lambda x: TextBlob(x).sentiment.subjectivity if isinstance(x, str) else 0)

In [13]:
# Compute TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = vectorizer.fit_transform(df["text"]).toarray()

In [14]:
print(df.head())
print(f"Total words in dataset: {df['word_count'].sum()}")

                                                    text  classification  \
10802  photon small air molecule thus fly fast sound ...               1   
18118  disc use dvdvideo specification require dvd dr...               1   
17763  one million jewish child kill holocaust approx...               1   
37572  voice may sound deeply morning use much night ...               0   
47333  helpful hire certify public accountant cpa han...               0   

       word_count  sentence_count  avg_sentence_length  punctuation_count  \
10802          56               1                 28.0                  0   
18118          17               1                  8.5                  0   
17763          15               1                  7.5                  0   
37572          69               1                 34.5                  0   
47333         105               1                 52.5                  0   

       readability_score  unique_word_ratio  avg_word_length  \
10802           