In [None]:
# Import necessary libraries
import pandas as pd

# Load the balanced dataset
data_balanced = pd.read_csv('balanced_dataset.csv')
data_balanced.head()


Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,word_count,char_count,sentence_count
0,The Electoral College makes it so that candida...,0,Does the electoral college work?,persuade_corpus,True,417,2466,20
1,The positive I think about driverless cars it ...,0,Driverless cars,persuade_corpus,True,190,1001,9
2,A face on mars sounds kind of creepy right? Ho...,0,The Face on Mars,persuade_corpus,True,166,880,9
3,"In this digital age, libraries have been a top...",1,Distance learning,falcon_180b_v1,False,317,2085,19
4,I think that Lukes point of viewIf you were go...,0,"""A Cowboy Who Rode the Waves""",persuade_corpus,True,161,771,9


In [2]:
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Tokenize and count common words for each class
def get_common_words(texts, n=20):
    words = [word.lower() for text in texts for word in text.split() if word.lower() not in stop_words]
    return Counter(words).most_common(n)

# Common words in human-written text
human_texts = data_balanced[data_balanced['label'] == 0]['text']
common_human_words = get_common_words(human_texts)
print("Most common words in human-written text:", common_human_words)

# Common words in AI-generated text
ai_texts = data_balanced[data_balanced['label'] == 1]['text']
common_ai_words = get_common_words(ai_texts)
print("Most common words in AI-generated text:", common_ai_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuaphilip/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Most common words in human-written text: [('students', 55759), ('would', 48539), ('people', 41462), ('school', 27249), ('could', 26929), ('get', 26310), ('like', 22552), ('one', 22019), ('help', 21930), ('also', 21016), ('make', 20329), ('many', 20134), ('think', 19650), ('car', 18988), ('cars', 17407), ('time', 17180), ('even', 16023), ('student', 15774), ('electoral', 15273), ('good', 14507)]
Most common words in AI-generated text: [('students', 33560), ('also', 23256), ('help', 21448), ('may', 19859), ('people', 18989), ('electoral', 18620), ('car', 17415), ('make', 17384), ('like,', 16090), ('important', 15424), ('college', 15303), ('school', 14217), ('would', 14021), ('one', 13455), ('could', 13270), ('time', 12808), ('limiting', 10892), ('lead', 10313), ('believe', 10002), ('usage', 9955)]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample a subset to avoid memory issues
sample_data = data_balanced['text'].sample(5000, random_state=42)  # Adjust if needed

# Define the vectorizer with reduced n-gram range and max features
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=500, stop_words='english')  # Bigrams only, limited to 500 features

# Fit and transform the sample data
X = vectorizer.fit_transform(sample_data)

# Get top n-grams with counts
ngram_counts = X.sum(axis=0).A1  # .A1 converts sparse matrix to array
ngram_features = vectorizer.get_feature_names()  # Use get_feature_names() for older versions
top_ngrams = sorted(zip(ngram_features, ngram_counts), key=lambda x: x[1], reverse=True)[:20]

# Display the top 20 n-grams
print("Top 20 n-grams:", top_ngrams)


Top 20 n-grams: [('electoral college', 3973), ('car usage', 2708), ('limiting car', 1957), ('popular vote', 1515), ('cell phones', 1133), ('driverless cars', 983), ('community service', 951), ('united states', 945), ('extracurricular activities', 845), ('high school', 740), ('distance learning', 673), ('cell phone', 646), ('public transportation', 603), ('air pollution', 599), ('summer projects', 595), ('help students', 591), ('traffic congestion', 589), ('online classes', 549), ('greenhouse gas', 528), ('electoral votes', 517)]


In [4]:
# Word count, sentence length features were calculated in the previous notebook. Display basic stats by class.
print("Word Count by Class:")
print(data_balanced.groupby('label')['word_count'].describe())

print("Sentence Count by Class:")
print(data_balanced.groupby('label')['sentence_count'].describe())


Word Count by Class:
         count        mean         std    min    25%    50%    75%     max
label                                                                     
0      17497.0  416.402469  188.217259  143.0  272.0  382.0  518.0  1656.0
1      17497.0  329.398983   94.256529    4.0  274.0  328.0  386.0   818.0
Sentence Count by Class:
         count       mean        std  min   25%   50%   75%    max
label                                                             
0      17497.0  20.593359  10.113202  0.0  13.0  19.0  26.0  216.0
1      17497.0  17.198891   6.097075  0.0  13.0  17.0  20.0  122.0


In [5]:
# Define a function to count punctuation
import string

def count_punctuation(text):
    return {p: text.count(p) for p in string.punctuation}

# Apply to each text entry
data_balanced['punctuation_counts'] = data_balanced['text'].apply(count_punctuation)

# Summarize average punctuation usage by class
punctuation_summary = data_balanced.groupby('label')['punctuation_counts'].apply(lambda x: pd.DataFrame(x.tolist()).mean())
print("Average punctuation usage by class:")
print(punctuation_summary)


Average punctuation usage by class:
label   
0      !    0.274047
       "    2.730754
       #    0.007944
       $    0.042122
       %    0.115105
              ...   
1      `    0.000686
       {    0.000572
       |    0.000000
       }    0.000572
       ~    0.000114
Name: punctuation_counts, Length: 64, dtype: float64


In [None]:

from textblob import TextBlob
import pandas as pd

# Sample data for demonstration, using the balanced dataset
data_balanced = pd.read_csv('balanced_dataset.csv')  # Load the balanced dataset

# Define a function to extract POS counts
def get_pos_counts(text):
    blob = TextBlob(text)
    pos_counts = {"NOUN": 0, "VERB": 0, "ADJ": 0, "ADV": 0}
    for word, pos in blob.tags:
        if pos.startswith("NN"):  # Nouns
            pos_counts["NOUN"] += 1
        elif pos.startswith("VB"):  # Verbs
            pos_counts["VERB"] += 1
        elif pos.startswith("JJ"):  # Adjectives
            pos_counts["ADJ"] += 1
        elif pos.startswith("RB"):  # Adverbs
            pos_counts["ADV"] += 1
    return pos_counts

# Apply POS tagging to the dataset and store results in new columns
data_balanced["pos_counts"] = data_balanced["text"].apply(get_pos_counts)

# Convert POS dictionary to separate columns for easy analysis
pos_df = pd.json_normalize(data_balanced["pos_counts"])
data_balanced = pd.concat([data_balanced, pos_df], axis=1)

# Check the average POS counts by class
print("Average POS counts by class:")
print(data_balanced.groupby('label')[["NOUN", "VERB", "ADJ", "ADV"]].mean())


Average POS counts by class:
            NOUN       VERB        ADJ        ADV
label                                            
0      99.620278  83.532263  30.493056  24.928673
1      88.519118  61.385037  31.467909  16.397897


In [13]:
import textstat

# Function to calculate readability score
data_balanced['readability_score'] = data_balanced['text'].apply(textstat.flesch_reading_ease)

# Describe readability scores by class
print("Readability Scores by Class:")
print(data_balanced.groupby('label')['readability_score'].describe())


Readability Scores by Class:
         count       mean        std     min    25%    50%    75%     max
label                                                                    
0      17497.0  69.284005  15.872008 -446.85  63.22  70.43  77.57  103.73
1      17497.0  55.589779  15.413828    4.58  44.34  54.02  64.88  102.41


In [14]:
from textblob import TextBlob

# Type-Token Ratio (TTR)
def ttr(text):
    words = text.split()
    return len(set(words)) / len(words) if len(words) > 0 else 0

data_balanced['ttr'] = data_balanced['text'].apply(ttr)

# Lexical Density (ratio of unique content words to all words)
def lexical_density(text):
    blob = TextBlob(text)
    content_words = [word for word, pos in blob.tags if pos.startswith(('NN', 'VB', 'JJ', 'RB'))]
    return len(content_words) / len(blob.words) if len(blob.words) > 0 else 0

data_balanced['lexical_density'] = data_balanced['text'].apply(lexical_density)

# Summarize by class
print("Vocabulary Richness by Class:")
print(data_balanced.groupby('label')[['ttr', 'lexical_density']].describe())



Vocabulary Richness by Class:
           ttr                                                              \
         count      mean       std       min       25%       50%       75%   
label                                                                        
0      17497.0  0.501099  0.084421  0.056763  0.442822  0.501377  0.559796   
1      17497.0  0.529742  0.092877  0.127660  0.461972  0.518950  0.596078   

                lexical_density                                          \
            max           count      mean       std       min       25%   
label                                                                     
0      0.774566         17497.0  0.566288  0.036557  0.427778  0.542279   
1      1.000000         17497.0  0.592393  0.047120  0.431373  0.558824   

                                     
            50%       75%       max  
label                                
0      0.565543  0.589091  0.984091  
1      0.587571  0.624434  1.050761  


In [16]:
# Average Word Length
import numpy as np

def average_word_length(text):
    words = text.split()
    return np.mean([len(word) for word in words]) if len(words) > 0 else 0

data_balanced['avg_word_length'] = data_balanced['text'].apply(average_word_length)


In [17]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Stop Word Ratio
def stop_word_ratio(text):
    words = text.split()
    stop_word_count = sum(1 for word in words if word.lower() in stop_words)
    return stop_word_count / len(words) if len(words) > 0 else 0

data_balanced['stop_word_ratio'] = data_balanced['text'].apply(stop_word_ratio)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuaphilip/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
from textblob import TextBlob

# POS Diversity
def pos_diversity(text):
    blob = TextBlob(text)
    pos_tags = [pos for word, pos in blob.tags]
    unique_pos_tags = set(pos_tags)
    return len(unique_pos_tags) / len(pos_tags) if len(pos_tags) > 0 else 0

data_balanced['pos_diversity'] = data_balanced['text'].apply(pos_diversity)


In [19]:
# 4. Unique Word Ratio
def unique_word_ratio(text):
    words = text.split()
    return len(set(words)) / len(words) if len(words) > 0 else 0

data_balanced['unique_word_ratio'] = data_balanced['text'].apply(unique_word_ratio)


In [20]:
# 6. Word Entropy
def word_entropy(text):
    words = text.split()
    word_freq = Counter(words)
    total_words = sum(word_freq.values())
    return -sum((freq / total_words) * np.log2(freq / total_words) for freq in word_freq.values())

data_balanced['word_entropy'] = data_balanced['text'].apply(word_entropy)

In [21]:
# 3. Readability Indices
# Add advanced readability metrics
import textstat
data_balanced['gunning_fog'] = data_balanced['text'].apply(textstat.gunning_fog)
data_balanced['smog_index'] = data_balanced['text'].apply(textstat.smog_index)


In [22]:

# 2. Bigram and Trigram Counts
# Add bigram and trigram frequency features
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), stop_words='english', max_features=500)
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3), stop_words='english', max_features=500)

bigram_matrix = vectorizer_bigram.fit_transform(data_balanced['text'])
trigram_matrix = vectorizer_trigram.fit_transform(data_balanced['text'])

data_balanced['bigram_count'] = bigram_matrix.sum(axis=1).A1
data_balanced['trigram_count'] = trigram_matrix.sum(axis=1).A1

In [23]:
def semantic_density(text):
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    if not sentences:
        return 0
    avg_sentence_length = np.mean([len(s.split()) for s in sentences])
    return len(set(text.split())) / avg_sentence_length if avg_sentence_length > 0 else 0

data_balanced['semantic_density'] = data_balanced['text'].apply(semantic_density)


In [24]:
def avg_sentence_complexity(text):
    sentences = text.split('.')
    clause_counts = [sentence.count(',') + 1 for sentence in sentences]
    return np.mean(clause_counts) if len(clause_counts) > 0 else 0

data_balanced['avg_sentence_complexity'] = data_balanced['text'].apply(avg_sentence_complexity)


In [25]:
def burstiness(text):
    sentence_lengths = [len(sentence.split()) for sentence in text.split('.')]
    if len(sentence_lengths) == 0:
        return 0
    mean_length = np.mean(sentence_lengths)
    std_dev = np.std(sentence_lengths)
    return std_dev / mean_length if mean_length > 0 else 0

data_balanced['burstiness'] = data_balanced['text'].apply(burstiness)


In [26]:
data_balanced.to_csv('processed_data_with_features.csv', index=False)
