In [1]:
import os
import nltk
import spacy
import textstat
import pandas as pd
from collections import Counter
from tqdm import tqdm

# Download necessary NLTK data (uncomment and run these once in a fresh environment)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Load the `en_core_web_sm` spaCy pipeline once at module level; the syntactic and
# semantic feature extractors below call this shared `nlp` object on every text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Lexical Features
def lexical_features(text):
    """Compute word-level statistics for a single text.

    Args:
        text: The raw text to analyse.

    Returns:
        dict with the keys ``word_count``, ``character_count``,
        ``average_word_length``, ``sentence_count``, ``unique_words_ratio``
        and ``stopword_ratio``. The three ratio/average values are ``0.0``
        when the text yields no tokens (e.g. an empty or whitespace-only
        string) instead of raising ``ZeroDivisionError``.
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))

    word_count = len(words)
    # Sum of token lengths, so whitespace between tokens is not counted.
    character_count = sum(len(word) for word in words)
    # Stopword matching is case-insensitive; uniqueness below is case-sensitive.
    stopword_count = sum(1 for word in words if word.lower() in stop_words)

    def per_word(total):
        # Guard against texts that tokenize to nothing.
        return total / word_count if word_count else 0.0

    return {
        "word_count": word_count,
        "character_count": character_count,
        "average_word_length": per_word(character_count),
        "sentence_count": len(sentences),
        "unique_words_ratio": per_word(len(set(words))),
        "stopword_ratio": per_word(stopword_count),
    }

# Syntactic Features
def syntactic_features(text):
    """Compute part-of-speech ratios and sentence statistics with spaCy.

    Args:
        text: The raw text to analyse.

    Returns:
        dict with the keys ``noun_ratio``, ``verb_ratio``, ``adjective_ratio``
        (share of tokens tagged NOUN / VERB / ADJ), ``average_sentence_length``
        (mean number of whitespace-separated words per sentence) and
        ``entity_count`` (number of entries in ``doc.ents``). The ratios and
        the average are ``0.0`` when the document has no tokens or no
        sentences instead of raising ``ZeroDivisionError``.
    """
    doc = nlp(text)
    token_count = len(doc)
    pos_counts = Counter(token.pos_ for token in doc)

    # Collect sentence lengths in one pass; an empty document is not segmented at all.
    sentence_lengths = (
        [len(sent.text.split()) for sent in doc.sents] if token_count else []
    )

    def pos_ratio(tag):
        # Guard against documents without tokens.
        return pos_counts.get(tag, 0) / token_count if token_count else 0.0

    if sentence_lengths:
        average_sentence_length = sum(sentence_lengths) / len(sentence_lengths)
    else:
        average_sentence_length = 0.0

    return {
        "noun_ratio": pos_ratio("NOUN"),
        "verb_ratio": pos_ratio("VERB"),
        "adjective_ratio": pos_ratio("ADJ"),
        "average_sentence_length": average_sentence_length,
        "entity_count": len(doc.ents),
    }

# Semantic Features
def semantic_features(text):
    """Extract the named entities that spaCy finds in ``text``.

    Args:
        text: The raw text to analyse.

    Returns:
        dict with ``named_entities`` (a list of ``(entity text, entity label)``
        tuples in document order) and ``entity_count`` (length of that list).
    """
    parsed = nlp(text)

    entity_pairs = []
    for span in parsed.ents:
        entity_pairs.append((span.text, span.label_))

    return {
        "named_entities": entity_pairs,
        "entity_count": len(entity_pairs),
    }

# Readability Scores
def readability_features(text):
    """Score ``text`` with five readability formulas from ``textstat``.

    Args:
        text: The raw text to analyse.

    Returns:
        dict mapping each output key to the value returned by the matching
        ``textstat`` function, in the order listed below.
    """
    # (output key, textstat scorer) pairs, listed in output order.
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("gunning_fog_index", textstat.gunning_fog),
        ("smog_index", textstat.smog_index),
        ("automated_readability_index", textstat.automated_readability_index),
        # The key spelling ("readbility") is kept exactly as-is: it is an output key.
        ("dale_chall_readbility", textstat.dale_chall_readability_score),
    )
    return {key: score(text) for key, score in scorers}

# Stylometric Features
def stylometric_features(text):
    """Count n-grams and basic punctuation marks in ``text``.

    Args:
        text: The raw text to analyse.

    Returns:
        dict with ``bigram_count`` and ``trigrams_count`` (number of token
        bigrams / trigrams produced by NLTK) and ``punctuation_count``
        (occurrences of the characters ``. , ; ! ?`` in the raw text).
    """
    tokens = word_tokenize(text)

    # Exhaust the n-gram iterators just to count their items.
    bigram_total = sum(1 for _ in nltk.bigrams(tokens))
    trigram_total = sum(1 for _ in nltk.trigrams(tokens))

    # Punctuation is counted on the raw string, not on the tokens.
    punctuation_total = 0
    for mark in ".,;!?":
        punctuation_total += text.count(mark)

    return {
        "bigram_count": bigram_total,
        "trigrams_count": trigram_total,
        "punctuation_count": punctuation_total,
    }

# Combine all features for one text
def extract_features_single_text(text):
    """Merge the output of every enabled feature extractor for one text.

    Args:
        text: The raw text to analyse.

    Returns:
        A single flat dict. Extractors run in the order listed below, and a
        later extractor overwrites any key it shares with an earlier one.
    """
    extractors = (
        lexical_features,
        syntactic_features,  # takes majority of compute time
        # semantic_features is intentionally left out (disabled).
        readability_features,
        stylometric_features,
    )

    combined = {}
    for extract in extractors:
        combined.update(extract(text))
    return combined


In [7]:
def get_csv_path(folder_path, recursive=False):
    """List the paths of entries ending in ``.csv`` under ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: When true, walk all subdirectories with ``os.walk``;
            otherwise only look at the direct entries of ``folder_path``.

    Returns:
        list of paths, each built by joining the containing directory with the
        entry name. The order is whatever ``os.listdir`` / ``os.walk`` yields.
    """
    def is_csv(name):
        return name.endswith('.csv')

    if not recursive:
        # Root folder only.
        return [
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if is_csv(name)
        ]

    # Recursive: visit every directory below folder_path.
    found = []
    for root, _dirs, names in os.walk(folder_path):
        for name in names:
            if is_csv(name):
                found.append(os.path.join(root, name))
    return found

In [29]:
def calc_features(texts):
    """Extract features for every text and collect them in a DataFrame.

    Args:
        texts: Iterable of texts; it is wrapped in ``tqdm`` to show progress.

    Returns:
        pandas.DataFrame built from one feature dict per text, in input order.
    """
    rows = [extract_features_single_text(item) for item in tqdm(texts)]
    return pd.DataFrame(rows)

In [10]:
# Input folders: human-written CSVs sit directly in their folder, while the
# AI-generated CSVs are searched recursively through subfolders.
data_human_path = "../data/data_human"
data_ai_path = "../data/data_ai"

human_csv_paths = get_csv_path(data_human_path)
ai_csv_paths = get_csv_path(data_ai_path, recursive=True)
paths = human_csv_paths + ai_csv_paths
paths

['../data/data_human/xsum_human.csv',
 '../data/data_human/writingprompts_human.csv',
 '../data/data_human/raid_human.csv',
 '../data/data_human/nyt_articles_human.csv',
 '../data/data_human/tweets_human.csv',
 '../data/data_human/nyt_comments_human.csv',
 '../data/data_human/reddit_human.csv',
 '../data/data_human/blogs_human.csv',
 '../data/data_human/essays_human.csv',
 '../data/data_ai/blogs/blogs_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/nyt_articles/nyt_articles_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/writingprompts/writingprompts_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/tweets/tweets_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/nyt_comments/nyt_comments_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/reddit/reddit_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/xsum/xsum_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/raid/raid_Llama-3.2-1B-Instruct.csv',
 '../data/data_ai/essays/essays_Llama-3.2-1B-Instruct.csv']

In [None]:
# Compute a feature table for every input CSV and save it next to the input
# as "<name>_features.csv".
for path in paths:
    # Feature tables written by an earlier run also end in ".csv" and live in
    # the same folders, so a re-run would list them as inputs; they have no
    # "text" column and would raise KeyError below. Skip them.
    if path.endswith("_features.csv"):
        continue

    df = pd.read_csv(path)
    texts = df["text"]
    df_features = calc_features(texts)

    # splitext only touches the final extension, whereas str.replace would
    # rewrite every ".csv" occurrence anywhere in the path.
    path_root, path_ext = os.path.splitext(path)
    df_features.to_csv(path_root + "_features" + path_ext, index=False)