In [21]:
import os
import csv
import nltk
import spacy
import textstat
import pandas as pd
from collections import Counter
from tqdm import tqdm

# One-time setup: uncomment the downloads below on a fresh environment to fetch the required NLTK data
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Load the small English spaCy pipeline once at import time; it is shared by
# syntactic_features() and semantic_features() through the module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [80]:
# Lexical Features
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, built on first use only."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def lexical_features(text):
    """Compute token-level surface statistics for ``text``.

    Args:
        text: The string to analyse; it is tokenised with NLTK's
            ``word_tokenize`` and ``sent_tokenize``.

    Returns:
        dict with the keys ``word_count``, ``character_count`` (characters
        inside tokens, so whitespace is not counted), ``average_word_length``,
        ``sentence_count``, ``unique_words_ratio`` (case-sensitive) and
        ``stopword_ratio`` (case-insensitive match against the English list).
        When the text yields no tokens, every ratio/average is ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    word_count = len(words)
    character_count = sum(len(word) for word in words)

    if word_count == 0:
        # Empty / whitespace-only input: there is nothing to divide by.
        return {
            "word_count": 0,
            "character_count": 0,
            "average_word_length": 0.0,
            "sentence_count": len(sentences),
            "unique_words_ratio": 0.0,
            "stopword_ratio": 0.0,
        }

    stop_words = _english_stopwords()
    stopword_count = sum(1 for word in words if word.lower() in stop_words)

    return {
        "word_count": word_count,
        "character_count": character_count,
        "average_word_length": character_count / word_count,
        "sentence_count": len(sentences),
        "unique_words_ratio": len(set(words)) / word_count,
        "stopword_ratio": stopword_count / word_count,
    }

# Syntactic Features
def syntactic_features(text):
    """Compute part-of-speech ratios and sentence statistics with the spaCy pipeline.

    Args:
        text: The string to parse with the module-level ``nlp`` pipeline.

    Returns:
        dict with ``noun_ratio``, ``verb_ratio`` and ``adjective_ratio``
        (share of tokens tagged NOUN / VERB / ADJ), ``average_sentence_length``
        (mean number of whitespace-separated chunks per sentence) and
        ``entity_count``. Ratios and the average are ``0.0`` when the parsed
        document has no tokens / no sentences, instead of raising
        ``ZeroDivisionError``.
    """
    doc = nlp(text)
    token_count = len(doc)
    pos_counts = Counter(token.pos_ for token in doc)

    # Walk the sentence iterator once and keep the lengths, rather than
    # iterating doc.sents a second time just to count the sentences.
    sentence_lengths = [len(sent.text.split()) for sent in doc.sents]

    def ratio(tag):
        # Share of tokens carrying the given coarse POS tag; 0.0 for an empty doc.
        return pos_counts.get(tag, 0) / token_count if token_count else 0.0

    if sentence_lengths:
        average_sentence_length = sum(sentence_lengths) / len(sentence_lengths)
    else:
        average_sentence_length = 0.0

    return {
        "noun_ratio": ratio("NOUN"),
        "verb_ratio": ratio("VERB"),
        "adjective_ratio": ratio("ADJ"),
        "average_sentence_length": average_sentence_length,
        "entity_count": len(doc.ents),
    }

# Semantic Features
def semantic_features(text):
    """Extract the named entities that the spaCy pipeline finds in ``text``.

    Returns:
        dict with ``named_entities`` (a list of ``(entity text, entity label)``
        tuples in document order) and ``entity_count`` (length of that list).
    """
    parsed = nlp(text)

    named_entities = []
    for span in parsed.ents:
        named_entities.append((span.text, span.label_))

    return {
        "named_entities": named_entities,
        "entity_count": len(named_entities),
    }

# Readability Scores
def readability_features(text):
    """Score ``text`` with five textstat readability formulas.

    Returns:
        dict mapping each output column name to the value returned by the
        matching textstat function, in the order listed below.
    """
    # (output key, textstat scorer) pairs. The key spelling
    # "dale_chall_readbility" is kept exactly as-is because it is the column
    # name written to the feature files.
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("gunning_fog_index", textstat.gunning_fog),
        ("smog_index", textstat.smog_index),
        ("automated_readability_index", textstat.automated_readability_index),
        ("dale_chall_readbility", textstat.dale_chall_readability_score),
    )
    return {column: scorer(text) for column, scorer in scorers}

# Stylometric Features
def stylometric_features(text):
    """Count word n-grams and basic punctuation marks in ``text``.

    Returns:
        dict with ``bigram_count`` and ``trigrams_count`` (number of word
        bigrams / trigrams over the NLTK tokens) and ``punctuation_count``
        (occurrences of the characters ``. , ; ! ?`` in the raw text).
    """
    tokens = word_tokenize(text)

    # Count the n-grams as they are generated instead of storing them.
    n_bigrams = sum(1 for _ in nltk.bigrams(tokens))
    n_trigrams = sum(1 for _ in nltk.trigrams(tokens))

    # Punctuation is counted on the raw string, not on the tokens.
    n_punctuation = sum(text.count(mark) for mark in ".,;!?")

    return {
        "bigram_count": n_bigrams,
        "trigrams_count": n_trigrams,
        "punctuation_count": n_punctuation,
    }

# Combine all features for one text
def extract_features_single_text(text):
    """Merge the lexical, readability and stylometric features of ``text`` into one dict.

    The groups are merged in that order, so a key produced by a later group
    would override the same key from an earlier one.
    """
    # syntactic_features(text) and semantic_features(text) are intentionally
    # not merged in: the syntactic pass takes the majority of compute time.
    return {
        **lexical_features(text),
        **readability_features(text),
        **stylometric_features(text),
    }


In [81]:
def calc_features(texts):
    """Extract features for every text and return them as a DataFrame.

    Args:
        texts: Iterable of strings; iteration is wrapped in a tqdm progress bar.

    Returns:
        pandas.DataFrame with one row per text (in input order) and one
        column per feature returned by ``extract_features_single_text``.
    """
    rows = [extract_features_single_text(sample) for sample in tqdm(texts)]
    return pd.DataFrame(rows)

In [None]:
def get_csv_paths(folder_path, recursive=False):
    """List the ``.csv`` files found under ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: When True, descend into all subdirectories with ``os.walk``;
            otherwise only look at the entries directly inside ``folder_path``.

    Returns:
        Sorted list of paths (``folder_path`` joined with the relative file
        location). Sorting makes the result independent of the arbitrary
        order in which the operating system lists directory entries, so
        repeated runs process the files in the same order.
    """
    if recursive:
        # os.walk already separates files from directories.
        candidates = (
            os.path.join(root, name)
            for root, _, names in os.walk(folder_path)
            for name in names
        )
    else:
        # os.listdir returns directories too; keep regular files only so a
        # folder that happens to be named "something.csv" is not returned.
        candidates = (
            entry_path
            for entry_path in (os.path.join(folder_path, name) for name in os.listdir(folder_path))
            if os.path.isfile(entry_path)
        )

    return sorted(path for path in candidates if path.endswith('.csv'))

In [None]:
def save_feature_stats(df, stats, data_path, save_path):
    """Aggregate ``df`` and append the resulting statistics to a CSV file.

    Args:
        df: DataFrame of features to aggregate.
        stats: Aggregations passed to ``DataFrame.agg`` (names or callables);
            each one becomes a row labelled in the ``stat`` column.
        data_path: Path of the source data file. Its file name must look like
            ``<data>_<model>.csv``; the text after the LAST underscore is the
            model, everything before it is the data name (so data names may
            themselves contain underscores).
        save_path: CSV file to append to. The header row is written only when
            the file does not exist yet or is empty. A missing parent
            directory is created.

    Raises:
        ValueError: If the file name of ``data_path`` contains no underscore.
    """
    file_name = os.path.basename(data_path)
    data_name, separator, model = file_name.rpartition("_")
    if not separator:
        raise ValueError(
            f"Cannot derive data name and model from {file_name!r}: expected '<data>_<model>.csv'"
        )
    model = model.removesuffix(".csv")

    # Same column order as before: stat, <feature columns>, model, data.
    df_stat = (
        df.agg(stats)
        .reset_index()
        .rename(columns={"index": "stat"})
        .assign(model=model, data=data_name)
    )

    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # An existing but empty file still needs its header row.
    write_header = not os.path.exists(save_path) or os.path.getsize(save_path) == 0
    df_stat.to_csv(save_path, mode="a", index=False, header=write_header)

In [84]:
def percentile(n):
    """Build a named aggregation function that returns the ``n``-quantile.

    Args:
        n: Quantile as a fraction (e.g. ``0.1`` for the 10th percentile).

    Returns:
        A callable that takes an object with a ``quantile`` method and returns
        ``x.quantile(n)``. Its ``__name__`` is ``percentile_XX`` (``n * 100``
        rounded to a whole number, zero-padded to two digits), so every
        percentile gets its own label when used as an aggregation.
    """
    label = 'percentile_{:02.0f}'.format(n * 100)

    def percentile_(series):
        return series.quantile(n)

    percentile_.__name__ = label
    return percentile_

In [85]:
# Input folders that are scanned for CSV files.
DATA_HUMAN_PATH = "../data/data_human"
DATA_AI_PATH = "../data/data_ai"

# Output locations: per-file feature tables and the single appended stats file.
FEATURES_PATH = "../data/features/"
STATS_PATH = "../data/features/features_stats_master.csv"

# Aggregations applied to every feature column: the built-in summary
# statistics followed by the nine deciles (0.1, 0.2, ..., 0.9).
STATS = [
    'mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis', 'var',
    *(percentile(decile / 10) for decile in range(1, 10)),
]

In [105]:
# Human CSVs are read from the top level of their folder only; the AI folder
# is searched recursively. Human paths come first in the combined list.
paths = [
    *get_csv_paths(DATA_HUMAN_PATH),
    *get_csv_paths(DATA_AI_PATH, recursive=True),
]

In [None]:
# Upper bound on the number of texts featurised per input file.
MAX_TEXTS_PER_FILE = 3000

# For every input CSV: extract features for its first MAX_TEXTS_PER_FILE texts,
# write them to a mirrored location under FEATURES_PATH, and append the
# aggregated statistics to STATS_PATH.
# NOTE: the stats file is opened in append mode, so re-running this cell adds
# the same rows again; delete STATS_PATH first for a clean re-run.
for path in paths:
    source_dir, file_name = os.path.split(os.path.normpath(path))
    dir_parts = source_dir.split(os.sep)

    # Mirror the last directory level for "*_human.csv" files and the last
    # two levels for every other file, as the original layout did.
    levels = 1 if path.split("_")[-1] == "human.csv" else 2
    features_dir = os.path.join(FEATURES_PATH, *dir_parts[-levels:])
    features_path = os.path.join(features_dir, file_name.replace(".csv", "_features.csv"))

    # DataFrame.to_csv does not create missing directories, so create the
    # output folder up front instead of failing after the expensive extraction.
    os.makedirs(features_dir, exist_ok=True)

    df = pd.read_csv(path)
    texts = df["text"].values[:MAX_TEXTS_PER_FILE]
    df_features = calc_features(texts)
    df_features.to_csv(features_path, index=False)

    save_feature_stats(df_features, STATS, path, STATS_PATH)

100%|██████████| 3000/3000 [00:26<00:00, 111.72it/s]
100%|██████████| 3000/3000 [00:38<00:00, 77.69it/s] 
100%|██████████| 3000/3000 [00:14<00:00, 205.80it/s]
100%|██████████| 3000/3000 [00:01<00:00, 1932.83it/s]
100%|██████████| 3000/3000 [00:03<00:00, 913.13it/s] 
100%|██████████| 3000/3000 [00:05<00:00, 523.53it/s]
100%|██████████| 3000/3000 [00:15<00:00, 192.25it/s]
100%|██████████| 3000/3000 [00:02<00:00, 1381.62it/s]
100%|██████████| 2638/2638 [00:35<00:00, 74.48it/s] 
100%|██████████| 384/384 [00:02<00:00, 148.52it/s]
100%|██████████| 384/384 [00:01<00:00, 340.23it/s]
100%|██████████| 24/24 [00:00<00:00, 103.95it/s]
100%|██████████| 24/24 [00:00<00:00, 1130.51it/s]
100%|██████████| 24/24 [00:00<00:00, 444.72it/s]
100%|██████████| 24/24 [00:00<00:00, 815.84it/s]
100%|██████████| 24/24 [00:00<00:00, 434.17it/s]
100%|██████████| 24/24 [00:00<00:00, 177.39it/s]
100%|██████████| 24/24 [00:00<00:00, 139.46it/s]
