In [1]:
import os
import csv
import pandas as pd
from nltk.util import ngrams
from collections import Counter
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [2]:
def get_csv_paths(folder_path, recursive=False):
    """Collect the paths of the ``.csv`` files found under ``folder_path``.

    Args:
        folder_path: Directory to scan.
        recursive: If True, walk every subdirectory with ``os.walk``; if False,
            only look at the entries directly inside ``folder_path``.

    Returns:
        A sorted list of paths, each formed by joining the containing directory
        with the file name. Sorting makes the processing order reproducible,
        since the directory listing itself comes back in arbitrary order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot be
            listed (non-recursive mode only).
    """
    if recursive:
        # os.walk already separates directories from files, so only the
        # extension needs checking here.
        file_paths = [
            os.path.join(root, name)
            for root, _, names in os.walk(folder_path)
            for name in names
            if name.endswith('.csv')
        ]
    else:
        # os.listdir returns files and directories alike; keep regular files
        # only so a directory that happens to be named "*.csv" is not returned.
        file_paths = [
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if name.endswith('.csv')
            and os.path.isfile(os.path.join(folder_path, name))
        ]

    return sorted(file_paths)

In [22]:
def calc_ngrams(texts, min_n, max_n):
    """Count word n-grams of every order from ``min_n`` to ``max_n`` in ``texts``.

    Each text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (``str.isalpha``) are discarded before the
    n-grams are built, so an n-gram may bridge removed punctuation or numbers.

    Args:
        texts: Iterable of documents. Entries that are not ``str`` (for example
            the NaN values pandas produces for empty cells) are skipped.
        min_n: Smallest n-gram order to count (inclusive).
        max_n: Largest n-gram order to count (inclusive).

    Returns:
        A dict with one key per order ``n`` in ``[min_n, max_n]``, each mapping
        to a ``Counter`` keyed by n-gram tuple. Every order is present even
        when ``texts`` is empty.
    """
    # Create all counters once instead of re-checking for them on every text.
    ngrams_frequencies = {n: Counter() for n in range(min_n, max_n + 1)}

    for text in tqdm(texts):
        # A non-string entry has no .lower(); skip it rather than abort the run.
        if not isinstance(text, str):
            continue

        words_only = [
            token for token in word_tokenize(text.lower()) if token.isalpha()
        ]

        for n, counter in ngrams_frequencies.items():
            counter.update(ngrams(words_only, n))

    return ngrams_frequencies

In [23]:
def save_ngrams_to_csv(ngrams_frequencies, csv_filename):
    """Write n-gram counts to a UTF-8 CSV with columns ``n, ngram, frequency``.

    Args:
        ngrams_frequencies: Mapping from n-gram order ``n`` to a mapping of
            n-gram (sequence of words) to its count.
        csv_filename: Destination file path. Missing parent directories are
            created; an existing file is overwritten.

    Rows are grouped by ``n`` in the mapping's iteration order and, within each
    group, sorted by descending count. The words of an n-gram are joined with a
    single space.
    """
    # open(..., "w") does not create directories, so make sure the target
    # folder exists. A bare file name has no parent directory to create.
    parent_dir = os.path.dirname(csv_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["n", "ngram", "frequency"])

        for n, counter in ngrams_frequencies.items():
            by_count_desc = sorted(
                counter.items(), key=lambda item: item[1], reverse=True
            )
            writer.writerows(
                [n, " ".join(ngram), count] for ngram, count in by_count_desc
            )

In [24]:
# Input folder whose top-level CSV files are processed (scanned non-recursively).
# Presumably holds the human-written texts, going by the name.
DATA_HUMAN_PATH = "../data/data_human"
# Input folder scanned recursively for CSV files.
# Presumably holds the AI-generated texts, going by the name.
DATA_AI_PATH = "../data/data_ai"
# Root folder under which the per-file n-gram CSVs are written.
NGRAMS_PATH = "../data/ngrams/"
# Smallest and largest n-gram order to count; both bounds are inclusive.
MIN_N = 1
MAX_N = 4

In [25]:
# Every input CSV: the top level of the human folder first, then the whole
# tree under the AI folder.
paths = [
    *get_csv_paths(DATA_HUMAN_PATH),
    *get_csv_paths(DATA_AI_PATH, recursive=True),
]

In [26]:
# Upper bound on how many texts are read from each input file.
MAX_TEXTS_PER_FILE = 3000

for path in paths:
    # Take the path apart with os.path rather than splitting on "/", so paths
    # built by os.path.join / os.walk work with any platform separator.
    source_dir, file_name = os.path.split(path)
    # Swap only the extension; str.replace would touch every ".csv" in the name.
    output_name = os.path.splitext(file_name)[0] + "_ngrams.csv"

    if file_name.endswith("_human.csv"):
        # "*_human.csv" files: mirror only the immediate parent folder.
        output_dirs = [os.path.basename(source_dir)]
    else:
        # All other files: mirror the last two folder levels of the input path.
        output_dirs = [
            os.path.basename(os.path.dirname(source_dir)),
            os.path.basename(source_dir),
        ]
    ngrams_path = os.path.join(NGRAMS_PATH, *output_dirs, output_name)

    df = pd.read_csv(path)
    texts = df["text"].values[:MAX_TEXTS_PER_FILE]
    ngrams_frequencies = calc_ngrams(texts, MIN_N, MAX_N)

    save_ngrams_to_csv(ngrams_frequencies, ngrams_path)

100%|██████████| 3000/3000 [00:07<00:00, 404.59it/s]
100%|██████████| 3000/3000 [00:13<00:00, 228.03it/s]
100%|██████████| 3000/3000 [00:04<00:00, 729.41it/s]
100%|██████████| 3000/3000 [00:00<00:00, 11196.73it/s]
100%|██████████| 3000/3000 [00:00<00:00, 3231.09it/s]
100%|██████████| 3000/3000 [00:01<00:00, 2253.01it/s]
100%|██████████| 3000/3000 [00:04<00:00, 679.56it/s] 
100%|██████████| 3000/3000 [00:00<00:00, 6316.19it/s]
100%|██████████| 2638/2638 [00:27<00:00, 97.61it/s] 
100%|██████████| 384/384 [00:00<00:00, 423.33it/s]
100%|██████████| 384/384 [00:00<00:00, 1182.84it/s]
100%|██████████| 24/24 [00:00<00:00, 305.94it/s]
100%|██████████| 24/24 [00:00<00:00, 3552.49it/s]
100%|██████████| 24/24 [00:00<00:00, 1257.95it/s]
100%|██████████| 24/24 [00:00<00:00, 3028.11it/s]
100%|██████████| 24/24 [00:00<00:00, 1244.39it/s]
100%|██████████| 24/24 [00:00<00:00, 660.13it/s]
100%|██████████| 24/24 [00:00<00:00, 473.70it/s]
