In [1]:
import os
import glob
import PyPDF2
import pandas as pd
from collections import Counter, defaultdict
from nltk.util import ngrams
import re

In [2]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text


In [3]:
def tokenize(text):
    # regex tokenizer (safer than nltk.word_tokenize)
    return re.findall(r'\b[a-zA-Z]+\b', text.lower())

In [4]:
def generate_ngrams(tokens, n):
    return [' '.join(gram) for gram in ngrams(tokens, n)]

In [5]:
pdf_folder = "batchspdf/"
pdf_files = glob.glob(os.path.join(pdf_folder, "*.pdf"))
csv_folder = "csv/"   # subfolder for saving CSVs   
os.makedirs(csv_folder, exist_ok=True)


In [6]:
aggregate_counts = defaultdict(lambda: {"total_freq": 0, "doc_count": 0})

In [7]:
for pdf in pdf_files:
    text = extract_text_from_pdf(pdf)
    tokens = tokenize(text)
    
    # per-document counters
    doc_counters = {
        "unigrams": Counter(tokens),
        "bigrams": Counter(generate_ngrams(tokens, 2)),
        "trigrams": Counter(generate_ngrams(tokens, 3)),
        "quadgrams": Counter(generate_ngrams(tokens, 4))
    }
    
    # update aggregate
    for category, counter in doc_counters.items():
        for term, freq in counter.items():
            aggregate_counts[(category, term)]["total_freq"] += freq
            aggregate_counts[(category, term)]["doc_count"] += 1
    
    # save per-document CSV
    rows_doc = []
    for category, counter in doc_counters.items():
        for term, freq in counter.items():
            rows_doc.append([category, term, freq])
    df_doc = pd.DataFrame(rows_doc, columns=["Category", "Term", "Frequency"])
    pdf_name = os.path.basename(pdf).replace(".pdf", "")
    df_doc.to_csv(os.path.join(csv_folder, f"{pdf_name}_ngrams.csv"), index=False)

PdfReadError: Missed the stop code in LZWDecode!

In [None]:
rows = []
for (category, term), data in aggregate_counts.items():
    total = data["total_freq"]
    docs = data["doc_count"]
    avg = total / docs if docs > 0 else 0
    rows.append([category, term, total, docs, avg])

df = pd.DataFrame(rows, columns=["Category", "Term", "Total_Frequency", "Doc_Count", "Avg_Freq_per_Doc"])
df.to_csv(os.path.join(csv_folder, "ngrams_summary.csv"), index=False)

print(df)

        Category                                  Term  Total_Frequency  \
0       unigrams                                 ipynb               38   
1       unigrams                          colaboratory               38   
2       unigrams                                 https               81   
3       unigrams                                 colab               39   
4       unigrams                              research               47   
...          ...                                   ...              ...   
59337  quadgrams                 to get required value                1   
59338  quadgrams               get required value thus                1   
59339  quadgrams          required value thus standard                1   
59340  quadgrams          value thus standard sampling                1   
59341  quadgrams  thus standard sampling distributions                1   

       Doc_Count  Avg_Freq_per_Doc  
0              1         38.000000  
1              1         