In [8]:
import pandas as pd
from pypdf import PdfReader
import string
import os
import re


In [9]:
def fix_text(raw_text: str) -> str:
    """Tidy the spacing of text extracted from a PDF page.

    The clean-up runs in three stages:

    1. every run of whitespace (including newlines and tabs) is collapsed to a
       single space and leading/trailing whitespace is dropped;
    2. the space before ``.``, ``,``, ``)`` and ``-`` and the space after ``(``
       is removed;
    3. an isolated single letter is glued onto the text that precedes it.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text on a single line.
    """
    def fix_letter_spacing(text):
        """Join a lone letter (followed by a space or a period) to the preceding text."""
        # 'A', 'a', 'I' and 'i' are left alone -- presumably because they are
        # legitimate one-letter words.
        untouched = ('A', 'a', 'I', 'i')

        for letter in string.ascii_letters:
            if letter in untouched:
                continue
            # Same order as before: the space-terminated form first, then the
            # period-terminated form.
            for terminator in (" ", "."):
                text = text.replace(
                    f" {letter}{terminator}", f"{letter}{terminator}")

        return text

    # split() without arguments already discards leading/trailing whitespace,
    # so re-joining on a single space both trims and collapses the text.
    cleaned = " ".join(raw_text.split())

    # (pattern, replacement) pairs, applied strictly in this order.
    punctuation_fixes = (
        (" .", "."),
        (" ,", ","),
        (" )", ")"),
        ("( ", "("),
        (" -", "-"),
    )
    for pattern, replacement in punctuation_fixes:
        cleaned = cleaned.replace(pattern, replacement)

    return fix_letter_spacing(cleaned)


In [10]:

# Get all the document names
doc_directory = "PDFS"
# NOTE(review): `names` is not referenced again in the visible cells -- confirm before removing.
names = os.listdir(doc_directory)

# initialize dictionary
# KEY: DOCUMENT NAME (file name inside doc_directory)
# VALUE (1-indexed): (START PAGE INCLUSIVE, END PAGE EXCLUSIVE)


docs: dict[str, tuple[int, int]] = {'Bridging Cognition and Socioculturalism Within Conceptual Change Research- Unnecessary Foray or Unachievable Feat.pdf': (2, 6),
                               'Cognitive Affective Engagement Model of Multiple Source Use.pdf': (2, 14),
                               'Confronting the Challenges of Undergraduates’ Argumentation Writing in a “Learning How to Learn” Course.pdf': (2, 30),
                               'Engagement and literacy- reading between the lines.pdf': (2, 7),
                               'Evolution of a Learning Theory- In Praise of Scientific Speculation.pdf': (2, 18),
                               'Hybridizing Psychological Theories- Weighing the Ends Against the Means.pdf': (2, 11),
                               'Individual differences in college-age learners- The importance of relational reasoning for learning and assessment in higher education.pdf': (2, 10),
                               'Investing a Novel Approach to Assessing Vocabulary Knowledge.pdf': (3, 33),
                               'Leveraging What Students Know to Make Sense of Texts- What the Research Says About Prior Knowledge Activation.pdf': (2, 31),
                               'Looking down the road- Future directions for research on depth and regulation of strategic processing.pdf': (2, 13),
                               'Relational Reasoning in Tertiary Education- What Is Its Value and How Can It Be Assessed and Trained.pdf': (2, 12),
                               'RR INSTRUCTION MANUAL.pdf': (2, 9),
                               'Seeking Common Ground- Surveying the Theoretical and Empirical Landscapes for Curiosity and Interest.pdf': (2, 8),
                               'Shared Discursive History- Rethinking Teachers as Role Models.pdf': (2, 22),
                               'The Effects of Processing Multimodal Texts in Print and Digitally on Comprehension and Calibration.pdf': (2, 19),
                               'The Relevance of Relevance for Learning and Performance.pdf': (2, 11),
                               'Through Myth to Reality- Reframing Education as Academic Development.pdf': (2, 16),
                               'What is Learning Anyway- A Topological Perspetive Considered.pdf': (2, 15),
                               'What Research Has Revealed About Readers’ Struggles With Comprehension in the Digital Age- Moving Beyond the Phonics Versus Whole Language Debate.pdf': (2, 7),
                               'Why This and Why Now- Introduction to the Special Issue on Metacognition, Self-Regulation, and Self-Regulated Learning.pdf': (2, 4),
                               'Yes…But- Footnotes To Sage Advice.pdf': (2, 7),
                               # NOTE(review): 'apple' is reported as "document not found" by the
                               # existence check that follows this dictionary and is dropped there;
                               # presumably a deliberate dummy entry -- confirm.
                               'apple': (1, 1),
                               '“Here Be Dragons!” Mapping the Realm of Higher-Order, Critical, and Critical-Analytic Thinking.pdf': (2, 15),

                               }

# Snapshot of the configured names, taken before anything is removed from `docs`.
doc_names = list(docs.keys())

# Pair every configured name with a flag telling whether it exists in doc_directory.
exists = [
    (doc_name, os.path.exists(os.path.join(doc_directory, doc_name)))
    for doc_name in doc_names
]

# Report every missing document and remove its entry from the configuration.
for name, status in exists:
    if not status:
        print(f'document not found: {name}')
        del docs[name]

# Refresh the list so it only holds the documents that are still configured.
doc_names = list(docs.keys())


document not found: apple


In [11]:
def pdfToCSV(pdfFile: str, min_words: int = 12, max_words: int = 100) -> int:
    """Extract sentences from one PDF's configured page range and save them as a CSV.

    The page range is looked up in the module-level ``docs`` dictionary
    (1-indexed, start inclusive, end exclusive). Each page is extracted,
    cleaned with ``fix_text``, split into sentences, and the sentences whose
    word count lies in ``[min_words, max_words]`` are written to
    ``CSVS/<name>.csv`` in a single ``SENTENCES`` column.

    Sentences are split page by page, so a sentence that continues on the next
    page is split at the page break. The split consumes the delimiter, so a
    stored sentence generally lacks its closing ``.`` or ``?``.

    Args:
        pdfFile: File name of the PDF inside ``doc_directory``; must be a key
            of ``docs``.
        min_words: Smallest number of whitespace-separated words a sentence
            may have to be kept (inclusive).
        max_words: Largest number of words a sentence may have to be kept
            (inclusive).

    Returns:
        The number of sentences written, or 0 if the configured page numbers
        are not integers.

    Raises:
        KeyError: If ``pdfFile`` is not a key of ``docs``.
    """
    start_page = docs[pdfFile][0]
    end_page = docs[pdfFile][1]
    if not isinstance(start_page, int) or not isinstance(end_page, int):
        print("Start page and end page must be integers.")
        return 0

    # Build the path from doc_directory, the same way the existence check
    # does, instead of hard-coding the folder name a second time.
    reader = PdfReader(os.path.join(doc_directory, pdfFile))

    # Convert the 1-indexed [start, end) range into a 0-indexed slice.
    pages = reader.pages[start_page - 1: end_page - 1]

    sentences = []
    for page in pages:
        # Defensive: treat a page that yields no text as an empty string.
        fixed_text = fix_text(page.extract_text() or "")

        # Split after a period or a question mark. fix_text leaves no
        # consecutive whitespace, so '[.?]' followed by whitespace is the only
        # delimiter that can occur.
        split_sentences = re.split(r'[.?]\s+', fixed_text)

        # Keep only sentences whose word count is within the requested bounds.
        sentences += [
            sentence for sentence in split_sentences
            if min_words <= len(sentence.split()) <= max_words]

    df = pd.DataFrame(sentences, columns=["SENTENCES"])

    # Swap only the file extension; str.replace would also rewrite a '.pdf'
    # that appears in the middle of a title.
    csvFile = os.path.splitext(pdfFile)[0] + '.csv'

    # Make sure the output folder exists before writing into it.
    os.makedirs("CSVS", exist_ok=True)
    output_file_path = f"CSVS/{csvFile}"
    df.to_csv(output_file_path, index=False)

    # One row per kept sentence; no need to read the file back to count them.
    num_rows = len(df)
    print(f'CSV Location:: {output_file_path}, Number of rows: {num_rows}')
    return num_rows

# test pdfToCSV


Testing pdfToCSV function

In [12]:
# Smoke test: convert a single document and display the number of rows written.
pdfToCSV(
    'Bridging Cognition and Socioculturalism Within Conceptual Change Research- '
    'Unnecessary Foray or Unachievable Feat.pdf'
)

CSV Location:: CSVS/Bridging Cognition and Socioculturalism Within Conceptual Change Research- Unnecessary Foray or Unachievable Feat.csv, Number of rows: 97


97

In [13]:
# Convert every configured document and add up the rows each conversion reports.
rows = sum(pdfToCSV(pdf) for pdf in doc_names)

print(f"Total rows: {rows}")


CSV Location:: CSVS/Bridging Cognition and Socioculturalism Within Conceptual Change Research- Unnecessary Foray or Unachievable Feat.csv, Number of rows: 97
CSV Location:: CSVS/Cognitive Affective Engagement Model of Multiple Source Use.csv, Number of rows: 313
CSV Location:: CSVS/Confronting the Challenges of Undergraduates’ Argumentation Writing in a “Learning How to Learn” Course.csv, Number of rows: 356
CSV Location:: CSVS/Engagement and literacy- reading between the lines.csv, Number of rows: 92
CSV Location:: CSVS/Evolution of a Learning Theory- In Praise of Scientific Speculation.csv, Number of rows: 272
CSV Location:: CSVS/Hybridizing Psychological Theories- Weighing the Ends Against the Means.csv, Number of rows: 146
CSV Location:: CSVS/Individual differences in college-age learners- The importance of relational reasoning for learning and assessment in higher education.csv, Number of rows: 144
CSV Location:: CSVS/Investing a Novel Approach to Assessing Vocabulary Knowledge.cs

Checking the number of lines parsed into each CSV.

In [14]:

# Directory containing CSV files
csv_directory = 'CSVS'

# Select the CSV files first, in sorted order, so that the numbering printed
# below has no gaps when other files are present and is the same on every run.
csv_filenames = sorted(
    filename for filename in os.listdir(csv_directory)
    if filename.endswith(".csv"))

total_rows = 0
for i, filename in enumerate(csv_filenames, start=1):
    # Construct the full file path
    file_path = os.path.join(csv_directory, filename)

    # Read the CSV into a DataFrame and count its rows
    df = pd.read_csv(file_path)
    num_rows = len(df)
    total_rows += num_rows

    # Output the result
    print(f'{i}. {filename}: {num_rows} rows')

print(f'TOTAL ROWS: {total_rows}')

1. Bridging Cognition and Socioculturalism Within Conceptual Change Research- Unnecessary Foray or Unachievable Feat.csv: 97 rows
2. Cognitive Affective Engagement Model of Multiple Source Use.csv: 313 rows
3. Confronting the Challenges of Undergraduates’ Argumentation Writing in a “Learning How to Learn” Course.csv: 356 rows
4. Engagement and literacy- reading between the lines.csv: 92 rows
5. Evolution of a Learning Theory- In Praise of Scientific Speculation.csv: 272 rows
6. Hybridizing Psychological Theories- Weighing the Ends Against the Means.csv: 146 rows
7. Individual differences in college-age learners- The importance of relational reasoning for learning and assessment in higher education.csv: 144 rows
8. Investing a Novel Approach to Assessing Vocabulary Knowledge.csv: 310 rows
9. Leveraging What Students Know to Make Sense of Texts- What the Research Says About Prior Knowledge Activation.csv: 458 rows
10. Looking down the road- Future directions for research on depth and reg