In [None]:
#To use this script, provide the path to a zip file containing PDF files and an output folder path. The script will extract text from the PDFs, clean and process the text, and save the aggregated data for topic modeling in a CSV file.

# Install necessary libraries (PyMuPDF is the package imported below as `fitz`).
# NOTE(review): a bare `pip install ...` line is not Python syntax; it presumably
# relies on Jupyter/IPython automagic and will not run in a plain .py script — confirm.
pip install PyMuPDF

import fitz  # PyMuPDF for working with PDF files
import zipfile  # Import the zipfile module to work with zip files
import os  # Import the os module for file and directory operations
import nltk  # Import the nltk module for natural language processing
import re  # Import the regular expressions library
from nltk.corpus import stopwords  # Import stopwords from nltk corpus
from nltk.tokenize import word_tokenize  # Import word_tokenize from nltk for tokenizing words
import pandas as pd  # Import pandas for data manipulation

# Ensure necessary NLTK data is downloaded.
# 'punkt' serves older NLTK releases; recent releases make word_tokenize load
# the 'punkt_tab' resource instead, so both are fetched to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def extract_3fullstopbase(zip_file_path, output_folder, csv_path='file_path', *,
                          custom_stop_words=None, replacements=None,
                          post_replacements=None):
    """Turn a zip of PDFs into cleaned text files and a three-lines-per-row CSV.

    The archive is unpacked into ``output_folder``. Every PDF found under that
    folder (recursively, skipping hidden files and ``__MACOSX`` folders) is
    converted to a cleaned ``.txt`` file written next to the PDF. The lines of
    the text files produced by this call are then joined three at a time into
    rows with the columns ``line``, ``filename`` and ``wordcount``; the
    resulting DataFrame is printed and saved as CSV.

    A PDF that fails to convert is reported on stdout and skipped.

    Args:
        zip_file_path: Path to the zip archive containing the PDF files.
        output_folder: Folder the archive is extracted into; created if missing.
        csv_path: Destination of the CSV file. The default keeps the original
            behaviour of writing a file literally named ``file_path`` in the
            current working directory.
        custom_stop_words: Optional iterable of extra lower-case words to drop
            in addition to NLTK's English stop words.
        replacements: Optional ``{old: new}`` substring replacements applied
            to the raw PDF text before any other cleaning.
        post_replacements: Optional ``{old: new}`` substring replacements
            applied to the cleaned text before full stops are removed. This
            is the only step that can introduce newlines; without it each
            document is a single line, and a document with fewer than 20
            letters becomes an empty file contributing no rows.

    Returns:
        The aggregated ``pandas.DataFrame`` that was written to ``csv_path``.

    Raises:
        FileNotFoundError: If ``zip_file_path`` does not exist.
        zipfile.BadZipFile: If ``zip_file_path`` is not a valid zip archive.
    """
    min_alpha_chars = 20   # lines with fewer letters than this are dropped
    lines_per_row = 3      # number of text lines merged into one CSV row

    # Step 1: Extract all files from the zip archive
    os.makedirs(output_folder, exist_ok=True)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_folder)

    # Built-in English stop words plus any caller-supplied ones
    stop_words = set(stopwords.words('english'))
    stop_words.update(custom_stop_words or ())
    replacements = dict(replacements or {})
    post_replacements = dict(post_replacements or {})

    # Text files produced by this run. Aggregating exactly these (instead of
    # listing the top level of output_folder) includes PDFs that live in
    # sub-folders of the archive and ignores unrelated .txt files.
    written_text_files = []

    # Step 2: Extract text from each PDF file
    for root, dirs, files in os.walk(output_folder):
        if '__MACOSX' in root:
            continue
        for file in files:
            # Extension match is case-insensitive so 'REPORT.PDF' is not skipped
            if file.startswith('.') or not file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file)

            try:
                # The context manager closes the document even if a page fails
                with fitz.open(pdf_path) as doc:
                    text = ''.join(page.get_text() for page in doc)

                # Step 3: Pre-process the text
                for old, new in replacements.items():
                    text = text.replace(old, new)

                text = text.lower()

                # Re-join words hyphenated across a line break, then turn all
                # remaining whitespace (newlines, tabs) into single spaces.
                # Without this the filter below deletes newlines outright and
                # glues the last word of a line to the first word of the next.
                text = re.sub(r'-[ \t]*\n', '', text)
                text = re.sub(r'\s+', ' ', text)

                # Keep only English alphabet, full stop and space
                text = re.sub(r'[^a-zA-Z. ]', '', text)

                # Tokenize and drop stop words (digits were already removed
                # by the character filter, so no numeric tokens remain)
                words = word_tokenize(text)
                cleaned_text = ' '.join(
                    word for word in words if word not in stop_words
                )

                # Post-processing to clean and format the text
                for old, new in post_replacements.items():
                    cleaned_text = cleaned_text.replace(old, new)

                cleaned_text = cleaned_text.replace('.', '')

                # Eliminate lines with fewer than 20 alphabetic characters
                cleaned_text = '\n'.join(
                    line for line in cleaned_text.split('\n')
                    if sum(c.isalpha() for c in line) >= min_alpha_chars
                )

                # Save the cleaned text next to the PDF
                text_file_name = os.path.splitext(file)[0] + '.txt'
                text_file_path = os.path.join(root, text_file_name)

                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(cleaned_text)

                written_text_files.append(text_file_path)
                print(f"Converted and modified '{file}' to '{text_file_name}'")
            except Exception as e:
                # Best effort: report the failing PDF and carry on with the rest
                print(f"Failed to convert '{file}': {e}")

    # Step 4: Aggregate cleaned text for topic modeling
    data = []

    # Sorted (and de-duplicated) so the CSV row order is reproducible
    for text_file_path in sorted(set(written_text_files)):
        with open(text_file_path, 'r', encoding='utf-8') as handle:
            lines = [line.strip() for line in handle]

        # Path relative to output_folder: the bare file name for top-level
        # files, and unambiguous for files inside sub-folders
        filename = os.path.relpath(text_file_path, output_folder)

        # Aggregate every three lines into one row (the last row may be shorter)
        for start in range(0, len(lines), lines_per_row):
            aggregated_line = ' '.join(lines[start:start + lines_per_row])
            data.append({
                'line': aggregated_line,
                'filename': filename,
                'wordcount': len(aggregated_line.split()),
            })

    # Step 5: Create a DataFrame from the data. Explicit columns keep the CSV
    # header present even when no rows were produced.
    dft = pd.DataFrame(data, columns=['line', 'filename', 'wordcount'])

    # Show the DataFrame
    print(dft)

    # Step 6: Save the DataFrame to a CSV file
    dft.to_csv(csv_path, index=False)

    print("File saved successfully.")

    return dft


In [None]:
#To use this script, provide the path to a zip file containing PDF files and an output folder path. The script will extract text from the PDFs, clean and process the text, and save the aggregated data for topic modeling in a CSV file.

# Install necessary libraries (PyMuPDF is the package imported below as `fitz`).
# NOTE(review): a bare `pip install ...` line is not Python syntax; it presumably
# relies on Jupyter/IPython automagic and will not run in a plain .py script — confirm.
pip install PyMuPDF

import fitz  # PyMuPDF for working with PDF files
import zipfile  # Import the zipfile module to work with zip files
import os  # Import the os module for file and directory operations
import nltk  # Import the nltk module for natural language processing
import re  # Import the regular expressions library
from nltk.corpus import stopwords  # Import stopwords from nltk corpus
from nltk.tokenize import word_tokenize  # Import word_tokenize from nltk for tokenizing words
import pandas as pd  # Import pandas for data manipulation

# Ensure necessary NLTK data is downloaded.
# 'punkt' serves older NLTK releases; recent releases make word_tokenize load
# the 'punkt_tab' resource instead, so both are fetched to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def extract_filebase(zip_file_path, output_folder, csv_path='file_path', *,
                     custom_stop_words=None, replacements=None,
                     post_replacements=None):
    """Turn a zip of PDFs into cleaned text files and a one-line-per-row CSV.

    The archive is unpacked into ``output_folder``. Every PDF found under that
    folder (recursively, skipping hidden files and ``__MACOSX`` folders) is
    converted to a cleaned ``.txt`` file written next to the PDF. Each line of
    the text files produced by this call becomes one row with the columns
    ``line``, ``filename`` and ``wordcount``; the resulting DataFrame is
    printed and saved as CSV.

    A PDF that fails to convert is reported on stdout and skipped.

    Args:
        zip_file_path: Path to the zip archive containing the PDF files.
        output_folder: Folder the archive is extracted into; created if missing.
        csv_path: Destination of the CSV file. The default keeps the original
            behaviour of writing a file literally named ``file_path`` in the
            current working directory.
        custom_stop_words: Optional iterable of extra lower-case words to drop
            in addition to NLTK's English stop words.
        replacements: Optional ``{old: new}`` substring replacements applied
            to the raw PDF text before any other cleaning.
        post_replacements: Optional ``{old: new}`` substring replacements
            applied after the short-text filter and before full stops are
            removed. This is the only step that can introduce newlines;
            without it each document yields at most one row.

    Returns:
        The aggregated ``pandas.DataFrame`` that was written to ``csv_path``.

    Raises:
        FileNotFoundError: If ``zip_file_path`` does not exist.
        zipfile.BadZipFile: If ``zip_file_path`` is not a valid zip archive.
    """
    min_alpha_chars = 20   # text with fewer letters than this is dropped

    # Step 1: Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Step 2: Extract all files from the zip archive into the output folder
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_folder)

    # Built-in English stop words plus any caller-supplied ones
    stop_words = set(stopwords.words('english'))
    stop_words.update(custom_stop_words or ())
    replacements = dict(replacements or {})
    post_replacements = dict(post_replacements or {})

    # Text files produced by this run. Aggregating exactly these (instead of
    # listing the top level of output_folder) includes PDFs that live in
    # sub-folders of the archive and ignores unrelated .txt files.
    written_text_files = []

    # Step 3: Extract text from each PDF file
    for root, dirs, files in os.walk(output_folder):
        if '__MACOSX' in root:
            continue
        for file in files:
            # Extension match is case-insensitive so 'REPORT.PDF' is not skipped
            if file.startswith('.') or not file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file)

            try:
                # The context manager closes the document even if a page fails
                with fitz.open(pdf_path) as doc:
                    text = ''.join(page.get_text() for page in doc)

                # Step 4: Pre-process the text
                for old, new in replacements.items():
                    text = text.replace(old, new)

                text = text.lower()

                # Re-join words hyphenated across a line break, then turn all
                # remaining whitespace (newlines, tabs) into single spaces.
                # Without this the filter below deletes newlines outright and
                # glues the last word of a line to the first word of the next.
                text = re.sub(r'-[ \t]*\n', '', text)
                text = re.sub(r'\s+', ' ', text)

                # Keep only English alphabet, full stop and space
                text = re.sub(r'[^a-zA-Z. ]', '', text)

                # Tokenize and drop stop words (digits were already removed
                # by the character filter, so no numeric tokens remain)
                words = word_tokenize(text)
                cleaned_text = ' '.join(
                    word for word in words if word not in stop_words
                )

                # Eliminate lines with fewer than 20 alphabetic characters.
                # The text holds no newline at this point, so this drops the
                # whole document when it is too short.
                cleaned_text = '\n'.join(
                    line for line in cleaned_text.split('\n')
                    if sum(c.isalpha() for c in line) >= min_alpha_chars
                )

                # Step 5: Post-process the text
                for old, new in post_replacements.items():
                    cleaned_text = cleaned_text.replace(old, new)

                cleaned_text = cleaned_text.replace('.', '')

                # Save the cleaned text next to the PDF
                text_file_name = os.path.splitext(file)[0] + '.txt'
                text_file_path = os.path.join(root, text_file_name)

                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(cleaned_text)

                written_text_files.append(text_file_path)
                print(f"Converted and modified '{file}' to '{text_file_name}'")
            except Exception as e:
                # Best effort: report the failing PDF and carry on with the rest
                print(f"Failed to convert '{file}': {e}")

    # Step 6: Aggregate cleaned text for topic modeling
    data = []

    # Sorted (and de-duplicated) so the CSV row order is reproducible
    for text_file_path in sorted(set(written_text_files)):
        with open(text_file_path, 'r', encoding='utf-8') as handle:
            lines = [line.strip() for line in handle]

        # Path relative to output_folder: the bare file name for top-level
        # files, and unambiguous for files inside sub-folders
        filename = os.path.relpath(text_file_path, output_folder)

        # Step 7: One row per line of the text file
        for current_line in lines:
            data.append({
                'line': current_line,
                'filename': filename,
                'wordcount': len(current_line.split()),
            })

    # Step 8: Create a DataFrame from the data. Explicit columns keep the CSV
    # header present even when no rows were produced.
    dfd = pd.DataFrame(data, columns=['line', 'filename', 'wordcount'])

    # Show the DataFrame
    print(dfd)

    # Step 9: Save the DataFrame to a CSV file
    dfd.to_csv(csv_path, index=False)

    print("File saved successfully.")

    return dfd