In [None]:
# To use this script, call unigram_wordcloud() with the path to a zip file containing PDF files and, optionally, a list of additional stopwords. The function generates and displays a word cloud from the text extracted from the PDFs.

# Install the necessary libraries (notebook magic; from a shell, run the same command without the leading "%")
%pip install PyMuPDF wordcloud matplotlib

import zipfile  # Import the zipfile module to work with zip files
import os  # Import the os module for file and directory operations
import fitz  # PyMuPDF: Import the fitz module to work with PDF files
from wordcloud import WordCloud, STOPWORDS  # Import WordCloud and STOPWORDS from the wordcloud module
import matplotlib.pyplot as plt  # Import matplotlib for plotting the word cloud
import shutil  # Import the shutil module for file and directory operations
import re  # Import the regular expressions module

def unigram_wordcloud(zip_path, additional_stopwords=None, font_path=None):
    """Build and display a word cloud from the PDFs contained in a zip file.

    The archive is unpacked into a throw-away temporary directory, the text
    of every page of every PDF found (at any depth) is extracted, lower-cased
    and combined, and the result is rendered as a word cloud with matplotlib.
    The word frequencies used by the cloud are printed afterwards.

    Args:
        zip_path: Path to a zip archive containing one or more PDF files.
        additional_stopwords: Optional iterable of extra words to exclude,
            on top of wordcloud's built-in STOPWORDS, e.g.
            ``['exampleword1', 'exampleword2']``. A single string is treated
            as one word.
        font_path: Optional path to an OTF/TTF font file. ``None`` (default)
            lets WordCloud fall back to its own default font.

    Returns:
        None. The word cloud is shown and its frequencies are printed.

    Raises:
        ValueError: If the archive contains no PDF files (WordCloud itself
            also raises ValueError when no words remain to plot).
    """
    # Local import keeps this function self-contained with respect to the
    # file's existing import block.
    import tempfile

    # Steps 1 and 2: unpack the archive and pull the text out of every PDF.
    # TemporaryDirectory gives a unique location (never clobbering an
    # existing folder) and removes it even if extraction or parsing fails.
    page_texts = []
    with tempfile.TemporaryDirectory(prefix="pdf_extraction_") as temp_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Walk the whole tree: archives often wrap their files in a folder,
        # and extensions may be upper-case (".PDF").
        pdf_files = sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(temp_dir)
            for name in names
            if name.lower().endswith('.pdf')
        )
        if not pdf_files:
            raise ValueError(f"No PDF files found in {zip_path!r}")

        for pdf_file in pdf_files:
            with fitz.open(pdf_file) as doc:
                page_texts.extend(page.get_text() for page in doc)

    # Join with newlines so words at page boundaries never run together.
    all_text = "\n".join(page_texts).lower()

    # Step 3: Generate a word cloud from the combined text
    # Set up the stopwords
    stopwords = set(STOPWORDS)
    if additional_stopwords:
        if isinstance(additional_stopwords, str):
            # set.update() on a bare string would add its characters.
            additional_stopwords = [additional_stopwords]
        stopwords.update(additional_stopwords)

    # Generate the word cloud
    wordcloud = WordCloud(
        width=1000, height=1000,
        max_words=100,
        background_color='white',
        stopwords=stopwords,
        font_path=font_path,
        min_font_size=10
    ).generate(all_text)

    # Display the generated word cloud
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

    # Print the word frequencies in the word cloud
    print(wordcloud.words_)