In [None]:
# To use this script, call bigram_wordcloud() with the path to a zip file containing PDF files and, optionally, a set of additional stopwords. It generates and displays a word cloud built from the bigrams extracted from the PDFs.

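# Install the necessary libraries (the line below is a notebook/shell command, not Python; run it in a terminal when using this file as a plain script)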
pip install PyMuPDF wordcloud matplotlib nltk

import zipfile  # Import the zipfile module to work with zip files
import os  # Import the os module for file and directory operations
import fitz  # PyMuPDF: Import the fitz module to work with PDF files
from wordcloud import WordCloud  # Import WordCloud from the wordcloud module
import matplotlib.pyplot as plt  # Import matplotlib for plotting the word cloud
from nltk.util import bigrams  # Import bigrams utility from nltk
from nltk import FreqDist  # Import FreqDist from nltk for frequency distribution
from nltk.tokenize import word_tokenize  # Import word_tokenize from nltk for tokenizing words
import nltk  # Import the nltk module
import shutil  # Import the shutil module for file and directory operations
from nltk.corpus import stopwords  # Import stopwords from nltk corpus

# Ensure the NLTK data used below is available locally.
# - 'punkt' / 'punkt_tab': tokenizer models loaded by word_tokenize. Newer NLTK
#   releases look up 'punkt_tab' instead of 'punkt', so both are fetched to
#   keep the script working across NLTK versions.
# - 'stopwords': the English stop-word list used to filter tokens.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

def _read_pdf_text_from_zip(zip_path):
    """Return the lower-cased text of every PDF stored in the zip at *zip_path*."""
    page_texts = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for entry in zip_ref.infolist():
            name = entry.filename
            if entry.is_dir() or not name.lower().endswith('.pdf'):
                continue
            # macOS archives carry "__MACOSX/" and "._*" resource-fork stubs
            # that reuse the .pdf extension but are not PDF documents.
            if name.startswith('__MACOSX/') or os.path.basename(name).startswith('._'):
                continue
            # Open the PDF straight from the archive bytes: nothing is written
            # to disk, so there is no temporary directory to clean up.
            with fitz.open(stream=zip_ref.read(entry), filetype='pdf') as doc:
                page_texts.extend(page.get_text() for page in doc)
    # Join with newlines so the last word of one page can never fuse with the
    # first word of the next one.
    return '\n'.join(page_texts).lower()


def bigram_wordcloud(zip_path, additional_stopwords=None, font_path=None):
    """Build and display a word cloud of the most frequent bigrams in zipped PDFs.

    The text of every PDF in the archive (including PDFs inside sub-folders) is
    lower-cased and tokenized; non-alphabetic tokens and stop words are dropped,
    and the 100 most common adjacent word pairs are drawn as a word cloud. The
    relative frequencies used by the cloud are printed afterwards.

    Args:
        zip_path: Path to a zip archive containing PDF files.
        additional_stopwords: Optional iterable of extra words to exclude, in
            addition to NLTK's English stop-word list. Matching is
            case-insensitive because the text is lower-cased first.
        font_path: Optional path to a TrueType/OpenType font file for the
            cloud. ``None`` (the default) uses WordCloud's bundled font.

    Raises:
        ValueError: If no bigrams could be extracted (no PDFs in the archive,
            or no text left after filtering).
    """
    # Step 1 + 2: read the text of every PDF in the archive
    all_text = _read_pdf_text_from_zip(zip_path)

    # Step 3: preprocess text for bigrams
    stop_words = set(stopwords.words('english'))
    if additional_stopwords:
        # Tokens are lower-cased, so normalise the extra stop words as well.
        stop_words.update(word.lower() for word in additional_stopwords)

    # Keep purely alphabetic tokens that are not stop words
    tokens = [
        word for word in word_tokenize(all_text)
        if word.isalpha() and word not in stop_words
    ]

    # Count adjacent word pairs and keep the 100 most frequent ones,
    # keyed by the bigram rendered as a single "word1 word2" string
    top_bigrams = FreqDist(bigrams(tokens)).most_common(100)
    bg_freq_dict = {' '.join(pair): count for pair, count in top_bigrams}

    if not bg_freq_dict:
        # WordCloud cannot draw an empty cloud; fail with an actionable message.
        raise ValueError(
            f"No bigrams could be extracted from the PDF files in {zip_path!r}"
        )

    # Step 4: generate a word cloud from the bigram frequencies
    wordcloud = WordCloud(
        width=1000, height=1000,
        max_words=100,
        background_color='white',
        font_path=font_path,
        min_font_size=10
    ).generate_from_frequencies(bg_freq_dict)

    # Display the generated word cloud
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

    # Print the (relative) bigram frequencies used in the word cloud
    print(wordcloud.words_)

# Note: extra stop words can be passed as a set, for example:
# additional_stopwords = {'exampleword1', 'exampleword2'}