In [3]:
import streamlit as st
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string

# Ensure nltk resources are downloaded.
# 'punkt' serves older NLTK releases; recent releases (3.9+) make
# word_tokenize load the 'punkt_tab' resource instead, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set up the stopwords and stemmer
en_stopwords = set(stopwords.words("english"))
stemmer = PorterStemmer()

# List of sustainability-related words (each keyword listed once)
words = ['sustainability', 'sustainable', 'environment', 'climate', 'carbon',
         'renewable', 'green energy', 'recycling', 'biodiversity', 'emissions',
         'conservation', 'eco-friendly', 'solar', 'wind', 'energy', 'water']

# Stem each word.
# NOTE: every entry is passed to the stemmer as one string, so entries that
# contain a space or hyphen ('green energy', 'eco-friendly') produce a single
# multi-token stem rather than one stem per word.
stems = [stemmer.stem(word) for word in words]

# Function to extract the text of every page of a PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of all pages of *pdf_file* as one string.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (the script passes
            the object returned by ``st.file_uploader``).

    Returns:
        The extracted text of each page, in page order, joined with a
        newline so that the last word of one page does not fuse with the
        first word of the next. An empty string if there are no pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` is a defensive guard so a page that yields no text object
    # cannot break the join.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Function to find sentences containing sustainability keywords
def find_sustainability_sentences_with_stems(text, stems):
    """Return the sentences of *text* that mention any keyword in *stems*.

    The text is split into sentences after '.', '!' or '?' followed by one
    or more spaces. Each sentence is broken into ``\\w+`` words, which are
    lowercased and stemmed with the module-level ``stemmer``.

    A stem made of a single token matches when it equals the stem of any
    word in the sentence. A stem made of several tokens (for example one
    produced from a keyword containing a space or hyphen) matches when its
    tokens appear as consecutive words of the sentence; such stems could
    never match a single word, so they are compared token by token.

    Args:
        text: The text to search.
        stems: Iterable of keyword stems (strings).

    Returns:
        The matching sentences, in their original order.
    """
    single_stems = set()
    phrase_stems = []
    for stem in stems:
        tokens = re.findall(r'\b\w+\b', stem)
        if len(tokens) > 1:
            phrase_stems.append([token.lower() for token in tokens])
        else:
            single_stems.add(stem)

    relevant_sentences = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        lowered_words = [word.lower() for word in re.findall(r'\b\w+\b', sentence)]
        stemmed_words = [stemmer.stem(word) for word in lowered_words]
        if not single_stems.isdisjoint(stemmed_words) or any(
            _contains_phrase(lowered_words, stemmed_words, phrase)
            for phrase in phrase_stems
        ):
            relevant_sentences.append(sentence)
    return relevant_sentences


def _contains_phrase(lowered_words, stemmed_words, phrase):
    """Return True if *phrase* occurs as a consecutive run of words.

    A phrase token matches a position when it equals either the lowercased
    word or its stem there, because only part of a multi-word keyword may
    have been altered by the stemmer.
    """
    span = len(phrase)
    for start in range(len(lowered_words) - span + 1):
        if all(
            token in (lowered_words[start + offset], stemmed_words[start + offset])
            for offset, token in enumerate(phrase)
        ):
            return True
    return False

# Streamlit UI: upload a PDF, list its sustainability-related sentences,
# tabulate word frequencies and render a word cloud.
st.title("Sustainability Keyword Extraction from PDF")

# File uploader
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")

if pdf_file is not None:
    # Step 1: Extract text from the uploaded PDF
    text = extract_text_from_pdf(pdf_file)
    # Replace everything except letters, digits and basic punctuation with
    # spaces. '!' and '?' are kept because the sentence splitter relies on
    # them as sentence terminators.
    text = re.sub(r'[^a-zA-Z0-9.,!?]', ' ', text)
    # Drop digits.
    text = re.sub('[0-9]+', '', text)

    # Step 2: Find sentences related to sustainability
    sustainability_sentences = find_sustainability_sentences_with_stems(text, stems)

    # Display the sustainability sentences
    if sustainability_sentences:
        st.subheader("Sustainability-related Sentences:")
        for sentence in sustainability_sentences:
            st.write(sentence)
    else:
        st.write("No sustainability-related sentences found.")

    # Step 3: Tokenize and filter words (drop stopwords, punctuation and
    # tokens of one or two characters)
    all_sustainability_words = []
    for sentence in sustainability_sentences:
        all_sustainability_words.extend(
            w for w in word_tokenize(sentence)
            if w.lower() not in en_stopwords
            and w not in string.punctuation
            and len(w) > 2
        )

    # Step 4: Frequency Distribution of words
    sustain_freq = FreqDist(all_sustainability_words)

    # Display frequent words as a table, most frequent first
    st.subheader("Frequent Words and Their Frequencies")
    st.write(sustain_freq.most_common())

    # Step 5: Generate and display word cloud
    st.subheader("Word Cloud of Sustainability-related Words")
    if all_sustainability_words:
        wordcloud = WordCloud(width=1000, height=500, stopwords=en_stopwords,
                              colormap="plasma", collocations=False,
                              max_words=700).generate(' '.join(all_sustainability_words))

        # Pass an explicit figure to Streamlit instead of relying on
        # matplotlib's global current figure, then close it so figures do
        # not pile up across script reruns.
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis('off')
        st.pyplot(fig)
        plt.close(fig)
    else:
        # WordCloud.generate raises ValueError when given no words.
        st.write("No words available to build a word cloud.")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rajeshprabhakarkaila/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajeshprabhakarkaila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-12-08 11:43:09.086 
  command:

    streamlit run /Users/rajeshprabhakarkaila/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
