In [30]:
import streamlit as st
from txtai.pipeline import Summary
from transformers import pipeline
from PyPDF2 import PdfReader
from rake_nltk import Rake
from textblob import TextBlob
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [32]:
@st.cache_resource
def _load_summarizer():
    """Build the txtai ``Summary`` pipeline once and share it across reruns."""
    return Summary()


@st.cache_data
def summary_text(text):
    """Summarize ``text`` with the txtai ``Summary`` pipeline.

    The pipeline object is created by ``_load_summarizer`` and cached as a
    resource, so it is not rebuilt for every new input. The summary itself is
    memoized per distinct ``text`` with ``st.cache_data``.

    Args:
        text: The text to summarize.

    Returns:
        Whatever the ``Summary`` pipeline returns for ``text``.
    """
    summarizer = _load_summarizer()
    return summarizer(text)

In [34]:
# Sentiment Analysis
def sentiment_analysis(text):
    """Return the TextBlob sentiment scores for ``text``.

    Args:
        text: The text to analyze.

    Returns:
        A ``(polarity, subjectivity)`` tuple taken from ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity
    return polarity, subjectivity

In [52]:
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. Pages for which no text
        could be extracted contribute an empty string, and a PDF without
        pages yields ``""``.
    """
    with open(file_path, 'rb') as f:
        reader = PdfReader(f)
        # Extract while the file is still open; `or ""` guards against
        # extract_text() returning None for a page.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

In [38]:
def clean_text(text):
    """Strip ``text`` down to ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space), each whitespace run is then collapsed to one
    space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    letters_and_whitespace = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_and_whitespace)
    return single_spaced.strip()

def generate_meaningful_topic(text, num_words=3):
    """Build a short topic label from the highest-weighted words of ``text``.

    The text is cleaned with ``clean_text``, vectorized with an English
    stop-word filter, and fitted with a single-component LDA model. The
    ``num_words`` highest-weighted words, most important first, are joined
    with spaces and capitalized.

    Args:
        text: The text to derive a topic from.
        num_words: Maximum number of words in the topic label.

    Returns:
        The topic label as a string, or ``""`` when the text contains no
        usable words (for example, it is empty or consists only of stop words).
    """
    # Clean the input text
    text = clean_text(text)

    # Initialize CountVectorizer with word-level tokenization
    vectorizer = CountVectorizer(stop_words='english', analyzer='word')
    try:
        doc_term_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty.
        return ""

    # Apply LDA for topic extraction
    lda_model = LatentDirichletAllocation(n_components=1, random_state=42)
    lda_model.fit(doc_term_matrix)

    # Look the vocabulary up once instead of once per selected word.
    feature_names = vectorizer.get_feature_names_out()

    # argsort() is ascending, so reverse it to get the heaviest words first.
    # Slicing after the reversal also makes num_words=0 yield no words
    # (a `[-0:]` slice would select the whole vocabulary).
    top_indices = lda_model.components_[0].argsort()[::-1][:num_words]
    words = [feature_names[i] for i in top_indices]

    # Combine the words into a single string for a meaningful topic
    return " ".join(words).capitalize()

In [40]:
# Configure the Streamlit page to use the wide layout.
st.set_page_config(layout="wide")

In [42]:
# Sidebar selector for the app mode; the script later branches on this value
# ("Summarize Text" or "Summarize Document").
choice = st.sidebar.selectbox("Select your choice", ["Summarize Text", "Summarize Document"])

In [44]:
# Function to extract top N keywords using RAKE
def extract_top_keywords(text, top_n=5):
    """Return the ``top_n`` highest-scoring RAKE phrases found in ``text``.

    Args:
        text: The text to extract keyword phrases from.
        top_n: Maximum number of phrases to return.

    Returns:
        A list of ``(score, phrase)`` pairs ordered by descending score.
    """
    extractor = Rake()  # RAKE: Rapid Automatic Keyword Extraction
    extractor.extract_keywords_from_text(text)

    # Copy the scored phrases, then order them in place by score (highest first).
    scored_phrases = list(extractor.get_ranked_phrases_with_scores())
    scored_phrases.sort(key=lambda pair: pair[0], reverse=True)
    return scored_phrases[:top_n]

In [50]:
import nltk

# Stop-word corpus for the RAKE keyword extractor used by extract_top_keywords.
nltk.download('stopwords')
# NOTE(review): rake_nltk splits sentences with NLTK's sentence tokenizer,
# which needs the punkt model. Recent NLTK releases may look for 'punkt_tab'
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')


def _render_analysis(source_text, source_heading, source_icon=None):
    """Render the source text, summary, keywords, sentiment and topic.

    Shared by both app modes so they cannot drift apart.

    Args:
        source_text: The text to analyze (typed input or text read from a PDF).
        source_heading: Heading shown above the source text in the first column.
        source_icon: Optional icon passed to ``st.info`` for the source text.
    """
    # Nothing to summarize (e.g. whitespace-only input or a PDF with no
    # extractable text): tell the user instead of failing downstream.
    if not source_text or not source_text.strip():
        st.warning("No text available to analyze.")
        return

    col1, col2, col3 = st.columns([1, 1, 1])  # Only three columns

    with col1:
        st.markdown(f"**{source_heading}**")
        st.info(source_text, icon=source_icon)

    # Summarization and keyword extraction share the middle column.
    with col2:
        st.markdown("**Summarized Text**")
        summarized_result = summary_text(source_text)
        st.success(summarized_result)

        st.markdown("**Top Extracted Keywords**")
        # Show keywords with their scores
        for score, phrase in extract_top_keywords(summarized_result, top_n=5):
            st.write(f"{phrase} (Score: {score})")

    # Sentiment is computed on the full source text, not on the summary.
    with col3:
        st.markdown("**Sentiment Analysis**")
        polarity, subjectivity = sentiment_analysis(source_text)
        st.write(f"Polarity: {polarity}, Subjectivity: {subjectivity}")

    # Topic modeling is shown below the columns.
    st.markdown("**Extracted Topics**")
    topic = generate_meaningful_topic(summarized_result)
    if topic:
        # generate_meaningful_topic returns a string; joining it with " "
        # would insert a space between every character.
        st.success(topic)
    else:
        st.info("No topic could be extracted.")


# Summarize Text option
if choice == "Summarize Text":
    st.subheader("Summarize, Extract Keywords, and Analyze Sentiment")
    input_text = st.text_area("Enter your text here")

    # The button is only rendered once some text has been entered.
    if input_text and st.button("Process Text"):
        _render_analysis(input_text, "Your Input Text", source_icon="ℹ️")

# Summarize Document option
elif choice == "Summarize Document":
    st.subheader("Summarize Document, Extract Keywords, and Analyze Sentiment")
    input_file = st.file_uploader("Upload your document", type=["pdf"])

    # The button is only rendered once a file has been uploaded.
    if input_file and st.button("Process Document"):
        # Save the uploaded PDF to a local file so it can be read by path.
        with open("doc_file.pdf", 'wb') as f:
            f.write(input_file.getbuffer())

        extracted_text = extract_text_from_pdf("doc_file.pdf")
        _render_analysis(extracted_text, "Extracted Text from Document")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
