In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import networkx as nx
import numpy as np
from pypdf import PdfReader
from docx import Document

import ipywidgets as widgets
from IPython.display import display, clear_output


In [None]:
class TextSummarizer:
    """Extractive summarizer based on the TextRank algorithm.

    Each sentence is represented by the mean of its spaCy token vectors.
    Sentences form a graph whose edge weights are pairwise cosine
    similarities; PageRank scores on that graph decide which sentences are
    kept in the summary.
    """

    def __init__(self):
        """Load the ``en_core_web_sm`` spaCy pipeline.

        Raises:
            OSError: If the model is not installed. Installation
                instructions are printed before the error is re-raised.
        """
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Error: Please install the spaCy model by running:")
            print("python -m spacy download en_core_web_sm")
            raise

    def summarize(self, text, num_sentence=5):
        """Generate a summary of the given text using the TextRank algorithm.

        Args:
            text (str): The text to summarize.
            num_sentence (int): Maximum number of sentences to keep in the
                summary.

        Returns:
            str: The generated summary, or a fixed notice when ``text`` is
            not a string or contains only whitespace.
        """
        if not isinstance(text, str) or not text.strip():
            return "No text provided for summarization."

        # Forward the caller's sentence budget instead of a fixed value.
        return self.textrank_summary(text, num_sentence=num_sentence)

    def textrank_summary(self, text, num_sentence=5):
        """Generate a text summary using the TextRank algorithm.

        Args:
            text (str): The text to summarize.
            num_sentence (int): Maximum number of sentences to keep.

        Returns:
            str: The selected sentences joined by single spaces, in their
            original document order. ``text`` is returned unchanged when it
            contains at most one sentence.
        """
        doc = self.nlp(text)
        sentences = list(doc.sents)

        # Nothing to rank with zero or one sentence.
        if len(sentences) <= 1:
            return text

        sim_mat = self._similarity_matrix(sentences)

        # Build the weighted sentence graph and score it with PageRank.
        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph)

        # Rank sentence indices by score; on equal scores the later sentence
        # ranks first. Sorting indices avoids ever comparing spaCy spans.
        ranked = sorted(
            range(len(sentences)),
            key=lambda idx: (scores[idx], idx),
            reverse=True,
        )

        # Keep the top sentences and restore document order for readability.
        selected = sorted(ranked[:num_sentence])
        return " ".join(sentences[idx].text for idx in selected)

    def _similarity_matrix(self, sentences):
        """Return the symmetric matrix of pairwise sentence similarities.

        Sentences without any token vector get similarity 0 to every other
        sentence, and the diagonal is left at 0.
        """
        vectors = [self._sentence_vector(sent) for sent in sentences]
        count = len(vectors)
        sim_mat = np.zeros((count, count))

        for i, first in enumerate(vectors):
            if first is None:
                continue
            # Cosine similarity is symmetric, so each pair is computed once.
            for j in range(i + 1, count):
                second = vectors[j]
                if second is None:
                    continue
                # PageRank expects non-negative edge weights, so negative
                # cosine values are treated as "no similarity".
                similarity = max(self._cosine_similarity(first, second), 0.0)
                sim_mat[i, j] = similarity
                sim_mat[j, i] = similarity

        return sim_mat

    @staticmethod
    def _sentence_vector(sent):
        """Return the mean token vector of ``sent``, or None if no token has one."""
        token_vectors = [token.vector for token in sent if token.has_vector]
        if not token_vectors:
            return None
        return np.mean(token_vectors, axis=0)

    def _cosine_similarity(self, vec1, vec2):
        """Calculate the cosine similarity between two vectors.

        Returns:
            float: The cosine of the angle between the vectors, or 0.0 when
            either vector has zero length.
        """
        vec1 = np.asarray(vec1)
        vec2 = np.asarray(vec2)

        # A zero norm means a zero vector; avoid dividing by zero.
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0

        return float(np.dot(vec1, vec2) / denominator)


# Example usage
# if __name__ == "__main__":
#     # Sample text for demonstration
#     sample_text = """
#     Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. 
#     AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals.
#     The term "artificial intelligence" had previously been used to describe machines that mimic and display "human" cognitive skills that are associated with the human mind, such as "learning" and "problem-solving". 
#     This definition has since been rejected by major AI researchers who now describe AI in terms of rationality and acting rationally, which does not limit how intelligence can be articulated.
#     AI applications include advanced web search engines, recommendation systems, understanding human speech, self-driving cars, automated decision-making and competing at the highest level in strategic game systems.
#     As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. 
#     For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.
#     """
    
#     # Create a summarizer
#     summarizer = TextSummarizer()
    
#     # Generate a summary; the second argument is the number of sentences to keep
#     summary = summarizer.summarize(sample_text, 3)
    
#     # Print results
#     print("Original Text Length:", len(sample_text.split()), "words")
#     print("Summary Length:", len(summary.split()), "words")
#     print("\n--- SUMMARY ---\n")
#     print(summary)



In [None]:

def extract_text_from_pdf(pdf_file):
    """Read every page of a PDF and return the page texts joined by spaces.

    Each page's text is stripped of surrounding whitespace; pages that are
    empty after stripping are left out.
    """
    stripped_pages = (
        page.extract_text().strip() for page in PdfReader(pdf_file).pages
    )
    return " ".join(chunk for chunk in stripped_pages if chunk)

def extract_text_from_docx(docx_file):
    """Read a Word document and return its paragraph texts joined by spaces.

    Each paragraph's text is stripped of surrounding whitespace; paragraphs
    that are empty after stripping are left out.
    """
    stripped_paragraphs = (
        paragraph.text.strip() for paragraph in Document(docx_file).paragraphs
    )
    return " ".join(chunk for chunk in stripped_paragraphs if chunk)

def extract_text(file):
    """Return the text of an uploaded PDF or Word document.

    The file type is taken from the part of ``file.name`` after the last
    dot, compared case-insensitively.

    Raises:
        ValueError: If the extension is not ``pdf``, ``docx`` or ``doc``.
    """
    file_type = file.name.rsplit('.', 1)[-1].lower()

    # Reject anything that is not a known extension up front.
    if file_type not in ('pdf', 'docx', 'doc'):
        raise ValueError(f"Unsupported file type: {file_type}")

    if file_type == 'pdf':
        return extract_text_from_pdf(file)

    # NOTE(review): 'doc' is routed to the .docx reader as well -- confirm
    # that legacy .doc files are actually readable by it.
    return extract_text_from_docx(file)

In [None]:
# UI widgets: a single-file upload control whose picker is limited to
# .pdf/.docx, a button that triggers processing, and an output area that
# receives the messages and the summary.
upload = widgets.FileUpload(accept='.pdf, .docx', multiple=False)
process_button = widgets.Button(description="Process Request")
text_output = widgets.Output()


# Process file and summarize
def process_file(change):
    """Summarize the uploaded document and show the result in ``text_output``.

    Intended as the click handler of ``process_button``. Messages for a
    missing upload, an unreadable file, or a document without text are
    printed to ``text_output`` instead of raising.

    Args:
        change: Argument supplied by the widget callback; unused.
    """
    import io  # local import: wraps the uploaded bytes in a file-like object

    text_output.clear_output()

    uploads = upload.value
    if not uploads:
        with text_output:
            print("Please upload a PDF or Word document first.")
        return

    # ipywidgets 7 exposes {filename: {"content": ...}}; ipywidgets 8 exposes
    # a tuple of dicts that carry their own "name" key.
    if isinstance(uploads, dict):
        file_name, entry = next(iter(uploads.items()))
    else:
        entry = uploads[0]
        file_name = entry["name"]

    # extract_text() reads ``.name`` to pick a parser, so the raw bytes are
    # wrapped in a named in-memory file.
    uploaded_file = io.BytesIO(bytes(entry["content"]))
    uploaded_file.name = file_name

    try:
        content = extract_text(uploaded_file)
    except Exception as exc:  # UI boundary: report the failure to the user
        with text_output:
            print(f"Could not read '{file_name}': {exc}")
        return

    if not content.strip():
        with text_output:
            print("No extractable text found in the document.")
        return

    # Load the language model only once there is text to summarize.
    summarizer = TextSummarizer()
    summary = summarizer.summarize(content, 5)

    with text_output:
        clear_output()
        print("Summary:")
        print(summary)

# Run process_file each time the button is clicked.
process_button.on_click(process_file)

# Render the upload control, the button and the output area in the notebook.
display(upload, process_button, text_output)