<a href="https://colab.research.google.com/github/Negativeice-0/Articles/blob/master/skleatn-numpy-conflict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install PyPDF2 nltk scikit-learn rake-nltk spacy gensim ipywidgets matplotlib

import PyPDF2
import nltk
import numpy as np
import re
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import string
from rake_nltk import Rake
from google.colab import files  # While we import it, the GUI will handle uploads
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import ipywidgets as widgets
from IPython.display import clear_output

# Download necessary NLTK resources.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# sent_tokenize/word_tokenize; 'punkt' is kept for older releases.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# --- Backend Functions (Mostly Unchanged, with minor error handling) ---

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    This is best-effort: any error raised while opening or parsing the file is
    printed, and whatever text was collected before the failure is returned
    (an empty string if nothing could be read).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined together, in page order.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against a None result (reported for pages without a
                # text layer on some PyPDF2 versions -- TODO confirm); without
                # the guard one such page would abort the whole document.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

def preprocess_text(text):
    """Normalize raw text for tokenization.

    Every run of whitespace collapses to a single space, every character that
    is not a word character, whitespace, or one of ``.?!`` is removed, and the
    result is lower-cased.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    cleaned = re.sub(r'[^\w\s.?!]', '', collapsed)
    return cleaned.lower()

def tokenize_text(text):
    """Split ``text`` into word tokens and drop the uninformative ones.

    A token is kept only if it is not an English or document-boilerplate stop
    word, is not punctuation, is purely alphanumeric, and is longer than two
    characters.
    """
    excluded = set(stopwords.words('english'))
    excluded.update({'fig', 'figure', 'et', 'al', 'ie', 'eg', 'chapter', 'section'})

    def _is_informative(token):
        # Same four conditions as a single predicate, evaluated in order.
        if token in excluded or token in string.punctuation:
            return False
        return token.isalnum() and len(token) > 2

    return [token for token in word_tokenize(text) if _is_informative(token)]

def extract_keywords_tfidf(text, top_n=25):
    """Rank words by their TF-IDF weight summed over all sentences.

    Each sentence of ``text`` is treated as one document for the vectorizer.

    Args:
        text: Text to analyse.
        top_n: Maximum number of keywords to return.

    Returns:
        Up to ``top_n`` words, highest summed TF-IDF score first. An empty
        list is returned when the vectorizer rejects the input with a
        ``ValueError`` (e.g. no usable vocabulary after stop-word removal).
    """
    sentences = sent_tokenize(text)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        return []  # Handle cases with no text after preprocessing

    feature_names = vectorizer.get_feature_names_out()
    importance = {}
    # One pass over the stored entries replaces per-element sparse indexing,
    # which is very slow. Entries come out row by row, so the accumulation
    # order (and the tie order of the stable sort below) is unchanged.
    entries = tfidf_matrix.tocoo()
    for idx, score in zip(entries.col, entries.data):
        if score != 0:
            word = feature_names[idx]
            importance[word] = importance.get(word, 0) + score

    sorted_keywords = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    return [keyword for keyword, _ in sorted_keywords[:top_n]]

def extract_keywords_rake(text, top_n=25):
    """Return the most frequent words among the top RAKE phrases of ``text``.

    The ``top_n`` best-ranked phrases are split on whitespace, and the
    resulting words are returned most frequent first, capped at ``top_n``.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    best_phrases = extractor.get_ranked_phrases()[:top_n]

    word_counts = Counter(
        word for phrase in best_phrases for word in phrase.split()
    )
    return [word for word, _count in word_counts.most_common(top_n)]

def extract_keywords_combined(text, top_n=25):
    """Merge the TF-IDF and RAKE keyword rankings into one list.

    A word at rank ``i`` in a ranking contributes ``(top_n - i) / divisor``
    to its total, where the divisor is ``top_n`` for TF-IDF and
    ``top_n * 1.5`` for RAKE, so TF-IDF positions weigh more. The ``top_n``
    highest totals are returned, best first.
    """
    weighted_rankings = (
        (extract_keywords_tfidf(text, top_n=top_n), top_n),
        (extract_keywords_rake(text, top_n=top_n), top_n * 1.5),
    )

    totals = {}
    for ranking, divisor in weighted_rankings:
        for rank, word in enumerate(ranking):
            totals[word] = totals.get(word, 0) + (top_n - rank) / divisor

    # sorted() is stable, so equal totals keep first-seen order.
    return sorted(totals, key=totals.get, reverse=True)[:top_n]

def score_sentences(text, keywords, method='combined'):
    """Score every sentence of ``text`` by how many keyword tokens it contains.

    Args:
        text: Text to split into sentences.
        keywords: Keywords to look for; matching is case-insensitive.
        method: ``'combined'`` multiplies the keyword hit count by a length
            factor (``min(1, 10 / (tokens + 5))``) and a position factor
            (1.0 for sentences in the first tenth of the text, 0.8 otherwise).
            Any other value uses keyword hits divided by the token count.

    Returns:
        A dict mapping sentence index to its score.
    """
    sentences = sent_tokenize(text)
    # A set gives O(1) membership tests inside the per-token loop.
    keyword_set = {kw.lower() for kw in keywords}
    lead_cutoff = len(sentences) // 10  # loop-invariant, computed once

    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        words = word_tokenize(sentence.lower())
        keyword_score = sum(1 for word in words if word in keyword_set)
        if method == 'combined':
            length_factor = min(1.0, 10.0 / (len(words) + 5))
            position_factor = 1.0 if i < lead_cutoff else 0.8
            sentence_scores[i] = keyword_score * length_factor * position_factor
        else:
            sentence_scores[i] = keyword_score / max(len(words), 1)
    return sentence_scores

def generate_summary(text, keywords, num_sentences=7, method='combined'):
    """Build an extractive summary from the best-scoring sentences.

    The ``num_sentences`` sentences with the highest scores are selected and
    joined with single spaces in their original document order.
    """
    sentences = sent_tokenize(text)
    scores = score_sentences(text, keywords, method)

    # Stable sort: sentences with equal scores keep document order.
    ranked_indices = sorted(scores, key=scores.get, reverse=True)
    chosen = sorted(ranked_indices[:num_sentences])
    return " ".join(sentences[idx] for idx in chosen)

def visualize_keywords(keywords, title="Top Keywords"):
    """Draw a horizontal bar chart of the 15 most common keywords.

    Args:
        keywords: Iterable of keyword strings; duplicates are counted.
        title: Chart title.

    Bar lengths are occurrence counts within ``keywords``, so a list of
    unique keywords yields bars of length 1.
    """
    # Count once and size the axis from the distinct keywords actually
    # plotted; sizing it from len(keywords) mismatched bars and labels
    # whenever the input contained duplicates.
    top = Counter(keywords).most_common(15)[::-1]
    labels = [word for word, _ in top]
    counts = [count for _, count in top]
    y_pos = np.arange(len(top))

    plt.figure(figsize=(10, 6))  # Adjusted figure size for better GUI integration
    plt.barh(y_pos, counts, align='center')
    plt.yticks(y_pos, labels)
    plt.xlabel('Frequency/Importance')
    plt.ylabel('Keywords')
    plt.title(title)
    plt.tight_layout()
    plt.show()

def format_summary(summary, title):
    """Wrap a summary and its title in a styled HTML card.

    Both values are HTML-escaped before interpolation: the title carries an
    uploaded file name and the summary carries text extracted from a PDF, so
    neither may be allowed to inject markup into the notebook output.

    Args:
        summary: Summary text (converted with ``str()``).
        title: Heading shown above the summary (converted with ``str()``).

    Returns:
        The HTML fragment as a string.
    """
    from html import escape  # local import: the module is not imported at file level

    safe_title = escape(str(title))
    safe_summary = escape(str(summary))
    markup = f"""
    <div style="background-color: #f9f9f9; padding: 15px; border-radius: 5px; margin: 10px 0;">
        <h3 style="color: #2c3e50;">{safe_title}</h3>
        <div style="line-height: 1.6; text-align: justify;">{safe_summary}</div>
    </div>
    """
    return markup

# --- GUI with ipywidgets ---

def _wide_label_style():
    """Return a fresh style dict that lets a widget label take its natural width."""
    return {'description_width': 'initial'}


# PDF picker: restricted to .pdf files, several may be chosen at once.
file_upload = widgets.FileUpload(
    accept='.pdf', multiple=True,
    description='Upload PDF(s):', button_style='info',
    style=_wide_label_style(),
)

# Number of sentences in the generated summary (3-15, default 7).
summary_length_slider = widgets.IntSlider(
    value=7, min=3, max=15, step=1,
    description='Summary Length (Sentences):',
    style=_wide_label_style(),
)

# Keyword extraction strategy selector; each option is a (label, value) pair.
_KEYWORD_METHOD_OPTIONS = [
    ('TF-IDF', 'tfidf'),
    ('RAKE', 'rake'),
    ('Combined', 'combined'),
]
keyword_method_dropdown = widgets.Dropdown(
    options=_KEYWORD_METHOD_OPTIONS, value='combined',
    description='Keyword Method:',
    style=_wide_label_style(),
)

# Container that receives everything printed or displayed while processing.
output_area = widgets.Output()

# Button that starts the summarization run.
process_button = widgets.Button(
    description='Generate Summary', button_style='primary',
    tooltip='Click to process uploaded PDFs',
)

def _iter_uploaded_pdfs(upload_value):
    """Yield ``(filename, content)`` pairs from a ``FileUpload.value``.

    Accepts both the mapping form ``{name: {'content': ...}}`` (ipywidgets 7)
    and the sequence-of-dicts form ``({'name': ..., 'content': ...}, ...)``
    (ipywidgets 8).
    """
    if isinstance(upload_value, dict):
        for name, info in upload_value.items():
            yield name, info['content']
    else:
        for info in upload_value:
            yield info['name'], info['content']


# Function to handle button click
def on_process_button_clicked(b):
    """Summarize every uploaded PDF and render the results in ``output_area``.

    For each upload: extract and preprocess its text, pick keywords with the
    method chosen in the dropdown, print and plot the keywords, then display
    an HTML summary whose length comes from the slider.

    Args:
        b: The clicked button (supplied by ipywidgets; unused).
    """
    import tempfile  # local import: not part of the file-level imports

    keyword_extractors = {
        'tfidf': extract_keywords_tfidf,
        'rake': extract_keywords_rake,
        'combined': extract_keywords_combined,
    }

    with output_area:
        clear_output()
        uploads = list(_iter_uploaded_pdfs(file_upload.value))
        if not uploads:
            print("Please upload PDF files.")
            return

        summary_length = summary_length_slider.value
        # Honour the dropdown selection; unknown values fall back to 'combined'.
        extract_keywords = keyword_extractors.get(
            keyword_method_dropdown.value, extract_keywords_combined
        )

        for filename, file_content in uploads:
            # Stage the upload in a private temp file rather than under its
            # user-supplied name, so an existing file of the same name in the
            # working directory is never overwritten and then deleted.
            fd, tmp_path = tempfile.mkstemp(suffix='.pdf')
            try:
                with os.fdopen(fd, 'wb') as f:
                    f.write(file_content)

                print(f"\nProcessing: {filename}")
                text = extract_text_from_pdf(tmp_path)
            finally:
                os.remove(tmp_path)  # Clean up even when extraction fails

            if not text:
                print(f"Could not extract text from {filename}")
                continue

            processed_text = preprocess_text(text)
            keywords = extract_keywords(processed_text)

            print(f"\nTop Keywords for {filename}:")
            print(", ".join(keywords[:15]))
            visualize_keywords(keywords, f"Top Keywords in {filename}")

            summary = generate_summary(processed_text, keywords, num_sentences=summary_length)
            title = f"Summary of {filename}"
            display(HTML(format_summary(summary, title)))

        print("\nSummarization complete!")

# Register the click callback on the "Generate Summary" button.
process_button.on_click(on_process_button_clicked)

# Render the controls top to bottom, with the output container last.
_gui_controls = (
    file_upload,
    summary_length_slider,
    keyword_method_dropdown,
    process_button,
    output_area,
)
display(*_gui_controls)



ModuleNotFoundError: No module named 'numpy.strings'