In [1]:
!pip install gradio requests python-dotenv beautifulsoup4 nltk tensorflow reportlab



In [2]:
#Import necessary libraries
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
from dotenv import load_dotenv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
import gradio as gr
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from io import BytesIO
import re
from bs4 import BeautifulSoup

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load environment variables
load_dotenv()

print("All libraries imported successfully.")

All libraries imported successfully.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19727\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19727\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#Add the Guardian API Fetching code:
API_KEY = os.getenv('GUARDIAN_API_KEY')
BASE_URL = "https://content.guardianapis.com/search"

def fetch_articles(start_date, end_date, section):
    articles = []
    current_date = start_date
    while current_date <= end_date:
        params = {
            'api-key': API_KEY,
            'section': section,
            'from-date': current_date.strftime("%Y-%m-%d"),
            'to-date': (current_date + timedelta(days=1)).strftime("%Y-%m-%d"),
            'show-fields': 'bodyText',
            'page-size': 50
        }
        response = requests.get(BASE_URL, params=params)
        data = response.json()
        articles.extend(data['response']['results'])
        current_date += timedelta(days=1)
        time.sleep(1)  # Rate limiting
    return articles

print("Guardian API fetching function defined.")

Guardian API fetching function defined.


In [4]:
#Add the text processing and summarization functions:
def preprocess_text(text):
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = ' '.join(text.split())
    return text

def summarize_text(text, num_sentences=3):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    
    freq = FreqDist(words)
    
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in word_tokenize(sentence.lower()):
            if word in freq:
                if i in sentence_scores:
                    sentence_scores[i] += freq[word]
                else:
                    sentence_scores[i] = freq[word]
    
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = ' '.join([sentences[i] for i in sorted(top_sentences)])
    
    return summary

def get_top_snippets(articles, n=3):
    def get_article_date(article):
        return article.get('webPublicationDate', '')
    
    sorted_articles = sorted(articles, key=get_article_date, reverse=True)
    snippets = []
    for article in sorted_articles[:n]:
        title = article.get('webTitle', 'No title')
        body = article.get('fields', {}).get('bodyText', 'No description')
        snippet = f"{title}: {body[:100]}..."
        snippets.append(snippet)
    return snippets

print("Text processing and summarization functions defined.")

Text processing and summarization functions defined.


In [5]:
# Load the guardian model and define the predication function:
model = load_model('guardian_article_classifier_final.h5')
with open('tokenizer_final.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('label_encoder_final.pickle', 'rb') as handle:
    label_encoder = pickle.load(handle)

max_len = 200  # Make sure this matches the value used during training

def predict_article_category(title, body, model, tokenizer, label_encoder):
    text = f"{title} {body}"
    processed_text = preprocess_text(text)
    text_seq = tokenizer.texts_to_sequences([processed_text])
    text_pad = pad_sequences(text_seq, maxlen=max_len)
    prediction = model.predict(text_pad)
    predicted_class_index = np.argmax(prediction, axis=1)[0]
    predicted_class = label_encoder.classes_[predicted_class_index]
    return predicted_class

print("Guardian model loaded and prediction function defined.")

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'guardian_article_classifier_final.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
#Add the main news summary function:
def get_news_summary(topic, language='en', sort='newest', limit=10):
    end_date = datetime.now()
    start_date = end_date - timedelta(days=7)
    sections = ['politics', 'business', 'technology', 'sport', 'culture']
    all_articles = []
    
    for section in sections:
        all_articles.extend(fetch_articles(start_date, end_date, section))
    
    # Filter articles based on the topic
    filtered_articles = [article for article in all_articles if topic.lower() in article['webTitle'].lower()]
    
    # Sort articles
    if sort == 'newest':
        filtered_articles.sort(key=lambda x: x['webPublicationDate'], reverse=True)
    elif sort == 'oldest':
        filtered_articles.sort(key=lambda x: x['webPublicationDate'])
    
    # Limit the number of articles
    filtered_articles = filtered_articles[:limit]
    
    # Classify articles
    classified_articles = []
    for article in filtered_articles:
        title = article.get('webTitle', '')
        body = article.get('fields', {}).get('bodyText', '')
        category = predict_article_category(title, body, model, tokenizer, label_encoder)
        article['category'] = category
        classified_articles.append(article)

    # Group articles by category
    categorized_articles = {}
    for article in classified_articles:
        category = article['category']
        if category not in categorized_articles:
            categorized_articles[category] = []
        categorized_articles[category].append(article)

    # Generate summary for each category
    output = f"Summary of recent news on '{topic}':\n\n"
    for category, cat_articles in categorized_articles.items():
        output += f"{category.upper()}:\n"
        cat_content = " ".join([art.get('fields', {}).get('bodyText', '') for art in cat_articles])
        cat_summary = summarize_text(cat_content, num_sentences=2)
        output += f"{cat_summary}\n\n"

    # Get top snippets and sources
    top_snippets = get_top_snippets(filtered_articles)
    output += "Top Articles:\n"
    for i, snippet in enumerate(top_snippets, 1):
        output += f"{i}. {snippet}\n"

    sources = set(article.get('sectionName', 'Unknown') for article in filtered_articles)
    output += f"\nSources: {', '.join(sources)}"

    return output

print("Main news summary function defined.")

In [None]:
#Add the PDF generation function:
def generate_pdf(content):
    buffer = BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []

    # Add title
    story.append(Paragraph("News Summary", styles['Title']))
    story.append(Spacer(1, 12))

    # Add content
    for line in content.split('\n'):
        if line.strip():
            story.append(Paragraph(line, styles['BodyText']))
            story.append(Spacer(1, 6))

    doc.build(story)
    buffer.seek(0)
    return buffer

print("PDF generation function defined.")

In [None]:
#Create the Gradio interface:
def get_news_summary_with_pdf(topic, language='en', sort='newest', limit=10):
    text_output = get_news_summary(topic, language, sort, limit)
    pdf_buffer = generate_pdf(text_output)
    return text_output, pdf_buffer

iface = gr.Interface(
    fn=get_news_summary_with_pdf,
    inputs=[
        gr.Textbox(label="Enter the topic you want a summary for:"),
        gr.Dropdown(choices=["en"], label="Language", value="en"),
        gr.Dropdown(choices=["newest", "oldest"], label="Sort By", value="newest"),
        gr.Slider(minimum=1, maximum=25, step=1, label="Number of Articles", value=10)
    ],
    outputs=[
        gr.Textbox(label="Summary and Sources"),
        gr.File(label="Download PDF")
    ],
    title="Neural Newsroom",
    description="Get a summary of the latest news on a given topic from The Guardian, classified by our model. You can also download the summary as a PDF.",
    examples=[["climate change"], ["artificial intelligence"], ["global economy"]],
)

iface.launch()

print("Gradio interface launched. You can now interact with the application.")