In [1]:
pip install --upgrade gradio

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install beautifulsoup4 nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
#Import libraries and set up environment
import requests
import gradio as gr
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
import urllib.parse
import json
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# Download the NLTK data used by the summarizer: the sentence/word tokenizer
# models and the stopword lists. Recent NLTK releases load the tokenizer from
# the 'punkt_tab' resource while older ones use 'punkt'; fetching both keeps
# the notebook working with whichever NLTK version the unpinned install gives.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load environment variables (from a local .env file, if one exists)
load_dotenv()

# API keys are read from the environment so they are never hardcoded here.
NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
MEDIASTACK_KEY = os.getenv("MEDIASTACK_KEY")

# Surface a missing key now; otherwise the literal string "None" would be sent
# to the news APIs as the key and the failure would only show up later.
for _key_name, _key_value in (("NEWSAPI_KEY", NEWSAPI_KEY), ("MEDIASTACK_KEY", MEDIASTACK_KEY)):
    if not _key_value:
        print(f"Warning: environment variable {_key_name} is not set; requests to that API will fail.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19727\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19727\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Caching functions
def _cache_path(topic, language, cache_dir="news_cache"):
    """Build the cache file path for a topic/language pair.

    Path separators, characters that are reserved in file names on common
    platforms, and control characters are replaced with '_' so that
    user-supplied text cannot escape ``cache_dir`` or produce an invalid
    file name. Plain topics (letters, digits, spaces, ...) map to the same
    file name as before. Topics that differ only in replaced characters
    share one cache file.
    """
    def _safe(part):
        return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', str(part))

    return os.path.join(cache_dir, f"cache_{_safe(topic)}_{_safe(language)}.json")


def cache_news(topic, language, articles):
    """Write ``articles`` to the on-disk cache for ``topic``/``language``.

    The cache file is JSON with two keys: 'timestamp' (ISO-8601 time of the
    write) and 'articles' (the value passed in, which must be JSON
    serializable). The cache directory is created if it does not exist.
    """
    cache_file = _cache_path(topic, language)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump({'timestamp': datetime.now().isoformat(), 'articles': articles}, f)


def get_cached_news(topic, language, max_age=timedelta(hours=1)):
    """Return cached articles for ``topic``/``language`` if they are fresh.

    Args:
        topic: Topic the articles were cached under.
        language: Language code the articles were cached under.
        max_age: Maximum age of a cache entry that is still returned
            (defaults to one hour).

    Returns:
        The cached 'articles' value, or None when there is no cache file,
        the file is unreadable or malformed, or the entry is older than
        ``max_age``.
    """
    cache_file = _cache_path(topic, language)
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        cached_at = datetime.fromisoformat(data['timestamp'])
        articles = data['articles']
    except (OSError, ValueError, KeyError, TypeError):
        # Missing file (OSError), invalid JSON or timestamp (ValueError), or an
        # unexpected payload shape (KeyError/TypeError): treat all of these as
        # a cache miss so the caller simply fetches fresh data.
        return None

    if cached_at > datetime.now() - max_age:
        return articles
    return None

In [5]:
#News Fetching Functions
def fetch_news(api, query, from_date, to_date, language='en', sort='published_desc', limit=10, timeout=10):
    """Query one news API and return its decoded JSON response.

    Args:
        api: Which backend to call: "newsapi" or "mediastack".
        query: Search keywords.
        from_date: Start of the date range (anything whose ``str()`` is a
            date the API accepts, e.g. a ``date`` or a 'YYYY-MM-DD' string).
        to_date: End of the date range, same format as ``from_date``.
        language: Language code as offered by the UI.
        sort: Sort order, using the mediastack-style names
            ('published_desc', 'published_asc', 'popularity').
        limit: Maximum number of articles to request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed JSON body of the response (including API error bodies;
        the HTTP status is not checked here).

    Raises:
        ValueError: If ``api`` is not a supported backend.
    """
    page_size = int(limit)  # UI sliders may hand over a float such as 10.0

    if api == "newsapi":
        # HTTPS so the API key is not sent in clear text.
        base_url = "https://newsapi.org/v2/everything"
        params = {
            "q": query,
            "from": from_date,
            "to": to_date,
            # NewsAPI has its own sort names and no ascending-date option, so
            # everything except 'popularity' maps to newest-first.
            "sortBy": "popularity" if sort == "popularity" else "publishedAt",
            "pageSize": page_size,
            # The UI uses 'se' for Swedish (mediastack's code); NewsAPI uses 'sv'.
            "language": "sv" if language == "se" else language,
            "apiKey": NEWSAPI_KEY,
        }
    elif api == "mediastack":
        # NOTE(review): left on HTTP because mediastack's free plan is
        # documented as HTTP-only; switch to https on a paid plan.
        base_url = "http://api.mediastack.com/v1/news"
        params = {
            "access_key": MEDIASTACK_KEY,
            "keywords": query,
            "date": f"{from_date},{to_date}",
            "sort": sort,
            "limit": page_size,
            "languages": language,
        }
    else:
        # Previously an unknown backend fell through to an UnboundLocalError.
        raise ValueError(f"Unsupported news API: {api!r} (expected 'newsapi' or 'mediastack')")

    # Percent-encode every value; keep ',' literal because it separates the
    # two dates in mediastack's `date` parameter.
    query_string = urllib.parse.urlencode(params, safe=",", quote_via=urllib.parse.quote)
    response = requests.get(f"{base_url}?{query_string}", timeout=timeout)
    return response.json()

In [6]:
#Text preprocessing functions
def summarize_text(text, num_sentences=5):
    """Build an extractive summary of ``text``.

    Each sentence is scored by summing the document-wide frequency of its
    alphanumeric, non-stopword (English) tokens. The ``num_sentences``
    highest-scoring sentences are returned, joined by spaces, in the order
    they appear in ``text``. Sentences containing no counted word are never
    selected.
    """
    sentence_list = sent_tokenize(text)

    # Document-wide frequencies of the meaningful (lower-cased) words.
    ignored = set(stopwords.words('english'))
    word_freq = FreqDist(
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in ignored
    )

    # Score only sentences that contain at least one counted word.
    scores = {}
    for idx, sent in enumerate(sentence_list):
        hits = [word_freq[tok] for tok in word_tokenize(sent.lower()) if tok in word_freq]
        if hits:
            scores[idx] = sum(hits)

    # Rank by score, keep the best N, then restore document order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    chosen = sorted(ranked[:num_sentences])

    return ' '.join(sentence_list[idx] for idx in chosen)

In [7]:
#Main news Summary function
def _article_text(article):
    """Return an article's 'content', else its 'description', else ''."""
    # Either field may be missing or None; always hand back a string so the
    # result can be joined safely.
    return article.get('content') or article.get('description') or ''


def _source_name(article):
    """Return the article's source name as a string, or None if unavailable.

    Handles both shapes the combined article list can contain: 'source' as a
    dict carrying a 'name' key, and 'source' as a plain string.
    """
    source = article.get('source')
    if isinstance(source, dict):
        source = source.get('name')
    return source if isinstance(source, str) and source else None


def get_news_summary(topic, language='en', sort='published_desc', limit=10):
    """Fetch recent news on ``topic`` and return a formatted text summary.

    Articles come from the one-hour cache when available; otherwise both
    backends are queried for yesterday..today, combined, and cached.

    Args:
        topic: Search keywords.
        language: Language code passed to the news APIs.
        sort: Sort order passed to the news APIs.
        limit: Maximum number of articles requested from each API.

    Returns:
        A string with the summary, the top article snippets and the list of
        sources, or a "no articles" message when nothing was found.
    """
    # NOTE(review): the cache key is (topic, language) only, so a cached
    # result is reused even if `sort` or `limit` changed in the meantime.
    cached_articles = get_cached_news(topic, language)
    if cached_articles:
        articles = cached_articles
    else:
        today = datetime.now().date()
        yesterday = today - timedelta(days=1)
        from_date = yesterday.strftime("%Y-%m-%d")
        to_date = today.strftime("%Y-%m-%d")

        # Fetch news from both APIs
        newsapi_data = fetch_news("newsapi", topic, from_date, to_date, language, sort, limit)
        mediastack_data = fetch_news("mediastack", topic, from_date, to_date, language, sort, limit)

        # Combine articles from both sources; `or []` also covers a key that
        # is present but null in an API response.
        articles = (newsapi_data.get('articles') or []) + (mediastack_data.get('data') or [])
        cache_news(topic, language, articles)

    if not articles:
        return "No articles found for the given topic."

    # Combine the content of all articles and preprocess
    all_content = " ".join(_article_text(article) for article in articles)
    # NOTE(review): preprocess_text and get_top_snippets are not defined in
    # the visible cells of this notebook (execution counts skip In [8]) —
    # confirm they exist, or Restart & Run All will fail here with NameError.
    preprocessed_content = preprocess_text(all_content)

    # Generate summary
    summary = summarize_text(preprocessed_content)

    # Get top snippets
    top_snippets = get_top_snippets(articles)

    # Prepare sources: unique, non-empty names in a stable (sorted) order.
    sources = sorted({name for name in map(_source_name, articles) if name})
    sources_str = ", ".join(sources)

    # Prepare the output
    output = f"Summary:\n\n{summary}\n\nTop Articles:\n"
    for i, snippet in enumerate(top_snippets, 1):
        output += f"{i}. {snippet}\n"
    output += f"\nSources: {sources_str}"

    return output

In [9]:
#Gradio Interface
# Gradio UI: four input controls, passed positionally to get_news_summary as
# (topic, language, sort, limit), and a single text box for the result.
iface = gr.Interface(
    title="News Summarizer",
    description="Get a summary of the latest news on a given topic from multiple sources.",
    fn=get_news_summary,
    inputs=[
        gr.Textbox(label="Enter the topic you want a summary for:"),
        gr.Dropdown(
            label="Language",
            value="en",
            choices=["en", "de", "es", "fr", "it", "nl", "no", "pt", "ru", "se", "zh"],
        ),
        gr.Dropdown(
            label="Sort By",
            value="published_desc",
            choices=["published_desc", "published_asc", "popularity"],
        ),
        gr.Slider(label="Number of Articles", value=10, minimum=1, maximum=25, step=1),
    ],
    outputs=gr.Textbox(label="Summary and Sources"),
)

In [10]:
# Launch the interface. Called with no arguments; the recorded output below
# shows the app being served on a local URL, and notes that passing
# share=True to launch() creates a public link.
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


