In [1]:
pip install --upgrade gradio

Collecting gradio
  Downloading gradio-4.39.0-py3-none-any.whl.metadata (15 kB)
Collecting gradio-client==1.1.1 (from gradio)
  Downloading gradio_client-1.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-4.39.0-py3-none-any.whl (12.4 MB)
   ---------------------------------------- 0.0/12.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.4 MB ? eta -:--:--
   ---------------------------------------- 0.1/12.4 MB 1.3 MB/s eta 0:00:10
    --------------------------------------- 0.2/12.4 MB 1.7 MB/s eta 0:00:08
    --------------------------------------- 0.2/12.4 MB 1.5 MB/s eta 0:00:09
    --------------------------------------- 0.3/12.4 MB 1.5 MB/s eta 0:00:09
    --------------------------------------- 0.3/12.4 MB 1.5 MB/s eta 0:00:09
   - -------------------------------------- 0.4/12.4 MB 1.5 MB/s eta 0:00:09
   - -------------------------------------- 0.4/12.4 MB 1.4 MB/s eta 0:00:09
   - -------------------------------------- 0.5/12.4 MB 1.3 MB/s eta 0:00

In [2]:
pip install beautifulsoup4 nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
import traceback

In [4]:
#Import libraries and setup environment
import requests
import gradio as gr
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
import urllib.parse
import json
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load environment variables
load_dotenv()

# API keys (store these securely, preferably as environment variables)
NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
MEDIASTACK_KEY = os.getenv("MEDIASTACK_KEY")

print("Environment set up complete.")

Environment set up complete.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gefhz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gefhz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Caching functions
def cache_news(topic, language, articles):
    cache_dir = "news_cache"
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache_file = os.path.join(cache_dir, f"cache_{topic}_{language}.json")
    with open(cache_file, 'w') as f:
        json.dump({'timestamp': datetime.now().isoformat(), 'articles': articles}, f)
    print(f"Cached {len(articles)} articles for topic '{topic}' in language '{language}'")

def get_cached_news(topic, language):
    cache_dir = "news_cache"
    cache_file = os.path.join(cache_dir, f"cache_{topic}_{language}.json")
    try:
        with open(cache_file, 'r') as f:
            data = json.load(f)
            if datetime.fromisoformat(data['timestamp']) > datetime.now() - timedelta(hours=1):
                print(f"Retrieved {len(data['articles'])} cached articles for topic '{topic}' in language '{language}'")
                return data['articles']
    except FileNotFoundError:
        print(f"No cache found for topic '{topic}' in language '{language}'")
    return None

print("Caching functions defined.")

Caching functions defined.


In [6]:
# News fetching function
def fetch_news(api, query, from_date, to_date, language='en', sort='published_desc', limit=10):
    if api == "newsapi":
        sort_param = {
            "published_desc": "publishedAt",
            "published_asc": "publishedAt",
            "popularity": "popularity"
        }.get(sort, "publishedAt")
        url = f"http://newsapi.org/v2/everything?q={urllib.parse.quote(query)}&from={from_date}&to={to_date}&sortBy={sort_param}&pageSize={limit}&language={language}&apiKey={NEWSAPI_KEY}"
    elif api == "mediastack":
        mediastack_sort = "published_desc" if sort in ["published_desc", "popularity"] else "published_asc"
        url = f"http://api.mediastack.com/v1/news?access_key={MEDIASTACK_KEY}&keywords={urllib.parse.quote(query)}&date={from_date},{to_date}&sort={mediastack_sort}&limit={limit}&languages={language}"
    
    print(f"Fetching news from {api}...")
    print(f"URL: {url}")
    
    response = requests.get(url)
    data = response.json()
    
    print(f"Response status code: {response.status_code}")
    print(f"Response data: {data}")
    
    if api == "newsapi" and sort == "published_asc":
        data['articles'] = data['articles'][::-1]
    elif api == "mediastack" and sort == "popularity":
        data['data'] = sorted(data['data'], key=lambda x: x.get('published_at', ''), reverse=True)
    
    return data

print("News fetching function defined.")

     

News fetching function defined.


In [7]:
# Text preprocessing and summarization functions
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

def summarize_text(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Remove stopwords and tokenize
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    
    # Calculate word frequencies
    freq = FreqDist(words)
    
    # Calculate sentence scores based on word frequencies
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in word_tokenize(sentence.lower()):
            if word in freq:
                if i in sentence_scores:
                    sentence_scores[i] += freq[word]
                else:
                    sentence_scores[i] = freq[word]
    
    # Get the top N sentences with highest scores
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    
    # Combine the top sentences in their original order
    summary = ' '.join([sentences[i] for i in sorted(top_sentences)])
    
    return summary

def get_top_snippets(articles, n=3):
    def get_article_date(article):
        return article.get('publishedAt') or article.get('published_at') or ''
    
    sorted_articles = sorted(articles, key=get_article_date, reverse=True)
    snippets = []
    for article in sorted_articles[:n]:
        title = article.get('title', 'No title')
        description = article.get('description', 'No description')
        snippet = f"{title}: {description[:100]}..."
        snippets.append(snippet)
    return snippets

print("Text preprocessing and summarization functions defined.")



Text preprocessing and summarization functions defined.


In [8]:
#Main news summary
def get_news_summary(topic, language='en', sort='published_desc', limit=10):
    try:
        cached_articles = get_cached_news(topic, language)
        if cached_articles:
            articles = cached_articles
            print(f"Using cached articles. Number of articles: {len(articles)}")
        else:
            today = datetime.now().date()
            yesterday = today - timedelta(days=1)
            
            # Fetch news from both APIs
            newsapi_data = fetch_news("newsapi", topic, yesterday, today, language, sort, limit)
            mediastack_data = fetch_news("mediastack", topic, yesterday.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d"), language, sort, limit)
            
            # Combine articles from both sources
            articles = newsapi_data.get('articles', []) + mediastack_data.get('data', [])
            print(f"Fetched articles. NewsAPI: {len(newsapi_data.get('articles', []))}, MediaStack: {len(mediastack_data.get('data', []))}")
            
            if articles:
                cache_news(topic, language, articles)
        
        if not articles:
            return f"No articles found for the topic '{topic}'. Please try a different topic or check your API keys."
        
        print(f"Total number of articles: {len(articles)}")
        
        # Combine the content of all articles and preprocess
        all_content = " ".join([article.get('content', '') or article.get('description', '') for article in articles])
        preprocessed_content = preprocess_text(all_content)
        
        # Generate summary
        summary = summarize_text(preprocessed_content, num_sentences=3)
        
        # Get top snippets
        top_snippets = get_top_snippets(articles)
        
        # Prepare sources
        sources = set()
        for article in articles:
            if isinstance(article, dict):
                source = article.get('source', {})
                if isinstance(source, dict):
                    sources.add(source.get('name') or 'Unknown')
                else:
                    sources.add(source or 'Unknown')
            else:
                sources.add('Unknown')
        sources_str = ", ".join(sources)
        
        # Prepare the output
        output = f"Summary of recent news on '{topic}':\n\n{summary}\n\nTop Articles:\n"
        for i, snippet in enumerate(top_snippets, 1):
            output += f"{i}. {snippet}\n"
        output += f"\nSources: {sources_str}"
        
        return output
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(error_msg)
        return f"An error occurred while processing your request. Please try again later.\nError details: {str(e)}"

print("Main news summary function defined.")
    

Main news summary function defined.


In [9]:
#Gradio Interface
iface = gr.Interface(
    fn=get_news_summary,
    inputs=[
        gr.Textbox(label="Enter the topic you want a summary for:"),
        gr.Dropdown(choices=["en", "de", "es", "fr", "it", "nl", "no", "pt", "ru", "se", "zh"], label="Language", value="en"),
        gr.Dropdown(choices=["published_desc", "published_asc", "popularity"], label="Sort By", value="published_desc"),
        gr.Slider(minimum=1, maximum=25, step=1, label="Number of Articles", value=10)
    ],
    outputs=gr.Textbox(label="Summary and Sources"),
    title="News Summarizer",
    description="Get a summary of the latest news on a given topic from multiple sources."
)

print("Gradio interface defined.")

Gradio interface defined.


In [10]:
#Launch the Interface
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




No cache found for topic 'russia' in language 'en'
Fetching news from newsapi...
URL: http://newsapi.org/v2/everything?q=russia&from=2024-07-22&to=2024-07-23&sortBy=publishedAt&pageSize=18&language=en&apiKey=None
Response status code: 401
Response data: {'status': 'error', 'code': 'apiKeyInvalid', 'message': 'Your API key is invalid or incorrect. Check your key, or go to https://newsapi.org to create a free API key.'}
Fetching news from mediastack...
URL: http://api.mediastack.com/v1/news?access_key=None&keywords=russia&date=2024-07-22,2024-07-23&sort=published_desc&limit=18&languages=en
Response status code: 401
Response data: {'error': {'code': 'invalid_access_key', 'message': 'You have not supplied a valid API Access Key.'}}
Fetched articles. NewsAPI: 0, MediaStack: 0
No cache found for topic 'putin' in language 'en'
Fetching news from newsapi...
URL: http://newsapi.org/v2/everything?q=putin&from=2024-07-22&to=2024-07-23&sortBy=publishedAt&pageSize=18&language=en&apiKey=None
Respons