In [2]:
# import required packages
import os
from datetime import datetime, timedelta

import requests
from dotenv import load_dotenv
from gnews import GNews
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

load_dotenv('.env')

ModuleNotFoundError: No module named 'dotenv'

In [3]:
def date_x_month_ago(months):
    '''returns (year, month, day) of the date x month from today '''
    today = datetime.now()
    date_x = today - timedelta(days=30 * months)
    date_x = (date_x.year, date_x.month, date_x.day)
    return date_x

date_x_month_ago(12)

(2023, 9, 8)

In [5]:
def get_headlines(keyword, month_from_today, max_results):
    ''' returns [{title: v, description: v, published date: v, url: v, publisher:v }] from Google News API '''
    google_news = GNews(
        language='en',
        country='US',
        start_date=date_x_month_ago(month_from_today),
        max_results=max_results,
        end_date=None,
        exclude_websites=['yahoo.com', 'foxnews.com', 'msn.com']
        )

    return google_news.get_news(keyword)

get_headlines('rare earths', 24, 5)

NameError: name 'GNews' is not defined

In [42]:
# prompting to perform news summary
prompt_template = """Generate summary for the following text, using the following steps:
                     1. Summary consists of maximum 100 words
                     2. If the text cannot be found or error, return: "Content empty"
                     3. Use only materials from the text supplied
                     4. Do not talk about the text as if written by somebody else. Write it like the author would have done.
                     5. Name causes and responsibilities from the conflicts described.
                     6. If statistics are available, include them in the summary. Ensuring that all statistical data is accurately represented.
                     7. Create the summary in English

                    "{text}"
                    SUMMARY:"""

#  save as a template 
prompt = PromptTemplate.from_template(prompt_template)


# declare LLM model
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo-1106",
    api_key=os.environ['OPENAI_API_KEY']
    )

In [43]:
def generate_summary(search_keyword, month_from_today, max_results=5):

    # Get news headlines
    news_by_keyword = get_headlines(search_keyword, month_from_today, max_results)

    # Define output file path
    output_folder = "summaries"
    os.makedirs(output_folder, exist_ok=True)  # Create 'summaries' folder if it doesn't exist
    output_file = os.path.join(output_folder, f"{search_keyword}_summary.txt")

    # Write summaries to the file
    with open(output_file, 'w') as f:
        for item in news_by_keyword:
            # Extract news content
            loader = UnstructuredURLLoader(urls=[item['url']])
            docs = loader.load()

            if docs:
                # Summarize using stuff for easy processing
                chain = load_summarize_chain(llm,
                                            chain_type="stuff",
                                            prompt=prompt)
                summary = chain.run(docs)

                # find the original url not the 'news.google.com/rss...' thing
                original_url = requests.head(item['url'], allow_redirects=True)

                # remove the publisher name from the title
                index = item['title'].rfind(' - ')
                only_title = item['title'][:index]


                # Write summary to the file
                f.write(only_title + '\n')
                f.write(item['publisher']['title'] + ' ' + item['published date'][:-13] + '\n\n')
                f.write(original_url.url)
                f.write('\n\n')
                f.write(summary)
                f.write('\n'+'_'*20)
                f.write('\n\n\n\n')

            else:
            # Switch to a new item or handle the absence of loaded data
            # Your code here
                pass


In [40]:
generate_summary('Haiti', 24)

04/19/2024 03:04:48 AM - Reading document from string ...
04/19/2024 03:04:48 AM - Reading document ...
04/19/2024 03:04:51 AM - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
04/19/2024 03:04:52 AM - Reading document from string ...
04/19/2024 03:04:52 AM - Reading document ...
04/19/2024 03:04:52 AM - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
04/19/2024 03:04:54 AM - Reading document from string ...
04/19/2024 03:04:54 AM - Reading document ...
04/19/2024 03:04:56 AM - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
04/19/2024 03:04:57 AM - Reading document from string ...
04/19/2024 03:04:57 AM - Reading document ...
04/19/2024 03:04:59 AM - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


KeyboardInterrupt: 

In [12]:
news_by_keyword = get_headlines('Indian voting', 24, 5)

for item in news_by_keyword:
    print(item['title'])
    original_url = requests.head(item['url'], allow_redirects=True)
    # print(item['url'])
    print(original_url.url)


Lok Sabha election 2024: India begins voting in mammoth polls - The Associated Press
https://apnews.com/article/india-election-voting-begins-7ea0983c9b7bc7506c8457f44afcfd64


04/18/2024 04:20:27 AM - Reading document from string ...
04/18/2024 04:20:27 AM - Reading document ...


[Document(page_content='By\xa0\n\nASHOK SHARMA and\n\nKRUTIKA PATHI\n\n\n                    Share\n                    \n\nCopy\n    Link copied\n\nEmail\n\nFacebook\n\nX\n\nReddit\n\nLinkedIn\n\nPinterest\n\nFlipboard\n\nPrint\n\nNEW DELHI (AP) — Millions of Indians began voting Friday in a six-week election that’s a referendum on Narendra Modi, the populist prime minister who has championed an assertive brand of Hindu nationalist politics and is seeking a rare third term as the country’s leader.\n\nPeople began queuing up at polling stations hours before they were allowed in at 7 a.m. in the first 21 states to hold votes, from the Himalayan mountains to the tropical Andaman Islands. Nearly 970 million voters — more than 10% of the world’s population — will elect 543 members to the lower house of Parliament for five years during the staggered elections that run until June 1. The votes will be counted on June 4.\n\nThis election is seen as one of the most consequential in India’s hist

04/18/2024 04:20:35 AM - Reading document from string ...
04/18/2024 04:20:35 AM - Reading document ...


[Document(page_content="People queue to cast their votes in the country's West Bengal state on April 19, 2024.\n\nMuzaffarnagar and New Delhi\n\nCNN\n        \xa0—\n\nPolls opened Friday in the first and largest phase of India’s marathon election, in which populist Prime Minister Narendra Modi is widely expected to secure a rare third consecutive term and deepen his historic transformation of the world’s most populous nation.\n\nAbout 969 million people are eligible to vote in the biggest democratic exercise in human history, with polling taking place in seven phases over the next six weeks. Votes will be counted on June 4.\n\nIt is considered among the most consequential votes in decades, with Modi’s powerful right-wing Bharatiya Janata Party (BJP) seeking an outright majority in the lower house of parliament, or Lok Sabha.\n\nAn emphatic win for the BJP would give the party a mandate to\xa0further enshrine its Hindu-nationalist agenda, pulling away from India’s secular foundation tow

04/18/2024 04:20:36 AM - Reading document from string ...
04/18/2024 04:20:36 AM - Reading document ...


[Document(page_content='India first phase election updates: Modi seeks third term in mammoth vote\n\nVote pits ruling coalition led by PM Narendra Modi’s BJP against Indian National Developmental Inclusive Alliance, led by the Congress party.\n\nVideo Duration 02 minutes 43 seconds\n\n02:43\n\nBy\n\nNadim Asrar\xa0and\n\nUsaid Siddiqui\n\nPublished On 19 Apr 2024\n\n19 Apr 2024\n\nThis live page is now closed. You can continue to follow our coverage of India’s election here.', metadata={'source': 'https://news.google.com/rss/articles/CBMib2h0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vbmV3cy9saXZlYmxvZy8yMDI0LzQvMTkvaW5kaWEtZWxlY3Rpb24tMjAyNC1saXZlLW5ld3Mtdm90ZS1mb3ItbG9rLXNhYmhhLXNlYXRzLWJlZ2luc9IBc2h0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vYW1wL25ld3MvbGl2ZWJsb2cvMjAyNC80LzE5L2luZGlhLWVsZWN0aW9uLTIwMjQtbGl2ZS1uZXdzLXZvdGUtZm9yLWxvay1zYWJoYS1zZWF0cy1iZWdpbnM?oc=5&hl=en-US&gl=US&ceid=US:en'})]
In pictures: India votes in world's biggest election - BBC.com
https://www.bbc.com/news/world-asia-india-68852684

04/18/2024 04:20:36 AM - Reading document from string ...
04/18/2024 04:20:36 AM - Reading document ...


[Document(page_content='', metadata={'source': 'https://news.google.com/rss/articles/CBMiMmh0dHBzOi8vd3d3LmJiYy5jb20vbmV3cy93b3JsZC1hc2lhLWluZGlhLTY4ODUyNjg00gE2aHR0cHM6Ly93d3cuYmJjLmNvbS9uZXdzL3dvcmxkLWFzaWEtaW5kaWEtNjg4NTI2ODQuYW1w?oc=5&hl=en-US&gl=US&ceid=US:en'})]
Indians vote in huge election dominated by jobs, Hindu pride and Modi - Reuters India
https://www.reuters.com/world/india/india-votes-gigantic-election-modi-seeks-historic-third-term-2024-04-18/


04/18/2024 04:20:37 AM - Reading document from string ...
04/18/2024 04:20:37 AM - Reading document ...


[Document(page_content='Please enable JS and disable any ad blocker', metadata={'source': 'https://news.google.com/rss/articles/CBMibGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2luZGlhL2luZGlhLXZvdGVzLWdpZ2FudGljLWVsZWN0aW9uLW1vZGktc2Vla3MtaGlzdG9yaWMtdGhpcmQtdGVybS0yMDI0LTA0LTE4L9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en'})]


In [36]:
loader = UnstructuredURLLoader(urls=['https://www.aljazeera.com/news/liveblog/2024/4/19/india-election-2024-live-news-vote-for-lok-sabha-seats-begins'], mode='single')
docs = loader.load()

docs

04/18/2024 06:06:56 AM - Reading document from string ...
04/18/2024 06:06:56 AM - Reading document ...


[Document(page_content='India first phase election updates: Modi seeks third term in mammoth vote\n\nVote pits ruling coalition led by PM Narendra Modi’s BJP against Indian National Developmental Inclusive Alliance, led by the Congress party.\n\nVideo Duration 02 minutes 43 seconds\n\n02:43\n\nBy\n\nNadim Asrar\xa0and\n\nUsaid Siddiqui\n\nPublished On 19 Apr 2024\n\n19 Apr 2024\n\nThis live page is now closed. You can continue to follow our coverage of India’s election here.', metadata={'source': 'https://www.aljazeera.com/news/liveblog/2024/4/19/india-election-2024-live-news-vote-for-lok-sabha-seats-begins'})]

In [22]:
docs = loader.load()
docs

04/18/2024 04:24:47 AM - Reading document from string ...
04/18/2024 04:24:47 AM - Reading document ...


[Document(page_content='Please enable JS and disable any ad blocker', metadata={'source': 'https://news.google.com/rss/articles/CBMibGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2luZGlhL2luZGlhLXZvdGVzLWdpZ2FudGljLWVsZWN0aW9uLW1vZGktc2Vla3MtaGlzdG9yaWMtdGhpcmQtdGVybS0yMDI0LTA0LTE4L9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en'})]

In [None]:
news_by_keyword = get_headlines(search_keyword, month_from_today, max_results)

for item in news_by_keyword:
        # Extract news content
        loader = UnstructuredURLLoader(urls=[item['url']])
        docs = loader.load()


In [None]:
import requests
from bs4 import BeautifulSoup

def check_article_type(url):
    '''Tests the article for paywalls'''
    # Load the webpage content
    response = requests.get(url)
    if response.status_code != 200:
        print("Error: Failed to retrieve webpage content.")
        return "Unknown"

    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Check content length
    content = soup.get_text()
    content_length = len(content)

    # Define keywords related to regular articles
    regular_keywords = ["news", "article", "report", "story"]

    # Check for subscription prompts
    subscription_prompts = ["subscribe", "subscription", "paywall", "premium"]

    # Check if content length meets the threshold for a regular article
    if content_length > 1000:  # Adjust the threshold as needed
        # Check for keywords related to regular articles
        for keyword in regular_keywords:
            if keyword in content.lower():
                # Check for subscription prompts
                for prompt in subscription_prompts:
                    if prompt in content.lower():
                        return "Paywall Article"
                return "Regular Article"

    # If none of the criteria are met, classify as unknown
    return "Unknown"

# Example usage:
url = "https://www.example.com/article"
article_type = check_article_type(url)
print("Article Type:", article_type)
