In [4]:
import pandas as pd
import requests
import json

from gnews import GNews
from newspaper import Article

## defining functions

In [10]:
def extract_article_newspaper(url):
    try:
        response = requests.get(url, timeout=15)  # will give up if it takes longer than 15 sec
        response.raise_for_status()
        if response.status_code == 200:
            
            article = Article(url)
            article.download()
            article.parse()
            content = article.text
    
            #article.nlp()
            #keywords = article.keywords
            print("article collected")
            #return content, keywords
            return content
        
        else:
            print(f"Failed to fetch URL: {url}")
            return None
        
    except requests.Timeout:
        print(f"Timeout occurred while fetching URL: {url}")
        return None
    
    except requests.RequestException as e:
        print(f"Error occurred while fetching URL: {url}. Error: {e}")
        return None
    
    except Exception as ex:
        print(f"Error occurred during article extraction for URL: {url}. Error: {ex}")
        return None

In [11]:
def append_to_csv(data, filename):
    try:
        existing_df = pd.read_csv(filename)
    
        # append the new data to the existing one
        updated_df = pd.concat([existing_df, data], ignore_index=True)
        updated_df.drop_duplicates(inplace=True)
    except FileNotFoundError:
        updated_df = data
        
    #write to same filename
    updated_df.to_csv(filename, index=False)

In [25]:
def pull_data(keyword, start_date, end_date):
    
    google_news = GNews(language='en', country='US', start_date=start_date, end_date=end_date)
    
    news = google_news.get_news(keyword)
    
    #change from json to dataframe
    news_df = pd.DataFrame(news)

    # remove 'href' part of the publisher column and rename it to source
    news_df['source'] = news_df['publisher'].apply(lambda x: x['title'])

    # drop the original publisher column
    news_df.drop(columns=['publisher'], inplace=True)
    
    news_df['label'] = keyword
    
    # pull content using teh extract_article_newspaper function
    news_df['extracted_content'] = ''
    news_df['extracted_content'] = news_df['url'].apply(extract_article_newspaper)
    
    news_df = news_df.dropna(subset=['extracted_content'])
    
    # combine title and description 
    news_df['title_extracted_content'] = news_df['title'].str.cat(news_df['extracted_content'], sep=' ')
    
    # append any new results to the econ_news.csv using the append_to_csv function
    filename= 'data/labeled_articles.csv'
    append_to_csv(news_df, filename)
    
    print(f"results saved to {filename}")

## Collecting urls for each topic

In [33]:
google_news = GNews(language='en', country='US')
start_date=(2016,1,1)
end_date=(2024,4,25)
topics=['inflation','employment wages','housing','GDP','stock market','national debt']

for keyword in topics:
    pull_data(keyword, start_date, end_date)

article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiRmh0dHBzOi8vbW4uZ292L2RlZWQvbmV3c2NlbnRlci9wdWJsaWNhdGlvbnMvdHJlbmRzL2p1bmUtMjAyMi93YWdlcy5qc3DSAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: HTTPSConnectionPool(host='mn.gov', port=443): Max retries exceeded with url: /deed/newscenter/publications/trends/june-2022/wages.jsp (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
Error occurred during article extraction for URL: https://news.google.com/rss/articles/CBMiO2h0dHBzOi8vd3d3

article collected
article collected
article collected
results saved to data/labeled_articles.csv
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiVWh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy93ZXN0L25ld3MtcmVsZWFzZS9jb3VudHllbXBsb3ltZW50YW5kd2FnZXNfY2FsaWZvcm5pYS5odG3SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/west/news-release/countyemploymentandwages_california.htm
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiVWh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy93ZXN0L25ld3MtcmVsZWFzZS9jb3VudHllbXBsb3ltZW50YW5kd2FnZXNfd2FzaGluZ3Rvbi5odG3SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/west/news-release/countyemploymentandwages_washington.htm
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiVmh

article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiW2h0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy9zb3V0aGVhc3QvbmV3cy1yZWxlYXNlL2NvdW50eWVtcGxveW1lbnRhbmR3YWdlc19taXNzaXNzaXBwaS5odG3SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/southeast/news-release/countyemploymentandwages_mississippi.htm
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiXWh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy9zb3V0aGVhc3QvbmV3cy1yZWxlYXNlL2NvdW50eWVtcGxveW1lbnRhbmR3YWdlc19zb3V0aGNhcm9saW5hLmh0bdIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/southeast/news-release/countyemploymentandwages_southcarolina.htm
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiVWh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy9taWR3ZXN0L25ld3MtcmVsZWFzZS9jb3VudHllbXBsb3ltZW50YW5kd2FnZXNfaW5kaWFuYS5odG3SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error

Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy9zb3V0aGVhc3QvbmV3cy1yZWxlYXNlL2NvdW50eWVtcGxveW1lbnRhbmR3YWdlc19nZW9yZ2lhLmh0bdIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/southeast/news-release/countyemploymentandwages_georgia.htm
Timeout occurred while fetching URL: https://news.google.com/rss/articles/CBMijwFodHRwczovL3d3dy5wb2xpY3ltYXR0ZXJzb2hpby5vcmcvcmVzZWFyY2gtcG9saWN5L2ZhaXItZWNvbm9teS93b3JrLXdhZ2VzL21pbmltdW0td2FnZS93b3JraW5nLWZvci1sZXNzLXRvby1tYW55LWpvYnMtc3RpbGwtcGF5LXRvby1saXR0bGUtMjAxOdIBAA?oc=5&hl=en-US&gl=US&ceid=US:en
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiGGh0dHBzOi8vd3d3LmJscy5nb3YvZW1wL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/emp/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiQ2h0dHBzOi8vd3d3LnNjaWVuY2VkaXJlY3QuY

Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiwwFodHRwczovL3d3dy5nb3YuY2EuZ292LzIwMjEvMDkvMjgvZ292ZXJub3ItbmV3c29tLXNpZ25zLWxlZ2lzbGF0aW9uLXRvLWluY3JlYXNlLWFmZm9yZGFibGUtaG91c2luZy1zdXBwbHktYW5kLXN0cmVuZ3RoZW4tYWNjb3VudGFiaWxpdHktaGlnaGxpZ2h0cy1jb21wcmVoZW5zaXZlLXN0cmF0ZWd5LXRvLXRhY2tsZS1ob3VzaW5nLWNyaXNpcy_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.gov.ca.gov/2021/09/28/governor-newsom-signs-legislation-to-increase-affordable-housing-supply-and-strengthen-accountability-highlights-comprehensive-strategy-to-tackle-housing-crisis/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMic2h0dHBzOi8vd3d3LmFtZXJpY2FucHJvZ3Jlc3Mub3JnL2FydGljbGUvdGhlLXJlbnRhbC1ob3VzaW5nLWNyaXNpcy1pcy1hLXN1cHBseS1wcm9ibGVtLXRoYXQtbmVlZHMtc3VwcGx5LXNvbHV0aW9ucy_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.americanprogress.org/article/the-rental-housing-crisis-is-a-

article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiSmh0dHBzOi8vd3d3Lmdvdi5jYS5nb3YvMjAyMi8wOS8yOC9jYWxpZm9ybmlhLXRvLWJ1aWxkLW1vcmUtaG91c2luZy1mYXN0ZXIv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.gov.ca.gov/2022/09/28/california-to-build-more-housing-faster/
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMijgFodHRwczovL3d3dy5nb3YuY2EuZ292LzIwMjEvMDkvMTYvZ292ZXJub3ItbmV3c29tLXNpZ25zLWhpc3RvcmljLWxlZ2lzbGF0aW9uLXRvLWJvb3N0LWNhbGlmb3JuaWFzLWhvdXNpbmctc3VwcGx5LWFuZC1maWdodC10aGUtaG91c2luZy1jcmlzaXMv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.gov.ca.gov/2021/09/16/governor-newsom-signs-historic-legislation-to-boost-californias-housing-supply-and-fight-the-housing-crisis/
article collected
article collected
article collec

article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiaGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvYXNpYS9jaGluYXMtcTItZ2RwLXNlZW4tcmlzaW5nLTczLXl5LWxvdy1iYXNlLXJlY292ZXJ5LWZhZGVzLTIwMjMtMDctMTMv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/asia/chinas-q2-gdp-seen-rising-73-yy-low-base-recovery-fades-2023-07-13/
results saved to data/labeled_articles.csv
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiTmh0dHBzOi8vd3d3Lndzai5jb20vc3RvcnkvdGVlbmFnZXJzLWFyZS1wb3VyaW5nLWludG8tdGhlLXN0b2NrLW1hcmtldC0yZDlmOWZjM9IBTmh0dHBzOi8vd3d3Lndzai5jb20vc3RvcnkvdGVlbmFnZXJzLWFyZS1wb3VyaW5nLWludG8tdGhlLXN0b2NrLW1hcmtldC0yZDlmOWZjMw?oc=5&hl=en-US&gl=US&ceid=

article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiWWh0dHBzOi8vd3d3Lndzai5jb20vZmluYW5jZS9zdG9ja3Mvd2h5LXRoZS1zdG9jay1tYXJrZXQta2VlcHMtY2hhbmdpbmctaXRzLXN0b3J5LTMxOGJiOWM20gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/finance/stocks/why-the-stock-market-keeps-changing-its-story-318bb9c6
article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2xpdmVjb3ZlcmFnZS9zdG9jay1tYXJrZXQtdG9kYXktMDQxNjI00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.barrons.com/livecoverage/stock-market-today-041624
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2xpdmVjb3ZlcmFnZS9zdG9jay1tYXJrZXQtdG9kYXktMDIyMDI00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error:

article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2xpdmVjb3ZlcmFnZS9zdG9jay1tYXJrZXQtdG9kYXktMDQwMTI00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.barrons.com/livecoverage/stock-market-today-040124
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vd3d3Lndzai5jb20vbGl2ZWNvdmVyYWdlL3N0b2NrLW1hcmtldC10b2RheS1kb3ctam9uZXMtZWFybmluZ3MtMDItMDktMjAyNNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/livecoverage/stock-market-today-dow-jones-earnings-02-09-2024
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2xpdmVjb3ZlcmFnZS9zdG9jay1tYXJrZXQtdG9kYXktMDExNjI00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.barrons.com/livecoverage/stock-market-today-011624
Error o

article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiQ2h0dHBzOi8vZWNvbm9mYWN0Lm9yZy9yaXNpbmctY29zdHMtb2YtZmluYW5jaW5nLXUtcy1nb3Zlcm5tZW50LWRlYnTSAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://econofact.org/rising-costs-of-financing-u-s-government-debt
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMieGh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMTYvMDUvMDcvdXMvcG9saXRpY3MvZG9uYWxkLXRydW1wcy1pZGVhLXRvLWN1dC1uYXRpb25hbC1kZWJ0LWdldC1jcmVkaXRvcnMtdG8tYWNjZXB0LWxlc3MuaHRtbNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.nytimes.com/2016/05/07/us/politics/donald-trumps-idea-to-cut-national-debt-get-creditors-to-accept-less.html
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiRGh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjMvMD

In [44]:
labeled_df = pd.read_csv('data/labeled_articles.csv')

#drop nas
labeled_df = labeled_df.dropna(subset=['extracted_content'])

# combine text columns
labeled_df['title_extracted_content'] = labeled_df['title'].str.cat(labeled_df['extracted_content'], sep=' ')

labeled_df['label'] = labeled_df['label'].replace('housing costs', 'housing')

In [46]:
labeled_df['label'].value_counts()

label
housing             177
inflation           170
GDP                 163
national debt       161
stock market        106
employment wages     88
Name: count, dtype: int64

In [47]:
# save to csv 
labeled_df.to_csv('data/labeled_articles.csv')

In [50]:
google_news = GNews(language='en', country='US')

infation_news = google_news.get_news('Inflation')
infation_news_df = pd.DataFrame(infation_news)

employment_news = google_news.get_news('employment wages')
employment_news_df = pd.DataFrame(employment_news)

housing_news = google_news.get_news('housing costs')
housing_news_df = pd.DataFrame(housing_news)

GDP_news = google_news.get_news('GDP')
GDP_news_df = pd.DataFrame(GDP_news)

stocks_news = google_news.get_news('stock market')
stocks_news_df = pd.DataFrame(stocks_news)

debt_news = google_news.get_news('national debt')
debt_news_df = pd.DataFrame(debt_news)

debt_news_df

# GRACE MAKE THIS INTO A FUNCTION

Unnamed: 0,title,description,published date,url,publisher
0,US debt: Why economists are so worried about h...,US debt: Why economists are so worried about h...,"Sat, 20 Apr 2024 12:16:00 GMT",https://news.google.com/rss/articles/CBMibWh0d...,"{'href': 'https://www.businessinsider.com', 't..."
1,IMF sounds alarm on ballooning US national deb...,IMF sounds alarm on ballooning US national deb...,"Thu, 18 Apr 2024 17:49:00 GMT",https://news.google.com/rss/articles/CBMiaWh0d...,"{'href': 'https://www.foxbusiness.com', 'title..."
2,"I tried to wipe out the national debt, but the...","I tried to wipe out the national debt, but the...","Fri, 19 Apr 2024 09:30:00 GMT",https://news.google.com/rss/articles/CBMiZmh0d...,"{'href': 'https://www.post-gazette.com', 'titl..."
3,How Debt-to-GDP Ratios Have Changed Since 2000...,How Debt-to-GDP Ratios Have Changed Since 2000...,"Thu, 18 Apr 2024 18:37:46 GMT",https://news.google.com/rss/articles/CBMiUGh0d...,"{'href': 'https://www.visualcapitalist.com', '..."
4,Pressure Mounts at IMF Against Blueprint for E...,Pressure Mounts at IMF Against Blueprint for E...,"Sun, 21 Apr 2024 12:00:00 GMT",https://news.google.com/rss/articles/CBMib2h0d...,"{'href': 'https://www.bloomberg.com', 'title':..."
...,...,...,...,...,...
95,Peterson Foundation Statement on National Debt...,Peterson Foundation Statement on National Debt...,"Tue, 02 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMicGh0d...,"{'href': 'https://www.pgpf.org', 'title': 'Pet..."
96,The National Debt Hit A Record $34 Trillion: H...,The National Debt Hit A Record $34 Trillion: H...,"Wed, 03 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMibGh0d...,"{'href': 'https://www.investopedia.com', 'titl..."
97,Top 10 Reasons Why the National Debt Matters -...,Top 10 Reasons Why the National Debt Matters ...,"Mon, 12 Feb 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQWh0d...,"{'href': 'https://www.pgpf.org', 'title': 'Pet..."
98,Interest payments on the nation’s debt are soa...,Interest payments on the nation’s debt are soa...,"Thu, 16 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,"{'href': 'https://www.cnn.com', 'title': 'CNN'}"


In [56]:
topics = ['inflation','employment wages','housing costs','GDP','stock market','national debt']

df = []

for topic in topics:
    df_topic = pull_topic_url(topic)
    df.append(df_topic)
    
final_df = pd.concat(df, ignore_index=True)

In [57]:
final_df

Unnamed: 0,title,description,published date,url,source,label
0,Fed's favorite inflation gauge and Big Tech ea...,Fed's favorite inflation gauge and Big Tech ea...,"Sun, 21 Apr 2024 11:42:05 GMT",https://news.google.com/rss/articles/CBMilgFod...,Yahoo Finance,inflation
1,Trump wins voters on inflation as Biden zeroes...,Trump wins voters on inflation as Biden zeroes...,"Sun, 21 Apr 2024 19:33:40 GMT",https://news.google.com/rss/articles/CBMiVWh0d...,CNBC,inflation
2,Inflation Mindset Taking Root in Japan Boosts ...,Inflation Mindset Taking Root in Japan Boosts ...,"Mon, 22 Apr 2024 00:30:00 GMT",https://news.google.com/rss/articles/CBMic2h0d...,Bloomberg,inflation
3,March's PCE Inflation Expected To Support Late...,March's PCE Inflation Expected To Support Late...,"Sun, 21 Apr 2024 23:14:42 GMT",https://news.google.com/rss/articles/CBMidWh0d...,Forbes,inflation
4,Why Haven't We Whipped inflation Yet? - AIER -...,Why Haven't We Whipped inflation Yet? AIER - ...,"Fri, 19 Apr 2024 10:55:14 GMT",https://news.google.com/rss/articles/CBMiQWh0d...,AIER - Daily Economy News,inflation
...,...,...,...,...,...,...
595,Peterson Foundation Statement on National Debt...,Peterson Foundation Statement on National Debt...,"Tue, 02 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMicGh0d...,Peterson Foundation,national debt
596,The National Debt Hit A Record $34 Trillion: H...,The National Debt Hit A Record $34 Trillion: H...,"Wed, 03 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMibGh0d...,Investopedia,national debt
597,Top 10 Reasons Why the National Debt Matters -...,Top 10 Reasons Why the National Debt Matters ...,"Mon, 12 Feb 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQWh0d...,Peterson Foundation,national debt
598,Interest payments on the nation’s debt are soa...,Interest payments on the nation’s debt are soa...,"Thu, 16 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,CNN,national debt


In [62]:
final_df['extracted_content'] = ''
#econ_news_df['extracted_keywords'] = ''

#econ_news_df[['extracted_content', 'extracted_keywords']] = econ_news_df['url'].apply(extract_article_newspaper)
final_df['extracted_content'] = final_df['url'].apply(extract_article_newspaper)


article collected
article collected
article collected
Error occurred during article extraction for URL: https://news.google.com/rss/articles/CBMidWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vc2l0ZXMvc2ltb25tb29yZS8yMDI0LzA0LzIxL21hcmNocy1wY2UtaW5mbGF0aW9uLWV4cGVjdGVkLXRvLXN1cHBvcnQtbGF0ZXItaW50ZXJlc3QtcmF0ZS1jdXRzL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/simonmoore/2024/04/21/marchs-pce-inflation-expected-to-support-later-interest-rate-cuts/ on URL https://news.google.com/rss/articles/CBMidWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vc2l0ZXMvc2ltb25tb29yZS8yMDI0LzA0LzIxL21hcmNocy1wY2UtaW5mbGF0aW9uLWV4cGVjdGVkLXRvLXN1cHBvcnQtbGF0ZXItaW50ZXJlc3QtcmF0ZS1jdXRzL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiQWh0dHBzOi8vd3d3LmFpZXIub3JnL2FydGljbGUvd2h5LWhhdmVudC13ZS13aGlwcGVkLWluZmxhdGlvbi15ZXQv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error

article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiWGh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjQvMDQvMTEvYnVzaW5lc3MvZWNvbm9teS9mZWRlcmFsLXJlc2VydmUtc29mdC1uby1sYW5kaW5nLmh0bWzSAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/04/11/business/economy/federal-reserve-soft-no-landing.html
article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiQmh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjQvMDQvMTAvYnVzaW5lc3MvY3BpLWluZmxhdGlvbi1mZWQuaHRtbNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/04/10/business/cpi-inflation-fed.html
article collected
Error occurred while fetching URL

Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiUmh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy93ZXN0L25ld3MtcmVsZWFzZS9jb3VudHllbXBsb3ltZW50YW5kd2FnZXNfYXJpem9uYS5odG3SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/west/news-release/countyemploymentandwages_arizona.htm
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMia2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvdXMvdXMtam9iLWdyb3d0aC1iZWF0cy1leHBlY3RhdGlvbnMtZGVjZW1iZXItd2FnZXMtcmlzZS1zb2xpZGx5LTIwMjQtMDEtMDUv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/us/us-job-growth-beats-expectations-december-wages-rise-solidly-2024-01-05/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiUGh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy93ZXN0L25ld3MtcmVsZWFzZS9lbXBsb3ltZW50Y29zdGluZGV4X2xvc2FuZ2VsZXMua

article collected
Timeout occurred while fetching URL: https://news.google.com/rss/articles/CBMiRWh0dHBzOi8vd3d3LnNhY2JlZS5jb20vbmV3cy9kYXRhYmFzZXMvc3RhdGUtcGF5L2FydGljbGUyMjk0Njg1NDkuaHRtbNIBRWh0dHBzOi8vYW1wLnNhY2JlZS5jb20vbmV3cy9kYXRhYmFzZXMvc3RhdGUtcGF5L2FydGljbGUyMjk0Njg1NDkuaHRtbA?oc=5&hl=en-US&gl=US&ceid=US:en
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LnBheWNvci5jb20vcmVzb3VyY2UtY2VudGVyL2FydGljbGVzL21pbmltdW0td2FnZS10aXBwZWQtZW1wbG95ZWVzLWJ5LXN0YXRlL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.paycor.com/resource-center/articles/minimum-wage-tipped-employees-by-state/
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMif2h0dHBzOi8vdGhlY29udmVyc2F0aW9uLmNvbS93aHktZG9udC1hdXN0cmFsaWFucy10YWxrLWFib3V0LXRoZWlyLXNhbGFyaWVzLXBheS10cmFuc3BhcmVuY3ktYW5kLWZhaXJuZXNzLWdvLWhhbmQtaW4taGFuZC0yMjQwNjfSAQA?oc=5&hl=en-US

Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiU2h0dHBzOi8vZmluYW5jZS55YWhvby5jb20vbmV3cy9iaWdnZXN0LWV4cGVuc2VzLXJldGlyZWVzLWV2ZXJ5LXN0YXRlLTIzMDAxODc4NS5odG1s0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 404 Client Error: Not Found for url: https://finance.yahoo.com/news/biggest-expenses-retirees-every-state-230018785.html
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiU2h0dHBzOi8vd3d3Lndzai5jb20vZWNvbm9teS9ob3VzaW5nL2hvbWUtc2FsZXMtbW9ydGdhZ2UtcmF0ZXMtbWFyY2gtcmVwb3J0LTRhNTAxNTM30gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/economy/housing/home-sales-mortgage-rates-march-report-4a501537
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiXGh0dHBzOi8vd3d3LnNmY2hyb25pY2xlLmNvbS9zZi9hcnRpY2xlL3NmLWhvbWVsZXNzLWhvdXNpbmctY3V0LWNvc3RzLXRlbmRlcmxvaW4tMTkzOTY1NzUucGhw0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Err

article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiRGh0dHBzOi8vd3d3LmF4aW9zLmNvbS8yMDI0LzAzLzExL2JpZGVuLXVzLWhvdXNpbmctY3Jpc2lzLTIwMjUtYnVkZ2V00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.axios.com/2024/03/11/biden-us-housing-crisis-2025-budget
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMifGh0dHBzOi8vdW5pZG9zdXMub3JnL2Jsb2cvMjAyNC8wMi8yMS9ob3VzaW5nLWNvc3RzLWRyaXZpbmctbGF0aW5vLXZvdGVyLWNvbmNlcm5zLWFib3V0LWluZmxhdGlvbi1haGVhZC1vZi10aGUtMjAyNC1lbGVjdGlvbi_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://unidosus.org/blog/2024/02/21/housing-costs-driving-latino-voter-concerns-about-inflation-ahead-of-the-2024-election/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiTWh0dHBzOi8vd3d3LnBvbGl0aWNvLmNvbS9uZXdzLzIwMjQvMDMvMTQvYmlkZW4taG91c2luZy1hZmZvcmRhYmlsaXR5LTAwMTQ2ODM40gEA?oc=5&hl=en-U

article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiamh0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvY2FuYWRhcy1nZHAtb3V0cGVyZm9ybXMtamFuLWdyb3d0aC1mb3JlY2FzdC1saWtlbHktZ3Jldy0wNC1mZWItMjAyNC0wMy0yOC_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/canadas-gdp-outperforms-jan-growth-forecast-likely-grew-04-feb-2024-03-28/
article collected
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiVWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvYXNpYS9nbG9iYWwtbWFya2V0cy12aWV3LWFzaWEtZ3JhcGhpYy1waXgtMjAyNC0wNC0xNS_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/asia/global-markets-view-asia-graphic-pix-2024-04-15/
article collected
article collected
article collected
article collected
article collected
article collected
article c

article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiR2h0dHBzOi8vd3d3Lndzai5jb20vbGl2ZWNvdmVyYWdlL3N0b2NrLW1hcmtldC10b2RheS1lYXJuaW5ncy0wNC0xNy0yMDI00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/livecoverage/stock-market-today-earnings-04-17-2024
article collected
article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMidGh0dHBzOi8vd3d3Lm1hcmtldHdhdGNoLmNvbS9zdG9yeS90aGlzLWNlbnR1cnktb2xkLXN0b2NrLW1hcmtldC1pbmRpY2F0b3Itc3VnZ2VzdHMtc2VsbG9mZi1pcy1mYXItZnJvbS1vdmVyLWY1YzAwNzdm0gF4aHR0cHM6Ly93d3cubWFya2V0d2F0Y2guY29tL2FtcC9zdG9yeS90aGlzLWNlbnR1cnktb2xkLXN0b2NrLW1hcmtldC1pbmRpY2F0b3Itc3VnZ2VzdHMtc2VsbG9mZi1pcy1mYXItZnJvbS1vdmVyLWY1YzAwNzdm?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.marketwatch.com/story/this-century-old-stock-market-indicat

article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2xpdmVjb3ZlcmFnZS9zdG9jay1tYXJrZXQtdG9kYXktMDQxNTI00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.barrons.com/livecoverage/stock-market-today-041524
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiZWh0dHBzOi8vd3d3Lndzai5jb20vZmluYW5jZS9tYXJrZXQtcmVhY3Rpb24tdG8taXJhbi1hdHRhY2stdGVsbHMtdXMtc3RvY2tzLWFyZW50LWluLWEtYnViYmxlLWU1ZTBiNmZl0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/finance/market-reaction-to-iran-attack-tells-us-stocks-arent-in-a-bubble-e5e0b6fe
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMic2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvdXMvZnV0dXJlcy1ib3VuY2UtYmFjay1sYXN0LXNlc3Npb25zLWJhdHRlcmluZy1hbWlkLW1pZGRsZS1lYXN0LWppdHRlcnMtMjAyNC0wNC0xNS_SAQA?oc=5&hl=en-US&gl=US&ceid=US:e

article collected
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiXWh0dHBzOi8vdGhlaGlsbC5jb20vYnVzaW5lc3MvNDU5NzM5NS1zb21ldGhpbmctd2lsbC1oYXZlLXRvLWdpdmUtaW1mLXNvdW5kcy1hbGFybS1vbi11cy1kZWJ0L9IBYWh0dHBzOi8vdGhlaGlsbC5jb20vYnVzaW5lc3MvNDU5NzM5NS1zb21ldGhpbmctd2lsbC1oYXZlLXRvLWdpdmUtaW1mLXNvdW5kcy1hbGFybS1vbi11cy1kZWJ0L2FtcC8?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 429 Client Error: Unknown Error for url: https://thehill.com/business/4597395-something-will-have-to-give-imf-sounds-alarm-on-us-debt/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiTWh0dHBzOi8vd3d3LnZpc3VhbGNhcGl0YWxpc3QuY29tL3Utcy1kZWJ0LWludGVyZXN0LXBheW1lbnRzLXJlYWNoLTEtdHJpbGxpb24v0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.visualcapitalist.com/u-s-debt-interest-payments-reach-1-trillion/
article collected
article collected
Error occurred while fetching URL: https://news.google.com/rs

article collected
article collected
article collected
article collected
article collected
article collected


In [86]:
final_df

Unnamed: 0,title,description,published date,url,source,label,extracted_content,title_extracted_content
0,Fed's favorite inflation gauge and Big Tech ea...,Fed's favorite inflation gauge and Big Tech ea...,"Sun, 21 Apr 2024 11:42:05 GMT",https://news.google.com/rss/articles/CBMilgFod...,Yahoo Finance,inflation,The market rally is at its most fragile point ...,Feds favorite inflation gauge and Big Tech ear...
1,Trump wins voters on inflation as Biden zeroes...,Trump wins voters on inflation as Biden zeroes...,"Sun, 21 Apr 2024 19:33:40 GMT",https://news.google.com/rss/articles/CBMiVWh0d...,CNBC,inflation,More voters trust Donald Trump than President ...,Trump wins voters on inflation as Biden zeroes...
2,Inflation Mindset Taking Root in Japan Boosts ...,Inflation Mindset Taking Root in Japan Boosts ...,"Mon, 22 Apr 2024 00:30:00 GMT",https://news.google.com/rss/articles/CBMic2h0d...,Bloomberg,inflation,Higher inflationary expectations and price tol...,Inflation Mindset Taking Root in Japan Boosts ...
3,March's PCE Inflation Expected To Support Late...,March's PCE Inflation Expected To Support Late...,"Sun, 21 Apr 2024 23:14:42 GMT",https://news.google.com/rss/articles/CBMidWh0d...,Forbes,inflation,Jerome Powell chairman of the US Federal Reser...,Marchs PCE Inflation Expected To Support Later...
5,Average 30-year fixed mortgage rates continue ...,Average 30-year fixed mortgage rates continue ...,"Fri, 19 Apr 2024 19:40:40 GMT",https://news.google.com/rss/articles/CBMiaGh0d...,USA TODAY,inflation,Average year fixed mortgage rates in the US co...,Average year fixed mortgage rates continue to ...
...,...,...,...,...,...,...,...,...
595,Peterson Foundation Statement on National Debt...,Peterson Foundation Statement on National Debt...,"Tue, 02 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMicGh0d...,Peterson Foundation,national debt,Peterson Foundation Statement on National Debt...,Peterson Foundation Statement on National Debt...
596,The National Debt Hit A Record $34 Trillion: H...,The National Debt Hit A Record $34 Trillion: H...,"Wed, 03 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMibGh0d...,Investopedia,national debt,Key Takeaways The US national debt crossed the...,The National Debt Hit A Record Trillion How Mu...
597,Top 10 Reasons Why the National Debt Matters -...,Top 10 Reasons Why the National Debt Matters ...,"Mon, 12 Feb 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQWh0d...,Peterson Foundation,national debt,Top Reasons Why the National Debt Matters At t...,Top Reasons Why the National Debt Matters Pete...
598,Interest payments on the nation’s debt are soa...,Interest payments on the nation’s debt are soa...,"Thu, 16 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,CNN,national debt,CNN Americans arent the only ones feeling the ...,Interest payments on the nations debt are soar...


In [91]:
final_df['extracted_content'][2]

'Higher inflationary expectations and price tolerance are taking root in Japan a development that supports the central banks moves to normalize policy and raise interest rates further A recent survey by Tsutomu Watanabe a leading inflation expert in the nation found that Japanese consumers tolerance of price changes is holding up and is higher than levels seen among shoppers in some other major economies The survey showed for the thirdyear running that more than half of respondents would continue to buy a product at the same supermarket even if prices rose by'

In [64]:
from bs4 import BeautifulSoup

def extract_article_text(url):
    try:
        response = requests.get(url, timeout=30)  # Set timeout to 30 seconds
        response.raise_for_status()
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # find all <p> tags and extract their text
            paragraphs = soup.find_all('p')
            article_text = '\n'.join([p.get_text() for p in paragraphs])
            print("Article collected")
            return article_text
        else:
            print(f"Failed to fetch URL: {url}")
            return None
    except requests.Timeout:
        print(f"Timeout occurred while fetching URL: {url}")
        return None
    except requests.RequestException as e:
        print(f"Error occurred while fetching URL: {url}. Error: {e}")
        return None

# Check if 'extracted_content' column is null
if final_df['extracted_content'].isnull().any():
    # Apply the extraction function only to rows where 'extracted_content' is null
    final_df.loc[final_df['extracted_content'].isnull(), 'extracted_content'] =  final_df['url'].apply(extract_article_text)



Article collected
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiQWh0dHBzOi8vd3d3LmFpZXIub3JnL2FydGljbGUvd2h5LWhhdmVudC13ZS13aGlwcGVkLWluZmxhdGlvbi15ZXQv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.aier.org/article/why-havent-we-whipped-inflation-yet/
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FydGljbGVzL21hcmtldHMtYm91bmNlLWFzLW1pZGVhc3QtZmVhcnMtZWFzZS11cy1pbmZsYXRpb24taW4tdmlldy03MzhkZDRkYtIBY2h0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FtcC9uZXdzL21hcmtldHMtYm91bmNlLWFzLW1pZGVhc3QtZmVhcnMtZWFzZS11cy1pbmZsYXRpb24taW4tdmlldy03MzhkZDRkYg?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.barrons.com/articles/markets-bounce-as-mideast-fears-ease-us-inflation-in-view-738dd4db
Article collected
Article collected
Arti

Article collected
Article collected
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiQmh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjQvMDQvMTAvYnVzaW5lc3MvY3BpLWluZmxhdGlvbi1mZWQuaHRtbNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/04/10/business/cpi-inflation-fed.html
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiVWh0dHBzOi8vd3d3Lndzai5jb20vZmluYW5jZS9zdG9ja3MvZ2xvYmFsLXN0b2Nrcy1tYXJrZXRzLWRvdy1uZXdzLTA0LTEwLTIwMjQtYTY3MmE1YmPSAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/finance/stocks/global-stocks-markets-dow-news-04-10-2024-a672a5bc
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL3VrL3VrLXJlZ3VsYXItd2FnZXMtZ3Jvdy02MC0zLW1vbnRocy1mZWJydWFyeS0yMDI0LTA0LTE2L9IBAA?oc=

Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMia2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvdXMvdXMtam9iLWdyb3d0aC1iZWF0cy1leHBlY3RhdGlvbnMtZGVjZW1iZXItd2FnZXMtcmlzZS1zb2xpZGx5LTIwMjQtMDEtMDUv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/us/us-job-growth-beats-expectations-december-wages-rise-solidly-2024-01-05/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiUGh0dHBzOi8vd3d3LmJscy5nb3YvcmVnaW9ucy93ZXN0L25ld3MtcmVsZWFzZS9lbXBsb3ltZW50Y29zdGluZGV4X2xvc2FuZ2VsZXMuaHRt0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.bls.gov/regions/west/news-release/employmentcostindex_losangeles.htm
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMieGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvdXMvdXMtam9iLWdyb3d0aC1iZWF0cy1leHBlY3RhdGlvbnMtc2VwdGVtYmVyLXVuZW1wbG95bWVudC1yYXRl

Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LnBheWNvci5jb20vcmVzb3VyY2UtY2VudGVyL2FydGljbGVzL21pbmltdW0td2FnZS10aXBwZWQtZW1wbG95ZWVzLWJ5LXN0YXRlL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.paycor.com/resource-center/articles/minimum-wage-tipped-employees-by-state/
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMif2h0dHBzOi8vdGhlY29udmVyc2F0aW9uLmNvbS93aHktZG9udC1hdXN0cmFsaWFucy10YWxrLWFib3V0LXRoZWlyLXNhbGFyaWVzLXBheS10cmFuc3BhcmVuY3ktYW5kLWZhaXJuZXNzLWdvLWhhbmQtaW4taGFuZC0yMjQwNjfSAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://theconversation.com/why-dont-australians-talk-about-their-salaries-pay-transparency-and-fairness-go-hand-in-hand-224067
Article collected
Article collected
Timeout occurred while fetching URL: https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd

Article collected
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiX2h0dHBzOi8vd3d3LmZhc3Rjb21wYW55LmNvbS85MTEwODUzNS9ob3VzaW5nLW1hcmtldC1jb25zdHJ1Y3Rpb24tY29zdHMtc2luZ2xlLWZhbWlseS1ob21lLWNoYXJ00gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.fastcompany.com/91108535/housing-market-construction-costs-single-family-home-chart
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMicmh0dHBzOi8va3RsYS5jb20vbmV3cy9sb2NhbC1uZXdzL2wtYS1zLWhvdXNpbmctY29zdHMtaGFzLWhlbHBlZC1sb3dlci1yZXNpZGVudHMtcXVhbGl0eS1vZi1saWZlLXVjbGEtc3VydmV5LWZpbmRzL9IBdmh0dHBzOi8va3RsYS5jb20vbmV3cy9sb2NhbC1uZXdzL2wtYS1zLWhvdXNpbmctY29zdHMtaGFzLWhlbHBlZC1sb3dlci1yZXNpZGVudHMtcXVhbGl0eS1vZi1saWZlLXVjbGEtc3VydmV5LWZpbmRzL2FtcC8?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 429 Client Error: Unknown Error for url: https://ktla.com/news/local-news/l-a-s-

Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vd3d3Lndzai5jb20vdXMtbmV3cy9lZHVjYXRpb24vY29sbGVnZS1ob3VzaW5nLWRvcm1zLWNvc3QtdHVpdGlvbi05ZDk4YzFhNNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.wsj.com/us-news/education/college-housing-dorms-cost-tuition-9d98c1a4
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiUGh0dHBzOi8vd3d3LnZpc3VhbGNhcGl0YWxpc3QuY29tL2hvdy1kZWJ0LXRvLWdkcC1yYXRpb3MtaGF2ZS1jaGFuZ2VkLXNpbmNlLTIwMDAv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.visualcapitalist.com/how-debt-to-gdp-ratios-have-changed-since-2000/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiPWh0dHBzOi8vcGh5cy5vcmcvbmV3cy8yMDI0LTA0LWNsaW1hdGUtaW1wYWN0cy1nbG9iYWwtZ2RwLmh0bWzSATxodHRwczovL3BoeXMub3JnL25ld3MvMjAyNC0wNC1jbGltYXRlLWltcGFjdHMtZ2xvYmFsLWdkcC5hbXA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 400 

Article collected
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiYmh0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtldHMvdXMvbW9kZXJhdGUtdXMtZWNvbm9taWMtZ3Jvd3RoLWV4cGVjdGVkLWZvdXJ0aC1xdWFydGVyLTIwMjQtMDEtMjUv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/us/moderate-us-economic-growth-expected-fourth-quarter-2024-01-25/
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiZWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL3VzL3VzLWZvdXJ0aC1xdWFydGVyLWVjb25vbWljLWdyb3d0aC1yZXZpc2VkLXNsaWdodGx5LWxvd2VyLTIwMjQtMDItMjgv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/us/us-fourth-quarter-economic-growth-revised-slightly-lower-2024-02-28/
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiXWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL21hcmtld

Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiRWh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FydGljbGVzL3N0b2NrLW1hcmtldC12aXgtZmVhci1nYXVnZS1mNzYxYmVlYdIBSWh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FtcC9hcnRpY2xlcy9zdG9jay1tYXJrZXQtdml4LWZlYXItZ2F1Z2UtZjc2MWJlZWE?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.barrons.com/articles/stock-market-vix-fear-gauge-f761beea
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiaGh0dHBzOi8vZmluYW5jZS55YWhvby5jb20vbmV3cy9saXZlL3N0b2NrLW1hcmtldC10b2RheS1zcC01MDAtc2xpZGVzLWZvci01dGgtc3RyYWlnaHQtZGF5LTIwMDE1ODA0OS5odG1s0gEA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 404 Client Error: Not Found for url: https://finance.yahoo.com/news/live/stock-market-today-sp-500-slides-for-5th-straight-day-200158049.html
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiSmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FydGljbGVzL3N0b2NrLW1hcmtldC10b2RheS1kb3duLWVhcm5pbmdzLWUzZWI0MjZi0gFO

Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMic2h0dHBzOi8vd3d3LmludmVzdG9ycy5jb20vbWFya2V0LXRyZW5kL3N0b2NrLW1hcmtldC10b2RheS9kb3ctam9uZXMtZnV0dXJlcy1wb3dlci10cmVuZC1lbmRpbmctbmV0ZmxpeC1zdWJzY3JpYmVycy_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.investors.com/market-trend/stock-market-today/dow-jones-futures-power-trend-ending-netflix-subscribers/
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiZGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2V1cm9wZS9mbGFtZXMtc3RpbGwtZmxhcmUtZGVubWFya3MtaGlzdG9yaWMtc3RvY2stZXhjaGFuZ2UtMjAyNC0wNC0xOC_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/europe/flames-still-flare-denmarks-historic-stock-exchange-2024-04-18/
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/art

Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Timeout occurred while fetching URL: https://news.google.com/rss/articles/CBMiWmh0dHBzOi8vd3d3LnN0YXItdGVsZWdyYW0uY29tL29waW5pb24vb3BuLWNvbHVtbnMtYmxvZ3Mvb3RoZXItdm9pY2VzL2FydGljbGUyODc3NDcyNzUuaHRtbNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Article collected
Error occurred while fetching URL: https://news.google.com/rss/articles/CBMiRWh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjQvMDMvMjAvdXMvcG9saXRpY3MvZGVidC10YXhlcy1idWRnZXQuaHRtbNIBAA?oc=5&hl=en-US&gl=US&ceid=US:en. Error: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/03/20/us/politics/debt-taxes-budget.html
Article collected
Article col

In [71]:
final_df[final_df['extracted_content'].isnull() == True]

Unnamed: 0,title,description,published date,url,source,label,extracted_content
4,Why Haven't We Whipped inflation Yet? - AIER -...,Why Haven't We Whipped inflation Yet? AIER - ...,"Fri, 19 Apr 2024 10:55:14 GMT",https://news.google.com/rss/articles/CBMiQWh0d...,AIER - Daily Economy News,inflation,
8,"Markets Bounce As MidEast Fears Ease, US Infla...","Markets Bounce As MidEast Fears Ease, US Infla...","Mon, 22 Apr 2024 02:45:55 GMT",https://news.google.com/rss/articles/CBMiY2h0d...,Barron's,inflation,
14,"Inflation, Interest Rates and Oil Prices Have ...","Inflation, Interest Rates and Oil Prices Have ...","Fri, 19 Apr 2024 13:00:13 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,The New York Times,inflation,
16,S&P 500 heads for worst month since 2022 as bo...,S&P 500 heads for worst month since 2022 as bo...,"Sun, 21 Apr 2024 16:01:00 GMT",https://news.google.com/rss/articles/CBMiemh0d...,MarketWatch,inflation,
18,Trading volatility: The US dollar is one to wa...,Trading volatility: The US dollar is one to wa...,"Sun, 21 Apr 2024 12:23:00 GMT",https://news.google.com/rss/articles/CBMiVGh0d...,Forex Factory,inflation,
...,...,...,...,...,...,...,...
574,The cost of interest on U.S. debt is soaring -...,The cost of interest on U.S. debt is soaring ...,"Thu, 08 Feb 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiRmh0d...,Axios,national debt,
578,How Argentina's debt crisis changed the sovere...,How Argentina's debt crisis changed the sovere...,"Thu, 11 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiT2h0d...,Axios,national debt,
579,Record-High Interest Payments Could Make the N...,Record-High Interest Payments Could Make the N...,"Tue, 20 Feb 2024 10:34:33 GMT",https://news.google.com/rss/articles/CBMilAFod...,The Wall Street Journal,national debt,
584,Budget watchdog warns US could suffer market s...,Budget watchdog warns US could suffer market s...,"Wed, 27 Mar 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMicmh0d...,The Hill,national debt,


In [None]:
# make a histogram of the

In [77]:
# preprocessing 
import re

# get rid of null rows 
final_df = final_df.dropna(subset=['extracted_content'])

# join together title and extracted_content to make text column for LLM
final_df['title_extracted_content'] = final_df['title'].str.cat(final_df['extracted_content'], sep=' ')

# Filter out punctuation and newline characters
final_df['title_extracted_content'] = final_df['title_extracted_content'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x) if pd.notnull(x) else x)
final_df['title_extracted_content'] = final_df['title_extracted_content'].apply(lambda x: re.sub(r'\s+', ' ', x).strip() if pd.notnull(x) else x)

In [78]:
final_df['title_extracted_content'][1]

'Trump wins voters on inflation as Biden zeroes in on tariffs jobs NBC News poll CNBC More voters trust Donald Trump than President Joe Biden to deal with inflation and the cost of living their top concerns for the US according to the latest NBC News poll The poll of registered voters nationwide found that of respondents said Trump would better handle inflation and the cost of living while said the same of Biden The survey was taken from April to several days after the release of another hotterthanexpected inflation report indicating consumer prices gradually ticking back up Trump attacked Bidens economic policies immediately following the release of the data As consumer prices heat up again the Biden administration has kept its message on inflation the same and turned more of its attention to other aspects of the economy jobs tariffs and taxes Bidens heavy focus on those issues was evident as he made the rounds in the key battleground state of Pennsylvania last week During a Wednesday

In [80]:
final_df.to_csv('data/labeled_articles.csv', index=False)