### The collected sentiment data is incomplete (some dates remain empty), so we need to find more data in order to complete the dataset

In [51]:
from gnews import GNews
import pandas as pd
import concurrent.futures

In [52]:
# Load the headline/sentiment table from the working directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# a row index saved into the CSV — consider index_col=0 after confirming that
# nothing else relies on that column.
data_sentiment = pd.read_csv('sentiment_gpt4_only.csv')

In [11]:
# Preview the loaded table (Date, Title and Sentiment columns).
data_sentiment

Unnamed: 0,Date,Title,Sentiment
0,2010-01-04 00:00:00,Global Stocks and Commodities Rally on First T...,positive
1,2010-01-04 00:00:00,Dollar Slumps Amid Worldwide Manufacturing Imp...,negative
2,2010-01-04 00:00:00,Oil Prices Surge Above $81 a Barrel Due to U.S...,negative
3,2010-01-04 00:00:00,"S&P 500 Sees 1.6 Percent Increase, Hits 15-Mon...",positive
4,2010-01-04 00:00:00,"Argentina's Merval Index Reaches Record High, ...",positive
...,...,...,...
8134,2011-12-30 00:00:00,Earnings Season Begins with JPMorgan and Citi...,indecisive
8135,2011-12-30 00:00:00,June Inflation Report Expected to Show Price ...,indecisive
8136,2011-12-30 00:00:00,Chinese Trade Data Expected to Show Slower Ex...,negative
8137,2011-12-30 00:00:00,S&P 500 Exchange-Traded Fund Slips 0.5 Percen...,negative


In [8]:
# Shared Google News client: language 'en', country 'US', at most 5 results per query.
google_news = GNews(language='en',country='US',max_results=5)
# Date window applied to the searches, presumably given as (year, month, day) tuples.
google_news.start_date = (2000,1, 4) 
google_news.end_date = (2011, 12, 30)

Collecting the data takes a long time, so the best way to reduce the running time is to fetch the news in parallel with a thread pool.

In [13]:
def fetch_related_news(title):
    """Query the shared ``google_news`` client for ``title``.

    Returns the first five entries of whatever ``get_news`` gives back
    (fewer if the client returned fewer).
    """
    return google_news.get_news(title)[:5]

# Function to parallelize fetching news
def parallel_fetch_news(titles, max_workers=None):
    """Fetch related news for every title concurrently.

    Parameters
    ----------
    titles : iterable
        Headlines to look up; each item is passed to ``fetch_related_news``.
    max_workers : int, optional
        Upper bound on the number of worker threads. ``None`` (the default)
        keeps the ``ThreadPoolExecutor`` default, i.e. the previous behaviour.
        Pass a small number to throttle the requests when the remote end
        starts dropping connections.

    Returns
    -------
    list
        One result per title, in the same order as ``titles``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order; list() drains it before
        # the pool is shut down at the end of the ``with`` block.
        return list(executor.map(fetch_related_news, titles))
 
# Look up related articles for every headline (in parallel) and keep them
# next to the headline in a new 'related_news' column.
headline_titles = data_sentiment['Title']
data_sentiment['related_news'] = parallel_fetch_news(headline_titles)

In [36]:
# Inspect the related articles fetched for the first headline (row label 0).
data_sentiment.loc[0, 'related_news']

[{'title': 'CNNMoney.com Market Report - Feb. 4, 2010 - CNN',
  'description': 'CNNMoney.com Market Report - Feb. 4, 2010  CNN',
  'published date': 'Thu, 04 Feb 2010 08:00:00 GMT',
  'url': 'https://news.google.com/rss/articles/CBMiQmh0dHBzOi8vbW9uZXkuY25uLmNvbS8yMDEwLzAyLzA0L21hcmtldHMvbWFya2V0c19uZXd5b3JrL2luZGV4Lmh0bdIBAA?oc=5&hl=en-US&gl=US&ceid=US:en',
  'publisher': {'href': 'https://money.cnn.com', 'title': 'CNN'}},
 {'title': 'Debt crisis sends financial markets into turmoil – Monday 8 August 2011 - The Guardian',
  'description': 'Debt crisis sends financial markets into turmoil – Monday 8 August 2011  The Guardian',
  'published date': 'Mon, 08 Aug 2011 07:00:00 GMT',
  'url': 'https://news.google.com/rss/articles/CBMiYmh0dHBzOi8vd3d3LnRoZWd1YXJkaWFuLmNvbS9idXNpbmVzcy8yMDExL2F1Zy8wOC9zdG9jay1tYXJrZXQtdHVybW9pbC11cy1kb3duZ3JhZGUtZXVyb3pvbmUtY3Jpc2lz0gEA?oc=5&hl=en-US&gl=US&ceid=US:en',
  'publisher': {'href': 'https://www.theguardian.com',
   'title': 'The Guardian'}},
 {'tit

In [44]:
# Keep only the first related article of each headline; headlines whose
# lookup came back empty are skipped.
first_hits = [hits[0] for hits in data_sentiment['related_news'] if hits]
list_title = [hit['title'] for hit in first_hits]
list_date = [hit['published date'] for hit in first_hits]

In [46]:
# Two-column table of the extra headlines: publication date and title.
new_data = pd.DataFrame(
    data={'Date': list_date, 'Title': list_title},
    columns=['Date', 'Title'],
)

In [48]:
# Parse the 'published date' strings and keep only the calendar date.
new_data['Date'] = pd.to_datetime(new_data['Date']).dt.date

In [50]:
# Persist the extra headlines; index=False keeps the row index out of the file.
new_data.to_csv('new_data.csv',index=False)

There are still many empty cells, so we reduce the time interval (querying one year at a time) in order to increase the probability of having data at the specific dates

In [54]:
# Fresh client for the per-year queries (same settings as the earlier one).
google_news = GNews(language='en',country='US',max_results=5)


def fetch_related_news(title):
    """Return at most five results for ``title`` from the shared client.

    ``google_news`` is looked up when the function is called, so the date
    window assigned to the client inside the loop below is the one in use.
    """
    news = google_news.get_news(title)
    return news[:5]


# Function to parallelize fetching news
def parallel_fetch_news(titles, max_workers=None):
    """Run ``fetch_related_news`` over ``titles`` in a thread pool.

    ``max_workers`` caps the number of threads; ``None`` keeps the
    ``ThreadPoolExecutor`` default. Results are returned as a list in the
    same order as ``titles``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(fetch_related_news, titles))
    return results


# The helpers above are defined once, outside the loop: they do not depend on
# ``year`` and were previously re-created on every iteration.
# list_years[i] holds the results for year 2000 + i (one entry per headline).
list_years = []
for year in range(2000, 2010):
    # Restrict the search to one calendar year. The window ends on Dec 31 so
    # the last days of the year are not left outside every yearly window.
    google_news.start_date = (year, 1, 1)
    google_news.end_date = (year, 12, 31)
    # Fetch related news in parallel
    list_years.append(parallel_fetch_news(data_sentiment['Title']))

06/06/2024 08:25:11 PM - Remote end closed connection without response
06/06/2024 08:25:22 PM - Remote end closed connection without response
06/06/2024 08:25:22 PM - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
06/06/2024 08:30:22 PM - Remote end closed connection without response
06/06/2024 08:31:31 PM - HTTPSConnectionPool(host='news.google.com', port=443): Max retries exceeded with url: /rss/articles/CBMiZ2h0dHBzOi8vd3d3Lm5kdHZwcm9maXQuY29tL2FtcC9wb2xpdGljcy9zcGFpbi1zLWRlLWd1aW5kb3Mtc2F5cy1jYXRhbG9uaWEtcy1pbmRlcGVuZGVuY2UtcHVzaC1pcy1kb29tZWTSAWdodHRwczovL3d3dy5uZHR2cHJvZml0LmNvbS9hbXAvcG9saXRpY3Mvc3BhaW4tcy1kZS1ndWluZG9zLXNheXMtY2F0YWxvbmlhLXMtaW5kZXBlbmRlbmNlLXB1c2gtaXMtZG9vbWVk?oc=5 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:997)')))
06/06/2024 08:31:32 PM - Remote end closed connection without response
06/06/2024 08:31:32 PM - ('Connection aborted.', RemoteDisconnected('Remote end closed conne