## start from scratch

**News Data**

In [1]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Lift pandas' display limits: show full cell text and every row of a frame.
for display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(display_option, None)

In [2]:
def get_news_data(API_KEY: str, from_date: str, to_date: str):
    """Collect Guardian business-section headlines, one API request per day.

    Parameters
    ----------
    API_KEY : str
        Key sent to the Guardian content API as the ``api-key`` query parameter.
    from_date, to_date : str
        Inclusive bounds of the date range, formatted ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` and ``URL``, indexed by ``Publication Date`` (the
        ``webPublicationDate`` string of each result). When a request fails or
        the run is interrupted part-way through, the articles gathered up to
        that point are returned. An empty date range or a range without
        results yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If ``from_date`` or ``to_date`` does not match ``YYYY-MM-DD``.

    Notes
    -----
    Only the first page (at most 100 results) is read for each day.
    """
    BASE_URL = "https://content.guardianapis.com/search"
    REQUEST_TIMEOUT_SECONDS = 30
    OUTPUT_COLUMNS = ["Title", "URL", "Publication Date"]

    # Parsed outside the try block: a malformed date is a caller error and must
    # surface as ValueError rather than be mistaken for an API failure.
    start = datetime.strptime(from_date, "%Y-%m-%d").date()
    end = datetime.strptime(to_date, "%Y-%m-%d").date()
    dates_generated = [start + timedelta(days=x) for x in range((end - start).days + 1)]

    articles = []
    current_day = None
    try:
        for day in tqdm(dates_generated):
            current_day = day.strftime("%Y-%m-%d")
            params = {
                "api-key": API_KEY,
                "from-date": current_day,
                "to-date": current_day,
                "section": "business",
                "show-fields": "headline,body",
                "order-by": "newest",
                "page-size": 100,
            }

            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
            # Turn HTTP errors (rate limit, bad key, ...) into exceptions instead
            # of trying to parse an error payload as search results.
            response.raise_for_status()
            data = response.json()

            for article in data["response"]["results"]:
                articles.append({
                    "Title": article["webTitle"],
                    "URL": article["webUrl"],
                    "Publication Date": article["webPublicationDate"],
                })
    except (requests.RequestException, KeyError, ValueError, KeyboardInterrupt) as exc:
        # Best effort: a long collection run should not lose everything because
        # of one failed day or a manual interrupt, so keep what was gathered.
        print(
            f"Collection stopped early at {current_day} ({exc!r}); "
            f"returning the {len(articles)} articles collected so far."
        )

    # Passing the columns explicitly keeps set_index working when no article
    # was collected (an empty frame would otherwise have no such column).
    df = pd.DataFrame(articles, columns=OUTPUT_COLUMNS)
    return df.set_index("Publication Date")

# TODO: switch to real-time collection — add each day's results to the stored data directly inside the loop.

In [4]:
# The Guardian API key is read from the GUARDIAN_API_KEY environment variable so
# that it never ends up in the notebook or its version history. Any key that was
# ever stored in plain text should be rotated.
news = get_news_data(os.environ["GUARDIAN_API_KEY"], "2023-04-07", "2025-02-27")

100%|██████████| 693/693 [16:59<00:00,  1.47s/it]  


In [5]:
# Peek at the first five collected articles.
news.head(n=5)

Unnamed: 0_level_0,Title,URL
Publication Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-07T16:17:24Z,"Cheap flights, Brexit, now Dover chaos – is this the end of the road for continental coach tours?",https://www.theguardian.com/business/2023/apr/07/dover-chaos-adds-to-strain-on-uks-continental-coach-operators
2023-04-07T13:43:11Z,Jes Staley’s lawyers hit out at ‘slanderous’ attacks by JP Morgan,https://www.theguardian.com/business/2023/apr/07/jes-staley-lawyers-hit-out-at-slanderous-attacks-by-jp-morgan
2023-04-07T13:38:36Z,"US adds 236,000 jobs in March as labor market weakens",https://www.theguardian.com/business/2023/apr/07/us-jobs-report-march-2023
2023-04-07T11:00:02Z,‘I could barely speak. I felt like a ghost inside my own skin’: the month that shook the CBI,https://www.theguardian.com/business/2023/apr/07/i-could-barely-speak-i-felt-like-a-ghost-inside-my-own-skin-the-month-that-shook-the-cbi
2023-04-07T08:00:47Z,Workers protest Energizer’s plans to close Wisconsin plants,https://www.theguardian.com/business/2023/apr/07/energizer-wisconsin-factory-closure-job-loss


In [6]:
# Number of collected articles (rows in the frame).
news.shape[0]

6627

In [7]:
# Append this run's articles to the on-disk archive. The header row is written
# only when the archive is being created (missing or empty file): a fresh file
# then stays readable by pd.read_csv, and repeated runs never insert a second
# header line in the middle of the data.
archive_is_new = (
    not os.path.exists("news_data.csv") or os.path.getsize("news_data.csv") == 0
)
news.to_csv("news_data.csv", mode="a", header=archive_is_new, index=True)

**Process Data** (remove duplicates)

In [9]:
import pandas as pd

# remove duplicates from the data
news_df = pd.read_csv("news_data.csv")

# A serial "Unnamed: N" column (left behind by an earlier export that wrote the
# index) is different on every row, so comparing it would mean no two rows are
# ever considered duplicates. Compare the article columns only.
article_columns = [col for col in news_df.columns if not str(col).startswith("Unnamed")]
news_df = news_df.drop_duplicates(subset=article_columns or None)
news_df.head()

Unnamed: 0,Publication Date,Title,URL
0,2015-03-02T21:15:16Z,Medium-sized UK firms add more to economy than German peers – report,https://www.theguardian.com/business/2015/mar/02/medium-sized-uk-firms-add-more-economy-than-german-peers-report
1,2015-03-02T21:00:48Z,Wells Fargo caps subprime car loans to 'responsibly' manage risk,https://www.theguardian.com/business/2015/mar/02/wells-fargo-caps-subprime-car-loans
2,2015-03-02T20:14:51Z,Welcome drop in eurozone deflation and unemployment figures,https://www.theguardian.com/business/2015/mar/02/drop-eurozone-deflation-unemployment
3,2015-03-02T19:52:24Z,The IoD is right to call for more clarity on fund manager pay,https://www.theguardian.com/business/2015/mar/02/iod-right-call-clarity-fund-manager-pay
4,2015-03-02T19:28:46Z,First Virgin Trains East Coast service leaves London,https://www.theguardian.com/business/2015/mar/02/repainted-and-rebranded-virgin-trains-east-coast-service-leaves-london


In [10]:
# Overwrite the archive with the de-duplicated rows. The header is kept so the
# next pd.read_csv call recovers the column names instead of consuming the first
# article as a header row, and the positional index is not written because it
# would add one more unnamed column on every save/load round trip.
news_df.to_csv("news_data.csv", mode="w", header=True, index=False)

In [11]:
# Inspect the last five rows of the processed data.
news_df.tail(n=5)

Unnamed: 0,Publication Date,Title,URL
39989,2025-02-27T14:00:50Z,The Virgin-Qatar deal is welcome but it’s not the magic bullet Australian aviation needs,https://www.theguardian.com/australia-news/2025/feb/28/qatar-virgin-airways-australia-deal-jim-chalmers
39990,2025-02-27T13:33:18Z,Drax power plant to cut carbon capture investment despite record £1bn profit,https://www.theguardian.com/business/2025/feb/27/drax-power-plant-owner-reports-highest-earnings-since-pandemic
39991,2025-02-27T11:42:59Z,Ocado to cut 500 technology and finance jobs as AI reduces costs,https://www.theguardian.com/business/2025/feb/27/ocado-to-cut-500-technology-and-finance-jobs-as-ai-reduces-costs
39992,2025-02-27T10:00:25Z,Rolls-Royce brings back dividend and announces £1bn share buyback,https://www.theguardian.com/business/2025/feb/27/rolls-royce-dividend-share-buyback-british-jet-engine-maker-profit
39993,2025-02-27T01:02:17Z,Qantas posts $1.39bn profit as holidaymakers flock to Jetstar,https://www.theguardian.com/business/2025/feb/27/qantas-1-39bn-profit-half-year-results-jetstar


**News Sentiment**

In [1]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def calculate_compound_scores(probabilities):
    """Collapse class probabilities into one signed sentiment value.

    ``probabilities`` is indexed with the keys ``'positive'`` and
    ``'negative'``; the result is the positive entry minus the negative entry.
    """
    positive_share = probabilities['positive']
    negative_share = probabilities['negative']
    return positive_share - negative_share

def get_news_sentiment(df, batch_size: int = 32):
    """Score every headline in ``df["Title"]`` with the ProsusAI/finbert model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Title`` column holding the headlines to score.
    batch_size : int, default 32
        Number of headlines tokenized and passed through the model at once.
        Processing in batches keeps memory use bounded by the batch instead of
        by the whole data set.

    Returns
    -------
    list of float
        One compound score per row, in row order: the ``positive`` probability
        minus the ``negative`` probability. An empty frame yields an empty list
        without loading the model.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    titles = list(df["Title"].values)
    if not titles:
        # Nothing to score; skip loading the tokenizer and the model.
        return []

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    logit_batches = []
    with torch.no_grad():
        for start in tqdm(range(0, len(titles), batch_size), desc="Scoring headlines"):
            batch_titles = titles[start:start + batch_size]
            inputs = tokenizer(batch_titles, return_tensors="pt", padding=True, truncation=True)
            logit_batches.append(model(**inputs).logits)
    logits = torch.cat(logit_batches, dim=0)
    print("[Logits generated...]")
    print()

    probabilities = torch.softmax(logits, dim=1).numpy()
    # Order the label names by class id so that column j of `probabilities`
    # is paired with the label of class j regardless of dict ordering.
    id2label = model.config.id2label
    labels = [label for _, label in sorted(id2label.items(), key=lambda item: int(item[0]))]
    print("[Labels are listed...]")
    print()

    sentiment_scores = [dict(zip(labels, row)) for row in probabilities]
    print("[Sentiment scores are given...]")
    print()

    return [float(calculate_compound_scores(scores)) for scores in sentiment_scores]

In [3]:
# Load the saved headlines from disk.
news = pd.read_csv(filepath_or_buffer="news_data.csv")

In [4]:
# Show the first five rows of the loaded headlines.
news.head(n=5)

Unnamed: 0,Publish Date,Title,Source
0,2015-03-02T21:15:16Z,Medium-sized UK firms add more to economy than...,https://www.theguardian.com/business/2015/mar/...
1,2015-03-02T21:00:48Z,Wells Fargo caps subprime car loans to 'respon...,https://www.theguardian.com/business/2015/mar/...
2,2015-03-02T20:14:51Z,Welcome drop in eurozone deflation and unemplo...,https://www.theguardian.com/business/2015/mar/...
3,2015-03-02T19:52:24Z,The IoD is right to call for more clarity on f...,https://www.theguardian.com/business/2015/mar/...
4,2015-03-02T19:28:46Z,First Virgin Trains East Coast service leaves ...,https://www.theguardian.com/business/2015/mar/...


In [None]:
# Compute one compound sentiment score per headline.
scores = get_news_sentiment(df=news)

In [None]:
# Inspect the first ten scores.
scores[0:10]

In [15]:
# Store the sentiment scores in a new "Sentiment Score" column and display the
# frame. NOTE(review): this presumes `scores` holds one value per row of `news`,
# in row order — confirm it was computed from this same frame.
news["Sentiment Score"] = scores
news

Unnamed: 0_level_0,Title,URL,Sentiment Score
Publication Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-03-02T15:23:46Z,Chinese manufacturing returns to growth despite threat of higher Trump tariffs,https://www.theguardian.com/business/2025/mar/02/chinese-manufacturing-surges-despite-threat-of-higher-trump-tariffs,0.925995
2025-03-02T15:17:25Z,Pay soars at Barclays and HSBC after end of UK banker bonus cap,https://www.theguardian.com/business/2025/mar/02/pay-soars-at-barclays-and-hsbc-after-end-of-uk-banker-bonus-cap,-0.688042
2025-03-02T13:00:10Z,US Postal Service faces murky future as Trump mulls dismantling institution,https://www.theguardian.com/business/2025/mar/02/usps-trump-postal-service-cuts,-0.673867
2025-03-02T13:00:08Z,Read the signs of Trump’s federal firings: AI is coming for private sector jobs too,https://www.theguardian.com/business/2025/mar/02/ai-layoffs-trump-irs,-0.91236
2025-03-02T11:11:33Z,The world has changed – it's time for a radical Labour rethink on the economy | Heather Stewart,https://www.theguardian.com/business/2025/mar/02/the-world-has-changed-its-time-for-a-radical-labour-rethink-on-the-economy,0.063119
2025-03-02T07:00:03Z,"In renouncing aid and Europe, Starmer is sucking up to Trump | William Keegan",https://www.theguardian.com/business/2025/mar/02/in-renouncing-aid-and-europe-starmer-is-sucking-up-to-trump,0.597072
2025-03-03T22:00:27Z,"Soaring UK crime costing up to £250bn a year, says thinktank",https://www.theguardian.com/business/2025/mar/03/soaring-uk-crime-cost-up-policy-exchange-policing-prisons,0.529917
2025-03-03T18:33:31Z,"Jes Staley received images of ‘mature women’ from Jeffrey Epstein, court hears",https://www.theguardian.com/business/2025/mar/03/ex-barclays-ceo-jes-staley-tells-court-bank-well-aware-of-his-jeffrey-epstein-links,0.044642
2025-03-03T17:22:58Z,"UK rail passengers may lose patience as problems take years to fix, says minister",https://www.theguardian.com/business/2025/mar/03/ukrail-passengers-problems-lord-hendy-northern,-0.930677
2025-03-03T15:38:09Z,"Markets climb as defence stocks, euro and pound rally on Europe’s Ukraine peace push – as it happened",https://www.theguardian.com/business/live/2025/mar/03/bitcoin-jumps-crypto-reserve-plan-euro-rises-europe-ukraine-peace-push-business-live,0.234449
