## start from scratch

**News Data**

In [None]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

In [None]:
def get_news_data(API_KEY: str, from_date: str, to_date: str) -> pd.DataFrame:
    """Fetch Guardian business-section article metadata for a date range.

    The Guardian content search endpoint is queried one calendar day at a
    time (following result pages when a day has more than 100 articles), and
    the title, URL and publication timestamp of every hit are collected.

    Collection is best-effort: if a request fails (network error, HTTP error
    such as a rate-limit response, or an unexpected payload) the loop stops
    and the articles gathered so far are returned.

    Args:
        API_KEY: Guardian Open Platform API key.
        from_date: First day to fetch, formatted ``YYYY-MM-DD`` (inclusive).
        to_date: Last day to fetch, formatted ``YYYY-MM-DD`` (inclusive).

    Returns:
        DataFrame indexed by "Publication Date" with "Title" and "URL"
        columns. Empty (but with the same schema) when nothing was fetched,
        including when ``from_date`` is later than ``to_date``.

    Raises:
        ValueError: If either date string is not in ``YYYY-MM-DD`` format.
    """
    BASE_URL = "https://content.guardianapis.com/search"
    COLUMNS = ["Title", "URL", "Publication Date"]

    def to_frame(rows):
        # Passing `columns` keeps the schema stable even when `rows` is empty,
        # so set_index never fails on a missing "Publication Date" column.
        return pd.DataFrame(rows, columns=COLUMNS).set_index("Publication Date")

    # Parse up front so malformed dates surface as a clear ValueError instead
    # of being swallowed by the best-effort handling below.
    start = datetime.strptime(from_date, "%Y-%m-%d").date()
    end = datetime.strptime(to_date, "%Y-%m-%d").date()
    dates_generated = [start + timedelta(days=x) for x in range((end - start).days + 1)]

    articles = []
    if not dates_generated:
        return to_frame(articles)

    for day in tqdm(dates_generated):
        day_str = day.strftime("%Y-%m-%d")
        page, total_pages = 1, 1
        try:
            while page <= total_pages:
                # Only webTitle/webUrl/webPublicationDate are used, so no
                # extra fields (e.g. full article bodies) are requested.
                params = {
                    "api-key": API_KEY,
                    "from-date": day_str,
                    "to-date": day_str,
                    "section": "business",
                    "order-by": "newest",
                    "page-size": 100,
                    "page": page,
                }
                response = requests.get(BASE_URL, params=params, timeout=30)
                response.raise_for_status()
                payload = response.json()["response"]

                for article in payload["results"]:
                    articles.append({
                        "Title": article["webTitle"],
                        "URL": article["webUrl"],
                        "Publication Date": article["webPublicationDate"],
                    })

                total_pages = payload.get("pages", 1)
                page += 1
        except (requests.RequestException, KeyError, ValueError) as exc:
            # Typically the daily request quota; keep what was collected.
            print(
                f"Stopped at {day_str} ({exc!r}); "
                f"returning {len(articles)} articles collected so far."
            )
            break

    return to_frame(articles)

  # CHANGES: change data collection to real time collection, adding directly in the loop itself

In [None]:
import os

# Read the Guardian API key from the environment instead of hard-coding it:
# anything typed into a notebook cell is shared with everyone who receives the
# file. Set GUARDIAN_API_KEY before running; keys that were previously
# committed here should be rotated.
news = get_news_data(os.environ["GUARDIAN_API_KEY"], "2023-04-07", "2025-02-27")

100%|██████████| 693/693 [16:59<00:00,  1.47s/it]  


In [None]:
# Preview the first rows returned by get_news_data.
news.head()

Unnamed: 0_level_0,Title,URL
Publication Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-07T16:17:24Z,"Cheap flights, Brexit, now Dover chaos – is this the end of the road for continental coach tours?",https://www.theguardian.com/business/2023/apr/07/dover-chaos-adds-to-strain-on-uks-continental-coach-operators
2023-04-07T13:43:11Z,Jes Staley’s lawyers hit out at ‘slanderous’ attacks by JP Morgan,https://www.theguardian.com/business/2023/apr/07/jes-staley-lawyers-hit-out-at-slanderous-attacks-by-jp-morgan
2023-04-07T13:38:36Z,"US adds 236,000 jobs in March as labor market weakens",https://www.theguardian.com/business/2023/apr/07/us-jobs-report-march-2023
2023-04-07T11:00:02Z,‘I could barely speak. I felt like a ghost inside my own skin’: the month that shook the CBI,https://www.theguardian.com/business/2023/apr/07/i-could-barely-speak-i-felt-like-a-ghost-inside-my-own-skin-the-month-that-shook-the-cbi
2023-04-07T08:00:47Z,Workers protest Energizer’s plans to close Wisconsin plants,https://www.theguardian.com/business/2023/apr/07/energizer-wisconsin-factory-closure-job-loss


In [None]:
# Number of articles collected in this fetch.
len(news)

6627

In [None]:
import os

# Append this fetch to the cumulative CSV. The header row is written only when
# the file is being created; otherwise a fresh file would be headerless and the
# next pd.read_csv would consume the first article as column names.
news.to_csv(
    "news_data.csv",
    mode="a",
    header=not os.path.exists("news_data.csv"),
    index=True,
)

**Process Data** (remove duplicates)

In [None]:
import pandas as pd

# Reload the accumulated CSV and discard rows that are exact repeats of an
# earlier row (the first occurrence is kept, original row labels preserved).
news_df = pd.read_csv("news_data.csv").drop_duplicates()
news_df.head()

Unnamed: 0,Publication Date,Title,URL
0,2015-03-02T21:15:16Z,Medium-sized UK firms add more to economy than German peers – report,https://www.theguardian.com/business/2015/mar/02/medium-sized-uk-firms-add-more-economy-than-german-peers-report
1,2015-03-02T21:00:48Z,Wells Fargo caps subprime car loans to 'responsibly' manage risk,https://www.theguardian.com/business/2015/mar/02/wells-fargo-caps-subprime-car-loans
2,2015-03-02T20:14:51Z,Welcome drop in eurozone deflation and unemployment figures,https://www.theguardian.com/business/2015/mar/02/drop-eurozone-deflation-unemployment
3,2015-03-02T19:52:24Z,The IoD is right to call for more clarity on fund manager pay,https://www.theguardian.com/business/2015/mar/02/iod-right-call-clarity-fund-manager-pay
4,2015-03-02T19:28:46Z,First Virgin Trains East Coast service leaves London,https://www.theguardian.com/business/2015/mar/02/repainted-and-rebranded-virgin-trains-east-coast-service-leaves-london


In [None]:
# Overwrite the CSV with the de-duplicated rows. The header must be kept and
# the positional index left out: writing header=False/index=True strips the
# column names and adds a spurious leading column, which breaks every later
# pd.read_csv("news_data.csv") in this notebook.
news_df.to_csv("news_data.csv", mode="w", header=True, index=False)

In [None]:
# Inspect the last rows of the de-duplicated frame.
news_df.tail()

Unnamed: 0,Publication Date,Title,URL
39989,2025-02-27T14:00:50Z,The Virgin-Qatar deal is welcome but it’s not the magic bullet Australian aviation needs,https://www.theguardian.com/australia-news/2025/feb/28/qatar-virgin-airways-australia-deal-jim-chalmers
39990,2025-02-27T13:33:18Z,Drax power plant to cut carbon capture investment despite record £1bn profit,https://www.theguardian.com/business/2025/feb/27/drax-power-plant-owner-reports-highest-earnings-since-pandemic
39991,2025-02-27T11:42:59Z,Ocado to cut 500 technology and finance jobs as AI reduces costs,https://www.theguardian.com/business/2025/feb/27/ocado-to-cut-500-technology-and-finance-jobs-as-ai-reduces-costs
39992,2025-02-27T10:00:25Z,Rolls-Royce brings back dividend and announces £1bn share buyback,https://www.theguardian.com/business/2025/feb/27/rolls-royce-dividend-share-buyback-british-jet-engine-maker-profit
39993,2025-02-27T01:02:17Z,Qantas posts $1.39bn profit as holidaymakers flock to Jetstar,https://www.theguardian.com/business/2025/feb/27/qantas-1-39bn-profit-half-year-results-jetstar


**News Sentiment**

In [1]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch



In [3]:
def calculate_compound_scores(probabilities):
    """Collapse per-label probabilities into a single signed score.

    Args:
        probabilities: Mapping that contains at least the keys "positive"
            and "negative".

    Returns:
        The "positive" value minus the "negative" value.
    """
    positive_share = probabilities["positive"]
    negative_share = probabilities["negative"]
    return positive_share - negative_share

def get_news_sentiment(df, batch_size=32):
    """Score every headline in ``df["Title"]`` with the ProsusAI/finbert model.

    Headlines are tokenized and run through the model in batches so that
    memory use is bounded by ``batch_size`` rather than by the size of the
    whole frame.

    Args:
        df: DataFrame with a "Title" column of headline text.
        batch_size: Number of headlines per forward pass (must be >= 1).

    Returns:
        list[float]: one compound score per row of ``df``, in row order,
        computed as P(positive) - P(negative). Empty list for an empty frame.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Missing titles (NaN) would make the tokenizer raise; score them as "".
    titles = df["Title"].fillna("").astype(str).tolist()

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    model.eval()

    probabilities = []
    with torch.no_grad():
        for offset in range(0, len(titles), batch_size):
            batch = titles[offset:offset + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            logits = model(**inputs).logits
            probabilities.extend(torch.softmax(logits, dim=1).tolist())
    print("[Logits generated...]")
    print()

    # Order labels by class index explicitly so column j of the probabilities
    # always pairs with id2label[j], regardless of dict insertion order.
    id2label = model.config.id2label
    labels = [id2label[idx] for idx in sorted(id2label)]
    print("[Labels are listed...]")
    print()

    sentiment_scores = [dict(zip(labels, row)) for row in probabilities]
    print("[Sentiment scores are given...]")
    print()

    return [float(calculate_compound_scores(scores)) for scores in sentiment_scores]

In [4]:
# Load the saved article CSV; the first line of the file is used as the header.
news = pd.read_csv("news_data.csv")

In [8]:
# Check the loaded columns; get_news_sentiment reads the "Title" column.
news.head()

Unnamed: 0,Publish Date,Title,Source
0,2015-03-02T21:15:16Z,Medium-sized UK firms add more to economy than...,https://www.theguardian.com/business/2015/mar/...
1,2015-03-02T21:00:48Z,Wells Fargo caps subprime car loans to 'respon...,https://www.theguardian.com/business/2015/mar/...
2,2015-03-02T20:14:51Z,Welcome drop in eurozone deflation and unemplo...,https://www.theguardian.com/business/2015/mar/...
3,2015-03-02T19:52:24Z,The IoD is right to call for more clarity on f...,https://www.theguardian.com/business/2015/mar/...
4,2015-03-02T19:28:46Z,First Virgin Trains East Coast service leaves ...,https://www.theguardian.com/business/2015/mar/...


In [7]:
# Compute one compound sentiment score per headline (list of floats, row order).
scores = get_news_sentiment(news)

[Logits generated...]

[Labels are listed...]

[Sentiment scores are given...]



In [9]:
# Peek at the first ten compound scores.
scores[:10]

[0.6645181179046631,
 0.14852099120616913,
 -0.9484508037567139,
 0.0748247355222702,
 -0.07280465960502625,
 0.6064808368682861,
 -0.9198222160339355,
 0.7578344941139221,
 -0.9382691979408264,
 -0.8379787802696228]

In [10]:
# Attach the scores as a new column; this mutates `news` in place, so the
# earlier previews of `news` no longer reflect its current columns.
news["Sentiment Score"] = scores
news

Unnamed: 0,Publish Date,Title,Source,Sentiment Score
0,2015-03-02T21:15:16Z,Medium-sized UK firms add more to economy than...,https://www.theguardian.com/business/2015/mar/...,0.664518
1,2015-03-02T21:00:48Z,Wells Fargo caps subprime car loans to 'respon...,https://www.theguardian.com/business/2015/mar/...,0.148521
2,2015-03-02T20:14:51Z,Welcome drop in eurozone deflation and unemplo...,https://www.theguardian.com/business/2015/mar/...,-0.948451
3,2015-03-02T19:52:24Z,The IoD is right to call for more clarity on f...,https://www.theguardian.com/business/2015/mar/...,0.074825
4,2015-03-02T19:28:46Z,First Virgin Trains East Coast service leaves ...,https://www.theguardian.com/business/2015/mar/...,-0.072805
...,...,...,...,...
39978,2025-02-27T14:00:50Z,The Virgin-Qatar deal is welcome but it’s not ...,https://www.theguardian.com/australia-news/202...,0.163169
39979,2025-02-27T13:33:18Z,Drax power plant to cut carbon capture investm...,https://www.theguardian.com/business/2025/feb/...,0.894084
39980,2025-02-27T11:42:59Z,Ocado to cut 500 technology and finance jobs a...,https://www.theguardian.com/business/2025/feb/...,-0.811431
39981,2025-02-27T10:00:25Z,Rolls-Royce brings back dividend and announces...,https://www.theguardian.com/business/2025/feb/...,0.198386


In [11]:
# Overwrite news_data.csv with the frame that now includes "Sentiment Score";
# index=False keeps the positional index out of the file.
news.to_csv("news_data.csv", index=False)