## start from scratch

**News Data**

In [None]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

In [None]:
def get_news_data(API_KEY: str, from_date: str, to_date:str):
    try:
      start = datetime.strptime(from_date, "%Y-%m-%d").date()
      end = datetime.strptime(to_date, "%Y-%m-%d").date()
      dates_generated = [start + timedelta(days=x) for x in range(0, (end-start).days+1)]
      # print("dates done")

      articles = []
      for days in tqdm(dates_generated):
          days = datetime.strftime(days, "%Y-%m-%d")

          BASE_URL = "https://content.guardianapis.com/search"
          params = {
              "api-key": API_KEY,
              "from-date": days,
              "to-date": days,
              "section": "business",
              # "sectionName": "business news",
              # "q": f"{dow_jones_companies[ticker]}",
              "show-fields": "headline, body",
              "order-by": "newest",
              "page-size": 100
          }

          response = requests.get(BASE_URL, params)
          if response is not None:
              data = response.json()

              for article in data["response"]["results"]:
                  articles.append({
                      "Title": article["webTitle"],
                      "URL": article["webUrl"],
                      "Publication Date": article["webPublicationDate"],
                  })
          else:
              pass

      df = pd.DataFrame(articles)
      df = df.set_index("Publication Date")

      return df

    except:
      df = pd.DataFrame(articles)
      df = df.set_index("Publication Date")
      print("Limit completed...")
      return df

  # CHANGES: change data collection to real time collection, adding directly in the loop itself

In [None]:
news = get_news_data("75e3c8c0-28e6-4166-961c-a72883c8ea3a", "2023-04-07", "2025-02-27")
# 1e78027b-d07c-4e35-9a0a-8f1d2b4e5549
# 75e3c8c0-28e6-4166-961c-a72883c8ea3a

100%|██████████| 693/693 [16:59<00:00,  1.47s/it]  


In [None]:
news.head()

Unnamed: 0_level_0,Title,URL
Publication Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-07T16:17:24Z,"Cheap flights, Brexit, now Dover chaos – is this the end of the road for continental coach tours?",https://www.theguardian.com/business/2023/apr/07/dover-chaos-adds-to-strain-on-uks-continental-coach-operators
2023-04-07T13:43:11Z,Jes Staley’s lawyers hit out at ‘slanderous’ attacks by JP Morgan,https://www.theguardian.com/business/2023/apr/07/jes-staley-lawyers-hit-out-at-slanderous-attacks-by-jp-morgan
2023-04-07T13:38:36Z,"US adds 236,000 jobs in March as labor market weakens",https://www.theguardian.com/business/2023/apr/07/us-jobs-report-march-2023
2023-04-07T11:00:02Z,‘I could barely speak. I felt like a ghost inside my own skin’: the month that shook the CBI,https://www.theguardian.com/business/2023/apr/07/i-could-barely-speak-i-felt-like-a-ghost-inside-my-own-skin-the-month-that-shook-the-cbi
2023-04-07T08:00:47Z,Workers protest Energizer’s plans to close Wisconsin plants,https://www.theguardian.com/business/2023/apr/07/energizer-wisconsin-factory-closure-job-loss


In [None]:
len(news)

6627

In [None]:
news.to_csv("news_data.csv", mode="a", header=False, index=True)

**Process Data** (remove duplicates)

In [None]:
import pandas as pd

# remove duplicates from the data
news_df = pd.read_csv("new_news_data.csv")

news_df.drop_duplicates(inplace=True)
news_df.head()

Unnamed: 0,2009-01-01T20:52:00Z,US government seizes control of Citigroup expenses after bail-out,https://www.theguardian.com/business/2009/jan/01/citigroup-banking
0,2009-01-01T20:14:00Z,New year predictions point to deepest UK reces...,https://www.theguardian.com/business/2009/jan/...
1,2009-01-01T16:24:09Z,Softwear patches have not solved the smartphon...,https://www.theguardian.com/business/2009/jan/...
2,2009-01-01T16:20:55Z,Cost of MG Rover inquiry tops £14m,https://www.theguardian.com/business/2009/jan/...
3,2009-01-01T14:22:50Z,Longannet is at the centre of Scottish Power's...,https://www.theguardian.com/business/2009/jan/...
4,2009-01-01T00:01:00Z,Small investors may lose out as Madoff prepare...,https://www.theguardian.com/business/2009/jan/...


In [None]:
news_df.to_csv("new_news_data.csv", mode="w", header=False, index=True)

In [None]:
news_df.tail()

Unnamed: 0,2009-01-01T20:52:00Z,US government seizes control of Citigroup expenses after bail-out,https://www.theguardian.com/business/2009/jan/01/citigroup-banking
44379,2016-01-01T18:41:33Z,Slowdown in Chinese manufacturing deepens fear...,https://www.theguardian.com/world/2016/jan/01/...
44380,2016-01-01T17:51:51Z,Rolls-Royce faces new questions in Brazil corr...,https://www.theguardian.com/business/2016/jan/...
44381,2016-01-01T17:48:17Z,Natwest and RBS customers hit by another banki...,https://www.theguardian.com/business/2016/jan/...
44382,2016-01-01T16:39:06Z,The business figures with most at stake in 2016,https://www.theguardian.com/business/2016/jan/...
44383,2016-01-01T16:21:15Z,Osborne prepares to pick right moment for Lloy...,https://www.theguardian.com/business/2016/jan/...


**News Sntiment**

In [1]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [2]:
def calculate_compound_scores(probabilities):
    return probabilities['positive'] - probabilities['negative']

def get_news_sentiment(df):

    titles = list(df["Title"].values)

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    inputs = tokenizer(titles, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(**inputs).logits
    print("[Logits generated...]")
    print()

    probabilities = scipy.special.softmax(logits.numpy(), axis=1)
    labels = list(model.config.id2label.values())
    print("[Labels are listed...]")
    print()

    sentiment_scores = []
    for i, title in enumerate(titles):
        scores_dict = {labels[j]: probabilities[i][j] for j in range(len(labels))}
        sentiment_scores.append(scores_dict)
    print("[Sentiment scores are given...]")
    print()

    compound_scores = [calculate_compound_scores(scores) for scores in sentiment_scores]
    compound_scores = [float(x) for x in compound_scores]

    return compound_scores

In [8]:
news = pd.read_csv("new_news_data.csv", header=None)

In [10]:
news.columns = ["Sl. No", "Publish Date", "Title", "URL"]

In [11]:
news.head()

Unnamed: 0,Sl. No,Publish Date,Title,URL
0,0,2009-01-01T20:14:00Z,New year predictions point to deepest UK reces...,https://www.theguardian.com/business/2009/jan/...
1,1,2009-01-01T16:24:09Z,Softwear patches have not solved the smartphon...,https://www.theguardian.com/business/2009/jan/...
2,2,2009-01-01T16:20:55Z,Cost of MG Rover inquiry tops £14m,https://www.theguardian.com/business/2009/jan/...
3,3,2009-01-01T14:22:50Z,Longannet is at the centre of Scottish Power's...,https://www.theguardian.com/business/2009/jan/...
4,4,2009-01-01T00:01:00Z,Small investors may lose out as Madoff prepare...,https://www.theguardian.com/business/2009/jan/...


In [12]:
news.columns

Index(['Sl. No', 'Publish Date', 'Title', 'URL'], dtype='object')

In [13]:
# # news.rename(columns={0: "Sl. No", 1: "Publish Date", 2: "Title", 3: "URL"}, inplace=True)
# news.drop(columns=["Sl. No"])
news.to_csv('new_news_data.csv', index=False) # save to new csv file

In [16]:
news.drop(news.columns[0], axis=1, inplace=True)
news.head()

Unnamed: 0,Publish Date,Title,URL
0,2009-01-01T20:14:00Z,New year predictions point to deepest UK reces...,https://www.theguardian.com/business/2009/jan/...
1,2009-01-01T16:24:09Z,Softwear patches have not solved the smartphon...,https://www.theguardian.com/business/2009/jan/...
2,2009-01-01T16:20:55Z,Cost of MG Rover inquiry tops £14m,https://www.theguardian.com/business/2009/jan/...
3,2009-01-01T14:22:50Z,Longannet is at the centre of Scottish Power's...,https://www.theguardian.com/business/2009/jan/...
4,2009-01-01T00:01:00Z,Small investors may lose out as Madoff prepare...,https://www.theguardian.com/business/2009/jan/...


In [18]:
news.to_csv('new_news_data.csv', index=False)

In [19]:
news = pd.read_csv("new_news_data.csv")

In [20]:
scores = get_news_sentiment(news)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[Logits generated...]

[Labels are listed...]

[Sentiment scores are given...]



In [21]:
scores[:10]

[-0.9440757036209106,
 -0.7698552012443542,
 0.06630305200815201,
 0.7493084669113159,
 -0.9234235286712646,
 -0.10073378682136536,
 -0.8768173456192017,
 -0.3178471624851227,
 -0.9042825698852539,
 -0.0006157010793685913]

In [22]:
news["Sentiment Score"] = scores
news

Unnamed: 0,Publish Date,Title,URL,Sentiment Score
0,2009-01-01T20:14:00Z,New year predictions point to deepest UK reces...,https://www.theguardian.com/business/2009/jan/...,-0.944076
1,2009-01-01T16:24:09Z,Softwear patches have not solved the smartphon...,https://www.theguardian.com/business/2009/jan/...,-0.769855
2,2009-01-01T16:20:55Z,Cost of MG Rover inquiry tops £14m,https://www.theguardian.com/business/2009/jan/...,0.066303
3,2009-01-01T14:22:50Z,Longannet is at the centre of Scottish Power's...,https://www.theguardian.com/business/2009/jan/...,0.749308
4,2009-01-01T00:01:00Z,Small investors may lose out as Madoff prepare...,https://www.theguardian.com/business/2009/jan/...,-0.923424
...,...,...,...,...
44353,2016-01-01T18:41:33Z,Slowdown in Chinese manufacturing deepens fear...,https://www.theguardian.com/world/2016/jan/01/...,-0.911292
44354,2016-01-01T17:51:51Z,Rolls-Royce faces new questions in Brazil corr...,https://www.theguardian.com/business/2016/jan/...,-0.532394
44355,2016-01-01T17:48:17Z,Natwest and RBS customers hit by another banki...,https://www.theguardian.com/business/2016/jan/...,-0.956507
44356,2016-01-01T16:39:06Z,The business figures with most at stake in 2016,https://www.theguardian.com/business/2016/jan/...,-0.012372


In [23]:
news.to_csv("new_news_data.csv", index=False)

**Average scores based on same date**

In [1]:
import pandas as pd

df = pd.read_csv("new_news_data.csv")
df.head()

Unnamed: 0,Publish Date,Title,URL,Sentiment Score
0,2009-01-01T20:14:00Z,New year predictions point to deepest UK reces...,https://www.theguardian.com/business/2009/jan/...,-0.944076
1,2009-01-01T16:24:09Z,Softwear patches have not solved the smartphon...,https://www.theguardian.com/business/2009/jan/...,-0.769855
2,2009-01-01T16:20:55Z,Cost of MG Rover inquiry tops £14m,https://www.theguardian.com/business/2009/jan/...,0.066303
3,2009-01-01T14:22:50Z,Longannet is at the centre of Scottish Power's...,https://www.theguardian.com/business/2009/jan/...,0.749308
4,2009-01-01T00:01:00Z,Small investors may lose out as Madoff prepare...,https://www.theguardian.com/business/2009/jan/...,-0.923424


In [5]:
df['Publish Date'] = pd.to_datetime(df['Publish Date'], errors='coerce')
df['Publish Date'] = df['Publish Date'].dt.date

In [6]:
df['Publish Date'].dtype

dtype('O')

In [7]:
df.head()

Unnamed: 0,Publish Date,Title,URL,Sentiment Score
0,2009-01-01,New year predictions point to deepest UK reces...,https://www.theguardian.com/business/2009/jan/...,-0.944076
1,2009-01-01,Softwear patches have not solved the smartphon...,https://www.theguardian.com/business/2009/jan/...,-0.769855
2,2009-01-01,Cost of MG Rover inquiry tops £14m,https://www.theguardian.com/business/2009/jan/...,0.066303
3,2009-01-01,Longannet is at the centre of Scottish Power's...,https://www.theguardian.com/business/2009/jan/...,0.749308
4,2009-01-01,Small investors may lose out as Madoff prepare...,https://www.theguardian.com/business/2009/jan/...,-0.923424


In [8]:
df_sentiment_avg = df.groupby("Publish Date", as_index=False).agg(Average=("Sentiment Score", "mean"))
df_sentiment_avg.head()

Unnamed: 0,Publish Date,Average
0,2009-01-01,-0.295749
1,2009-01-02,-0.169914
2,2009-01-03,-0.250856
3,2009-01-04,-0.248705
4,2009-01-05,-0.047598


In [9]:
df_sentiment_avg

Unnamed: 0,Publish Date,Average
0,2009-01-01,-0.295749
1,2009-01-02,-0.169914
2,2009-01-03,-0.250856
3,2009-01-04,-0.248705
4,2009-01-05,-0.047598
...,...,...
2511,2015-12-28,0.117331
2512,2015-12-29,0.318152
2513,2015-12-30,-0.095663
2514,2015-12-31,-0.171726


In [10]:
df_sentiment_avg.to_csv("new_sentiment_avg.csv", index=False, header=True)

**Combine News and Stocks Data**

In [None]:
import pandas as pd
df = pd.read_csv("./stocks_data/AAPL_data.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,EMA_12,EMA_26,MACD,Signal,RSI,CCI,ADX
0,2015-03-02 00:00:00-05:00,28.865108,29.095135,28.652947,28.829374,192386800,28.829374,28.829374,0.0,0.0,45.526105,56.240216,28.673114
1,2015-03-03 00:00:00-05:00,28.800338,28.925401,28.60604,28.889668,151265200,28.83865,28.83384,0.00481,0.000962,45.526105,56.240216,28.673114
2,2015-03-04 00:00:00-05:00,28.831606,28.934334,28.65741,28.706539,126665200,28.818325,28.824411,-0.006085,-0.000448,45.526105,56.240216,28.673114
3,2015-03-05 00:00:00-05:00,28.715477,28.753442,28.085693,28.230856,226068400,28.727945,28.780444,-0.052498,-0.010858,45.526105,56.240216,28.673114
4,2015-03-06 00:00:00-05:00,28.675274,28.891902,28.197354,28.273285,291368400,28.657998,28.742876,-0.084879,-0.025662,45.526105,56.240216,28.673114


In [None]:
df["Date"].dtype

dtype('O')

In [None]:
import os
import pandas as pd
# from datetime import date

directory = os.fsencode("./stocks_data")

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        # import Stock data and convert date format
        df1 = pd.read_csv("./stocks_data/" + filename)
        df1["Date"] = pd.to_datetime(df1["Date"], utc=True).dt.tz_convert(None).dt.date

        # import sentiment data and convert dateformat
        df2 = pd.read_csv("sentiment_avg.csv")
        df2["Publish Date"] = pd.to_datetime(df2["Publish Date"]).dt.date

        # merge dataframes
        df = pd.merge(df1, df2, left_on="Date", right_on="Publish Date")
        df = df.drop("Publish Date", axis=1)
        df.rename(columns={"Average": "Sentiment Average"}, inplace=True)

        # rewrite existing csv file with additional changes
        df.to_csv("./stocks_data/" + filename, index=False)


In [11]:
df1 = pd.read_csv("^DJI_data.csv")
df1["Date"] = pd.to_datetime(df1["Date"], utc=True).dt.tz_convert(None).dt.date

# import sentiment data and convert dateformat
df2 = pd.read_csv("new_sentiment_avg.csv")
df2["Publish Date"] = pd.to_datetime(df2["Publish Date"]).dt.date

# merge dataframes
df = pd.merge(df1, df2, left_on="Date", right_on="Publish Date")
df = df.drop("Publish Date", axis=1)
df.rename(columns={"Average": "Sentiment Average"}, inplace=True)

# rewrite existing csv file with additional changes
df.to_csv("^DJI_data.csv", index=False)
