![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [1]:
#Data Reading
import pandas as pd
import os

print(os.listdir('.')) # List files in the current directory

df = pd.read_csv('headlines.csv')
df = df.drop(columns=["CP"])    # Dropping close prices
df = df.rename(columns = {"Title":"text", "Date":"date"})
df.head()

['.DS_Store', 'qc_error.code-workspace', 'backtests', 'research.ipynb', 'config.json', 'qc.code-workspace', '__pycache__', 'report.html', '.vscode', 'main.py', 'Data']


FileNotFoundError: [Errno 2] No such file or directory: 'headlines.csv'

In [None]:
#Data Cleaning
df["date"] = pd.to_datetime(df["date"]).dt.date
df = df.sort_values(by="date")
df = df.reset_index(drop=True)

df = df.drop_duplicates(subset=["date", "text"]).reset_index(drop=True)
print(f"Rows after drop: {len(df)}")

df.head()
df.describe()


In [None]:
#Assigning Sentiment Scores
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
from tqdm import tqdm

#Initialize VADER sentiment analyzer
vader = SentimentIntensityAnalyzer()
finbert = pipeline(
    "sentiment-analysis",
    model="yiyanghkust/finbert-tone",
    tokenizer="yiyanghkust/finbert-tone",
    device=-1                  
)

#Defining a function to get sentiment scores
def get_sentiment(text):
    # VADER
    vader_score = vader.polarity_scores(text)["compound"]

    # FinBERT
    finbert_pred  = finbert(text, truncation=True, max_length=128)[0]
    label, score  = finbert_pred["label"].upper(), finbert_pred["score"]
    finbert_score = score if label == "POSITIVE" else -score

    return vader_score, finbert_score

#Applying the function to the DataFrame
tqdm.pandas(desc="Calculating Sentiment Scores")
df[["vader_score", "finbert_score"]] = df["text"].progress_apply(
    lambda x: pd.Series(get_sentiment(x))
    )

df.head()

In [2]:
import pandas_market_calendars as mcal
import numpy as np

#Loading daily sentiment scores
daily = df.groupby("date")[["vader_score", "finbert_score"]].mean()

#Bulding a market calendar and full index
nyse = mcal.get_calendar("NYSE") 
all_days = nyse.schedule(start_date=daily.index.min(), end_date=daily.index.max())  

#Reindex the daily sentiment scores to include all market days
daily = daily.reindex(all_days.index, fill_value=np.nan)
daily.index.name = "date"

#Forward fill up to 2
daily["vader_ff2"]   = daily["vader_score"].ffill(limit=2)
daily["finbert_ff2"] = daily["finbert_score"].ffill(limit=2)

print(daily.loc["2008-01-01":"2008-01-10"])


NameError: name 'df' is not defined