In [1]:
# Jupyter Notebook Cell
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1. Load the news data
news_df = pd.read_csv(
    "../data/processed/news_data_polygon.csv",    
    parse_dates=["ds"]        
)

news_df = news_df.rename(columns={"ds": "Date", "ticker": "Ticker", "headline": "Text"})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model     = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
sent_pipe = pipeline(
    "sentiment-analysis",
    model= model,
    tokenizer= tokenizer,
    return_all_scores=True
)

# 3. Define a helper to extract scores
def extract_scores(text):
    scores_list = sent_pipe(text)[0]   # list of dicts: [{"label":"POS","score":...}, â€¦]
    # map labels to lowercase keys
    return {d["label"].lower(): d["score"] for d in scores_list}

# 4. Run sentiment analysis (this may take a while if you have many rows)
sentiments = news_df["Text"].apply(extract_scores).apply(pd.Series)
news_df = pd.concat([news_df, sentiments], axis=1)

# 5. Aggregate to get a daily, per-ticker sentiment summary
daily_sent = (
    news_df
    .groupby(["Date","Ticker"])[["positive","neutral","negative"]]
    .mean()
    .reset_index()
)

Device set to use mps:0
  return forward_call(*args, **kwargs)


In [3]:
print(daily_sent)

                          Date Ticker  positive   neutral  negative
0    2023-06-01 00:00:00+00:00   AAPL  0.232926  0.643317  0.123757
1    2023-06-01 00:00:00+00:00  GOOGL  0.319815  0.497703  0.182482
2    2023-06-01 00:00:00+00:00   TSLA  0.292940  0.520992  0.186068
3    2023-06-02 00:00:00+00:00   AAPL  0.171210  0.651650  0.177140
4    2023-06-02 00:00:00+00:00  GOOGL  0.166422  0.706851  0.126727
...                        ...    ...       ...       ...       ...
2137 2025-05-31 00:00:00+00:00  GOOGL  0.057143  0.723633  0.219224
2138 2025-05-31 00:00:00+00:00   TSLA  0.086500  0.876373  0.037127
2139 2025-06-01 00:00:00+00:00   AAPL  0.048652  0.867846  0.083501
2140 2025-06-01 00:00:00+00:00  GOOGL  0.210159  0.747900  0.041942
2141 2025-06-01 00:00:00+00:00   TSLA  0.122550  0.854050  0.023399

[2142 rows x 5 columns]


In [4]:
# 6. Load your features (stationary) dataframe
feat_df = pd.read_csv(
    "../data/processed/features_stationary.csv",  # adjust path if needed
    parse_dates=["Date"]
)

feat_df["Date"] = pd.to_datetime(feat_df["Date"])\
                     .dt.tz_localize(None)

# 2) same for daily_sent["Date"]
daily_sent["Date"] = pd.to_datetime(daily_sent["Date"])\
                         .dt.tz_localize(None)


# 7. Merge the sentiment scores into your features
merged_df = feat_df.merge(
    daily_sent,
    on=["Date","Ticker"],
    how="left"
)

# 9. Quick check
print(merged_df.head())


        Date        Open        High         Low       Close       Volume  \
0 2022-09-13  157.593780  158.224549  151.157963  151.621185  122656600.0   
1 2022-09-14  152.557473  154.834168  151.394499  153.069977   87965400.0   
2 2022-09-15  152.419493  153.000995  149.196667  150.172379   90481100.0   
3 2022-09-16  149.029113  149.167093  146.230062  148.526459  162278800.0   
4 2022-09-19  147.156536  152.330816  146.949573  152.251968   81474200.0   

  Ticker       Ret       SMA20       SMA50  ...  MACD_diff      RSI14  \
0   AAPL -0.058680  160.654005  156.484178  ...  -1.110677  41.021561   
1   AAPL  0.009555  159.780782  156.810853  ...  -1.027681  43.356484   
2   AAPL -0.018930  158.687776  157.001034  ...  -1.101642  39.949951   
3   AAPL -0.010960  157.532187  157.090781  ...  -1.187715  38.117879   
4   AAPL  0.025083  156.692474  157.241455  ...  -0.931367  44.339906   

       BB_mid     BB_high      BB_low    BB_pct   BB_width  positive  neutral  \
0  160.654005  17

In [5]:

merged_df["has_sentiment"] = merged_df["positive"].notna().astype(int)

# neutral imputation
for col in ["positive","neutral","negative"]:
    merged_df[col].fillna(0.0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df[col].fillna(0.0, inplace=True)


In [6]:
# 8. Save the new dataframe
merged_df.to_csv(
    "../data/processed/features_with_sentiment.csv",
    index=False
)