In [1]:
import pandas as pd

In [15]:
INDEX = "R3000"

# Load every raw input table for the selected index.
prices = pd.read_csv(f"./data/{INDEX}/all_prices_D.csv")
shorts = pd.read_csv(f"./data/{INDEX}/all_short_interest.csv")
news = pd.read_csv(f"./data/{INDEX}/all_news_volume_D.csv")
wiki = pd.read_csv(f"./data/{INDEX}/wiki_views.csv")

# Keep only the prices of tickers listed in the filtered ticker file;
# both sides are compared as strings.
tickers = pd.read_csv(f"./data/{INDEX}/all_tickers_filtered.csv")["ticker"].unique()
in_universe = prices["ticker"].astype(str).isin([str(t) for t in tickers])
prices = prices[in_universe].copy()

In [16]:
# Look up the close price in effect on each short-interest date.

def _with_clean_keys(frame):
    """Return a copy of `frame` with string tickers, tz-naive parsed dates,
    rows with missing keys removed, and only tickers present in `tickers`."""
    cleaned = frame.copy()
    cleaned["ticker"] = cleaned["ticker"].astype(str)
    # Unparseable dates become NaT and are dropped just below.
    cleaned["date"] = pd.to_datetime(cleaned["date"], errors="coerce").dt.tz_localize(None)
    cleaned = cleaned.dropna(subset=["ticker", "date"])
    keep = cleaned["ticker"].isin([str(t) for t in tickers])
    return cleaned[keep].copy()


prices = _with_clean_keys(prices)
shorts = _with_clean_keys(shorts)

# merge_asof requires both sides to be ordered by the "on" key; the stable
# sort keeps the existing relative order of rows that share a date.
left = (
    shorts[["ticker", "date"]]
    .drop_duplicates()
    .sort_values(by="date", kind="mergesort")
    .reset_index(drop=True)
)
right = (
    prices[["ticker", "date", "close"]]
    .sort_values(by="date", kind="mergesort")
    .reset_index(drop=True)
)

# For every (ticker, short-interest date): the most recent close dated on or
# before that date. No tolerance is set, so the matched close may be
# arbitrarily older than the short-interest date.
prices_on_short = pd.merge_asof(
    left,
    right,
    on="date",
    by="ticker",
    direction="backward",
    allow_exact_matches=True,
)

In [17]:
# Count the news items between consecutive short-interest dates of a ticker:
# cumulative news up to the current short date minus cumulative news up to
# the ticker's previous short date.

# Put both frames on tz-naive calendar days so their date keys are comparable.
shorts["date"] = pd.to_datetime(shorts["date"], errors="coerce").dt.tz_localize(None).dt.floor("D")
news["date"]  = pd.to_datetime(news["date"],  errors="coerce", utc=True).dt.tz_convert(None).dt.floor("D")

shorts = shorts.dropna(subset=["ticker","date"]).reset_index(drop=True)
news  = news.dropna(subset=["ticker","date"]).reset_index(drop=True)

# Previous short-interest date of the same ticker (NaT for a ticker's first date).
shorts = shorts.sort_values(["ticker","date"], kind="mergesort").reset_index(drop=True)
shorts["prev_date"] = shorts.groupby("ticker")["date"].shift(1)

# Running news total per ticker. Missing or non-numeric counts are treated
# as 0 first: cumsum leaves a NaN on the row of a NaN input, and if the as-of
# lookup landed on such a row the later fillna(0) would reset the cumulative
# total to 0 and corrupt (even make negative) the window count.
news = news.sort_values(["ticker","date"], kind="mergesort").reset_index(drop=True)
news["news_count"] = pd.to_numeric(news["news_count"], errors="coerce").fillna(0)
news["cum_news"] = news.groupby("ticker")["news_count"].cumsum()

# merge_asof requires both sides sorted by the "on" key.
left_curr  = shorts[["ticker","date"]].sort_values("date", kind="mergesort").reset_index(drop=True)
right_curr = news[["ticker","date","cum_news"]].sort_values("date", kind="mergesort").reset_index(drop=True)

# Cumulative news as of each short-interest date (latest news row on or before it).
curr = pd.merge_asof(
    left=left_curr,
    right=right_curr,
    on="date",
    by="ticker",
    direction="backward",
    allow_exact_matches=True
).rename(columns={"cum_news":"cum_curr"})

# Same lookup, evaluated at the previous short-interest date.
prev_left = (
    shorts[["ticker","prev_date"]]
    .dropna(subset=["prev_date"])
    .rename(columns={"prev_date":"date_left"})
    .sort_values("date_left", kind="mergesort")
    .reset_index(drop=True)
)

right_prev = news[["ticker","date","cum_news"]].sort_values("date", kind="mergesort").reset_index(drop=True)

prev = pd.merge_asof(
    left=prev_left,
    right=right_prev,
    left_on="date_left",
    right_on="date",
    by="ticker",
    direction="backward",
    allow_exact_matches=True
).rename(columns={"cum_news":"cum_prev"})[["ticker","date_left","cum_prev"]]

# Difference of the two cumulative totals. A missing total (no earlier news
# row, or no previous short date) counts as 0, so a ticker's first short date
# receives all the news up to that date.
news_window = (
    curr
    .merge(shorts[["ticker","date","prev_date"]], on=["ticker","date"], how="left")
    .merge(prev, left_on=["ticker","prev_date"], right_on=["ticker","date_left"], how="left")
    .assign(
        cum_curr=lambda d: d["cum_curr"].fillna(0),
        cum_prev=lambda d: d["cum_prev"].fillna(0),
        news_since_prev_short=lambda d: (d["cum_curr"] - d["cum_prev"]).astype("int64")
    )[["ticker","date","news_since_prev_short"]]
    .sort_values(["ticker","date"], kind="mergesort")
    .reset_index(drop=True)
)

In [18]:
# Wikipedia page views accumulated between consecutive short-interest dates.
# Reads shorts["prev_date"], which must already exist when this cell runs.

# Normalise the wiki dates to tz-naive calendar days and order rows by ticker/date.
wiki["date"] = (
    pd.to_datetime(wiki["date"], errors="coerce", utc=True)
    .dt.tz_convert(None)
    .dt.floor("D")
)
wiki = (
    wiki.dropna(subset=["ticker", "date"])
    .reset_index(drop=True)
    .sort_values(["ticker", "date"], kind="mergesort")
    .reset_index(drop=True)
)

# Running total of views per ticker; non-numeric or missing views count as 0.
wiki["wiki_views"] = pd.to_numeric(wiki["wiki_views"], errors="coerce").fillna(0)
wiki["cum_wiki"] = wiki.groupby("ticker")["wiki_views"].cumsum()

# merge_asof requires both sides sorted by the "on" key.
left_curr = (
    shorts[["ticker", "date"]]
    .sort_values("date", kind="mergesort")
    .reset_index(drop=True)
)
right_curr = (
    wiki[["ticker", "date", "cum_wiki"]]
    .sort_values("date", kind="mergesort")
    .reset_index(drop=True)
)

# Cumulative views as of each short-interest date.
curr = pd.merge_asof(
    left_curr,
    right_curr,
    on="date",
    by="ticker",
    direction="backward",
    allow_exact_matches=True,
).rename(columns={"cum_wiki": "cum_curr"})

# Cumulative views as of the previous short-interest date of the same ticker.
prev_left = (
    shorts[["ticker", "prev_date"]]
    .dropna(subset=["prev_date"])
    .rename(columns={"prev_date": "date_left"})
    .sort_values("date_left", kind="mergesort")
    .reset_index(drop=True)
)
right_prev = right_curr.copy()

prev = pd.merge_asof(
    prev_left,
    right_prev,
    left_on="date_left",
    right_on="date",
    by="ticker",
    direction="backward",
    allow_exact_matches=True,
).rename(columns={"cum_wiki": "cum_prev"})
prev = prev[["ticker", "date_left", "cum_prev"]]

# Views in the window = current cumulative total minus previous cumulative
# total, with a missing total on either side counted as 0.
wiki_window = (
    curr
    .merge(shorts[["ticker", "date", "prev_date"]], how="left", on=["ticker", "date"])
    .merge(prev, how="left", left_on=["ticker", "prev_date"], right_on=["ticker", "date_left"])
    .assign(
        wiki_views_since_prev_short=lambda d: (
            d["cum_curr"].fillna(0) - d["cum_prev"].fillna(0)
        ).astype("int64")
    )
    [["ticker", "date", "wiki_views_since_prev_short"]]
    .sort_values(["ticker", "date"], kind="mergesort")
    .reset_index(drop=True)
)

In [19]:
# Join the short-interest rows onto the close sampled at each short date;
# the short-interest "volume" column is renamed to "si_volume".
df_merged = prices_on_short.merge(
    shorts.rename(columns={"volume": "si_volume"}),
    how="left",
    on=["ticker", "date"],
)
df_merged = df_merged.sort_values(["ticker", "date"]).reset_index(drop=True)

In [20]:
# Add the per-window news counts; rows without a match get a count of 0.
df_merged = df_merged.merge(news_window, how="left", on=["ticker", "date"])
df_merged["news_since_prev_short"] = (
    df_merged["news_since_prev_short"].fillna(0).astype("int64")
)
df_merged = df_merged.sort_values(["ticker", "date"]).reset_index(drop=True)

In [21]:
# Add the per-window Wikipedia views; rows without a match get 0 views.
df_merged = df_merged.merge(wiki_window, how="left", on=["ticker", "date"])
df_merged["wiki_views_since_prev_short"] = (
    df_merged["wiki_views_since_prev_short"].fillna(0).astype("int64")
)
df_merged = df_merged.sort_values(["ticker", "date"]).reset_index(drop=True)

In [26]:
# Number of distinct tickers in the merged frame (a missing ticker would count as one value).
print(df_merged["ticker"].nunique(dropna=False))

635


In [24]:
# Finalise the merged frame: drop the helper column, remove incomplete rows,
# move the key columns to the front and apply the final column names.

# errors="ignore" keeps this cell re-runnable: df_merged is overwritten in
# place, so on a second run "prev_date" has already been removed and a plain
# drop would raise KeyError.
df_merged = df_merged.drop(columns=["prev_date"], errors="ignore")
# Any row with a missing value in any column is discarded.
df_merged = df_merged.dropna().sort_values(["ticker", "date"]).reset_index(drop=True)
df_merged = df_merged[["date","ticker", *[c for c in df_merged.columns if c not in ("date","ticker")]]]
# NOTE(review): names are applied by position, so this presumes exactly eight
# columns in this order — confirm against the short-interest CSV column order.
df_merged.columns = ["date", "ticker", "close", "d2c", "shorts", "volume", "news_volume", "wiki_views"]


In [25]:
# Show the final merged frame as the cell output.
df_merged

Unnamed: 0,date,ticker,close,d2c,shorts,volume,news_volume,wiki_views
0,2020-08-14,AA,14.640,2.57,15596730,6075805,34,3
1,2020-08-31,AA,14.620,3.88,13035710,3358959,20,6
2,2020-09-15,AA,13.780,3.48,12782965,3677581,19,15
3,2020-09-30,AA,11.630,2.36,12587689,5328743,38,2
4,2020-10-15,AA,12.300,2.05,11134193,5444408,36,7
...,...,...,...,...,...,...,...,...
68838,2025-07-15,ZYME,12.880,18.15,6868563,378522,0,3
68839,2025-07-31,ZYME,12.560,16.64,6806921,409137,3,4
68840,2025-08-15,ZYME,12.155,13.08,6807564,520341,5,1
68841,2025-08-29,ZYME,12.155,17.64,6885888,390459,1,1


In [28]:
# Write the merged dataset to CSV without the row index.
merged_csv_path = f"./data/{INDEX}/merged_data.csv"
df_merged.to_csv(merged_csv_path, index=False)