In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np
import plotly.graph_objects as go
from datetime import timedelta
from sklearn.preprocessing import StandardScaler

In [83]:
# Prefix of the input CSV files read by the loading cell (data/{INDEX}_*.csv).
INDEX = "MS8"
# Ticker symbol used to filter every dataset down to a single instrument.
TICKER = "BB"

In [104]:
# --- Google Trends -----------------------------------------------------------
# The file is wide (one column per ticker); reshape to long form, keep only
# TICKER and add the row-to-row change of the trend score.
trends_data = pd.read_csv(f"data/{INDEX}_trends.csv", index_col=False)
trends_long = trends_data.melt(id_vars=["date"], var_name="ticker", value_name="trend")
trends = trends_long.loc[trends_long["ticker"] == TICKER].copy()
trends = trends.assign(trend_diff=trends["trend"].diff())

# --- Short interest ----------------------------------------------------------
# Already in long form; keep TICKER and add the row-to-row percentage change of
# days-to-cover, shares short and volume.
shorts_data = pd.read_csv(f"data/{INDEX}_short_interest_W.csv", index_col=False)
shorts = shorts_data.loc[shorts_data["ticker"] == TICKER].copy()
for source_col in ("d2c", "shorts", "volume"):
    shorts[f"{source_col}_pct"] = shorts[source_col].pct_change()

# --- Prices and volatility ---------------------------------------------------
# Wide file again: reshape to long form, align the date column name with the
# other frames, then keep TICKER only.
prices_long = (
    pd.read_csv(f"data/{INDEX}_prices.csv", index_col=False)
    .melt(id_vars=["Date"], var_name="ticker", value_name="prices")
    .rename(columns={"Date": "date"})
)
prices = prices_long.loc[prices_long["ticker"] == TICKER].copy()
# Row-to-row return, its 10-row rolling standard deviation, and the change in
# that rolling volatility.
prices["returns"] = prices["prices"].pct_change()
prices["volatility"] = prices["returns"].rolling(window=10).std()
prices["volatility_diff"] = prices["volatility"].diff()

In [105]:
# Parse the date columns so the three frames share a datetime key.
for frame in (shorts, prices, trends):
    frame["date"] = pd.to_datetime(frame["date"])

# Keep short-interest rows from 1 June 2020 onwards.
shorts = shorts[shorts["date"] >= "2020-06-01"]

In [106]:
# Merge all sources onto the Google Trends dates.
#
# merge_asof requires both frames to be ordered by the `on` key ("date") across
# the WHOLE frame; `by="ticker"` only restricts which rows may match each other.
# Sorting by ticker first happens to work while a single ticker is present but
# fails as soon as there are several, so sort by date alone (stable, to keep
# the original order of any tied dates).
merged = pd.merge_asof(
    trends.sort_values("date", kind="stable"),
    shorts.sort_values("date", kind="stable"),
    on="date",
    by="ticker",
    direction="backward",
    # Backward-only match: latest short-interest row dated at most 10 days
    # BEFORE (or on) the trends date; later rows are never used.
    tolerance=pd.Timedelta("10D"),
)
df = pd.merge_asof(
    merged.sort_values("date", kind="stable"),
    prices.sort_values("date", kind="stable"),
    on="date",
    by="ticker",
    direction="backward",
    # Latest price row dated at most 3 days before (or on) the trends date.
    tolerance=pd.Timedelta("3D"),
)

# Drop rows where any of the core inputs is missing (no match within the
# tolerance, or not enough history for the rolling volatility).
df = df.dropna(subset=["returns", "d2c", "shorts", "volume", "trend", "volatility"])

In [114]:
# Preview the first rows of the merged frame to sanity-check the joins.
df.head(15)

Unnamed: 0,date,ticker,trend,trend_diff,d2c,shorts,volume,d2c_pct,shorts_pct,volume_pct,prices,returns,volatility,volatility_diff
0,2020-06-21,BB,25,,3.52,24249312.0,6895743.0,-0.558344,-0.10361,1.032047,5.06,-0.003937,0.047652,-0.002844
2,2020-07-05,BB,23,-1.0,5.12,26017542.0,5084496.0,0.454545,0.072919,-0.262662,4.85,0.014644,0.019719,0.001157
4,2020-07-19,BB,25,1.0,7.61,24777599.0,3254738.0,0.486328,-0.047658,-0.35987,4.8,0.032258,0.026963,0.001883
6,2020-08-02,BB,28,2.0,9.58,26334563.0,2748465.0,0.25887,0.062838,-0.15555,4.74,0.0,0.025441,-0.002095
7,2020-08-09,BB,28,0.0,9.58,26334563.0,2748465.0,0.25887,0.062838,-0.15555,4.84,-0.024194,0.021714,0.000706
8,2020-08-16,BB,28,0.0,9.83,27744981.0,2822509.0,0.026096,0.053558,0.02694,4.8,-0.018405,0.015777,0.001568
9,2020-08-23,BB,28,0.0,9.83,27744981.0,2822509.0,0.026096,0.053558,0.02694,4.75,-0.016563,0.010915,-0.001331
11,2020-09-06,BB,27,0.0,8.57,24423676.0,2848779.0,-0.128179,-0.119708,0.009307,5.11,-0.013513,0.039301,-0.000197
13,2020-09-20,BB,28,0.0,6.06,24892819.0,4107562.0,-0.292882,0.019209,0.441868,4.86,0.027484,0.018255,-0.003189
15,2020-10-04,BB,27,0.0,5.08,26812588.0,5275025.0,-0.161716,0.077121,0.284223,4.44,-0.030568,0.02152,-0.001265


In [101]:
# Build one-row-lagged copies of the explanatory variables so that each row's
# return is compared with the values of the previous row.
# NOTE(review): shift(1) moves by one ROW, not one calendar week; if rows were
# dropped upstream the lag may span more than a week — confirm this is intended.
lag_sources = {
    "d2c_lag": "d2c",
    "shorts_lag": "shorts",
    "volu_lag": "volume",
    "trend_lag": "trend",
    "vola_lag": "volatility",
}
for lag_col, source_col in lag_sources.items():
    df[lag_col] = df[source_col].shift(1)

# Pearson correlation of returns against the lagged variables.
corr_columns = ["returns", "d2c_lag", "trend_lag", "vola_lag", "shorts_lag", "volu_lag"]
df[corr_columns].corr(method="pearson")

Unnamed: 0,returns,d2c_lag,trend_lag,vola_lag,shorts_lag,volu_lag
returns,1.0,-0.032205,-0.101261,0.091582,-0.134698,0.12744
d2c_lag,-0.032205,1.0,-0.133687,-0.414124,0.121333,-0.413864
trend_lag,-0.101261,-0.133687,1.0,0.416509,0.231122,0.275408
vola_lag,0.091582,-0.414124,0.416509,1.0,0.001178,0.721905
shorts_lag,-0.134698,0.121333,0.231122,0.001178,1.0,-0.001133
volu_lag,0.12744,-0.413864,0.275408,0.721905,-0.001133,1.0


In [115]:
# Inspect the single row dated 2021-02-07.
df[df["date"] == "2021-02-07"]

Unnamed: 0,date,ticker,trend,trend_diff,d2c,shorts,volume,d2c_pct,shorts_pct,volume_pct,prices,returns,volatility,volatility_diff
33,2021-02-07,BB,35,-15.0,1.0,20406949.0,177806832.0,-0.618321,-0.530717,9.703847,13.23,0.088889,0.215233,-0.00014


In [117]:
# Flag a row as a squeeze (1) when the return exceeds +10% AND the short
# position shrank by more than 10% versus the previous report; otherwise 0.
# NOTE(review): `returns` comes from a row-to-row pct_change on the price file,
# which may be a single-day move rather than a weekly return — confirm.
strong_gain = df["returns"] > 0.10
shorts_covered = df["shorts_pct"] < -0.10
df["squeeze_signal"] = (strong_gain & shorts_covered).astype(int)

# Number of rows flagged as squeezes.
print(df["squeeze_signal"].sum())

0


In [120]:
def normalize(series):
    """Rescale ``series`` linearly onto [-1, 1].

    The minimum maps to -1 and the maximum to +1, so differently scaled
    columns can be drawn on one shared axis.

    Parameters
    ----------
    series : pandas.Series (or any object with ``min``/``max`` and arithmetic)
        Values to rescale. Missing values stay missing.

    Returns
    -------
    Same type as ``series``
        Rescaled values. A constant input has no range to stretch, so it is
        returned as all zeros (the midpoint of [-1, 1]) instead of the NaNs
        that a 0/0 division would produce.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Flat input: avoid 0/0. Multiplying by 0.0 keeps the index, yields a
        # float result and leaves NaN entries as NaN.
        return (series - lowest) * 0.0
    return 2 * ((series - lowest) / value_range) - 1

fig = go.Figure()

# (column, legend label, shown by default). Series that are not shown by
# default start as "legendonly" and can be toggled from the legend.
trace_specs = [
    ("returns", "Returns", True),
    ("prices", "Price", False),
    ("d2c", "Days to Cover", False),
    ("trend", "Google Trends", False),
    ("volatility", "Volatility", False),
    ("shorts", "Shorts", False),
    ("shorts_pct", "Shorts Pct", True),
    ("volume", "Avg Daily Volume", False),
]
for column, label, shown in trace_specs:
    visibility = {} if shown else {"visible": "legendonly"}
    fig.add_trace(
        go.Scatter(x=df["date"], y=normalize(df[column]), name=label, **visibility)
    )

# Shade a one-week vertical band starting at every date flagged as a squeeze.
squeeze_dates = df.loc[df["squeeze_signal"] == 1, "date"]
for band_start in squeeze_dates:
    fig.add_shape(
        type="rect",
        x0=band_start,
        x1=band_start + timedelta(weeks=1),
        # yref="paper" expresses y in figure fractions, so 0..1 spans the
        # full plot height regardless of the data range.
        y0=0,
        y1=1,
        xref='x',
        yref='paper',
        fillcolor="rgba(200,200,200,0.5)",
        line_width=0,
        layer="below",
    )

fig.update_layout(
    title=f"{TICKER}",
    hovermode="x unified",
    template="plotly_white",
)

fig.show()


In [237]:
# Columns used by the OLS regression: the target plus the lagged predictors.
OLS_COLUMNS = ["returns", "trend_lag", "d2c_lag", "shorts_lag", "vola_lag", "volu_lag"]

# Whether the regression should run on z-scored columns. The previous version
# of this cell computed the z-scores and then rebuilt `df_scaled` from the raw
# frame, so the regression silently ran on raw units. The default keeps that
# effective behavior; set to True to actually standardise.
STANDARDIZE_OLS_INPUTS = False

# Complete cases only (the first row has no lagged values).
ols_inputs = df[OLS_COLUMNS].dropna()

scaler = StandardScaler()
scaled_vals = scaler.fit_transform(ols_inputs)

if STANDARDIZE_OLS_INPUTS:
    # Build a fresh frame rather than assigning into a derived one, keeping
    # the original row labels so results can be traced back to `df`.
    df_scaled = pd.DataFrame(scaled_vals, columns=OLS_COLUMNS, index=ols_inputs.index)
else:
    df_scaled = ols_inputs

In [238]:
# Fit an ordinary least squares model of returns on the lagged predictors.
ols_predictors = ["trend_lag", "d2c_lag", "shorts_lag", "volu_lag", "vola_lag"]

# add_constant prepends the intercept column.
X = sm.add_constant(df_scaled[ols_predictors])
y = df_scaled["returns"]

ols_spec = sm.OLS(y, X)
model = ols_spec.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.216
Model:                            OLS   Adj. R-squared:                  0.200
Method:                 Least Squares   F-statistic:                     13.78
Date:                Thu, 26 Jun 2025   Prob (F-statistic):           6.86e-12
Time:                        15:22:26   Log-Likelihood:                 306.01
No. Observations:                 256   AIC:                            -600.0
Df Residuals:                     250   BIC:                            -578.8
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0044      0.005      0.829      0.4

In [None]:
# Lagged predictors used by the squeeze classifier.
LOGIT_FEATURES = ["d2c_lag", "trend_lag", "vola_lag", "volu_lag", "shorts_lag"]

# Standardise the predictors into a SEPARATE frame. Writing the z-scores back
# into `df` would change the data seen by every other cell that reads these
# columns (e.g. the correlation and OLS cells) depending on execution order.
scaler = StandardScaler()
logit_features_scaled = pd.DataFrame(
    scaler.fit_transform(df[LOGIT_FEATURES]),
    columns=LOGIT_FEATURES,
    index=df.index,
)

# Target plus scaled predictors; drop rows with missing values (from the shift
# or from gaps in the inputs).
df_clean = df[["squeeze_signal"]].join(logit_features_scaled).dropna()

# Design matrix (with intercept) and binary target.
X = sm.add_constant(df_clean[LOGIT_FEATURES])
y = df_clean["squeeze_signal"]

# NOTE(review): a logit needs both classes in `y`; if no row is flagged as a
# squeeze the fit cannot be estimated — check the squeeze count first.
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Model summary.
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.248919
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:            squeeze_day   No. Observations:                  256
Model:                          Logit   Df Residuals:                      250
Method:                           MLE   Df Model:                            5
Date:                Thu, 26 Jun 2025   Pseudo R-squ.:                  0.1226
Time:                        14:25:33   Log-Likelihood:                -63.723
converged:                       True   LL-Null:                       -72.628
Covariance Type:            nonrobust   LLR p-value:                  0.003195
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.8407      0.313     -9.080      0.000      -3.454      -2.227
d2c_lag        0.2079      0.