# Stationarity Testing & Feature Differencing
This notebook performs ADF and KPSS tests on key features, identifies non-stationary series, and applies differencing to make them stationary.

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller, kpss

# display
pd.set_option("display.max_columns", None)
sns.set_theme(style="whitegrid")


# Adjust the path if your notebook is located elsewhere
df = pd.read_csv('../data/processed/features_panel.csv', parse_dates=['Date'], infer_datetime_format=True)
df = df.sort_values(["Ticker","Date"]).reset_index(drop=True)
df.head()

  df = pd.read_csv('../data/processed/features_panel.csv', parse_dates=['Date'], infer_datetime_format=True)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker,Ret,SMA20,SMA50,EMA20,MACD,MACD_sig,MACD_diff,RSI14,BB_mid,BB_high,BB_low,BB_pct,BB_width
0,2022-08-01,158.467658,161.006921,158.349557,158.959763,67829400.0,AAPL,-0.006153,148.288956,,149.388499,,,,75.735517,148.288956,160.071095,136.506816,0.952838,15.890785
1,2022-08-02,157.572035,159.845557,157.109455,157.483444,59907000.0,AAPL,-0.009287,149.326315,,150.159447,,,,71.040631,149.326315,160.494536,138.158093,0.865194,14.958142
2,2022-08-03,158.300341,163.959549,158.211766,163.506821,82507500.0,AAPL,0.038248,150.468491,,151.430625,,,,77.239943,150.468491,162.498399,138.438583,1.041913,15.989936
3,2022-08-04,163.388703,164.550079,161.833649,163.191864,55474100.0,AAPL,-0.001926,151.426127,,152.550743,,,,76.319943,151.426127,164.277584,138.57467,0.957759,16.973897
4,2022-08-05,160.85603,163.457953,160.649052,162.965164,56697000.0,AAPL,-0.001389,152.338473,,153.542593,,,,75.621746,152.338473,165.734785,138.94216,0.896627,17.587563


In [53]:
# we’ll leave the raw price/volume columns alone,
# and test every other numeric column
raw_cols = ["Date","Ticker","Open","High","Low","Close","Volume"]
feature_cols = [
    c for c in df.columns
    if c not in raw_cols and pd.api.types.is_numeric_dtype(df[c])
]
print("Will test stationarity of:", feature_cols)


Will test stationarity of: ['Ret', 'SMA20', 'SMA50', 'EMA20', 'MACD', 'MACD_sig', 'MACD_diff', 'RSI14', 'BB_mid', 'BB_high', 'BB_low', 'BB_pct', 'BB_width']


In [54]:
results = []
features = ['Ret', 'SMA20', 'SMA50', 'EMA20', 'MACD', 'MACD_sig', 'MACD_diff', 'RSI14', 'BB_mid', 'BB_high', 'BB_low', 'BB_pct', 'BB_width']
for ticker, group in df.groupby('Ticker'):
    for feat in features:
        series = group[feat].dropna()
        adf_p = adfuller(series, autolag='AIC')[1]
        kpss_p = kpss(series, regression='c', nlags='auto')[1]
        results.append({'Ticker': ticker, 'Feature': feat, 'ADF p-value': adf_p, 'KPSS p-value': kpss_p})
res_df = pd.DataFrame(results)
res_df

look-up table. The actual p-value is greater than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is greater than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is greater than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is greater than the p-value returned.

  kpss_p = kpss(series, regression='c', nlags='auto')[1]
look-up table. The actual p-value is greater than the p-value returned.

  kpss_p = kpss(s

Unnamed: 0,Ticker,Feature,ADF p-value,KPSS p-value
0,AAPL,Ret,7.2297800000000005e-28,0.1
1,AAPL,SMA20,0.7478754,0.01
2,AAPL,SMA50,0.49942,0.01
3,AAPL,EMA20,0.6801657,0.01
4,AAPL,MACD,0.0002172474,0.1
5,AAPL,MACD_sig,0.0005288422,0.1
6,AAPL,MACD_diff,5.971797e-14,0.1
7,AAPL,RSI14,9.757952e-06,0.1
8,AAPL,BB_mid,0.7478754,0.01
9,AAPL,BB_high,0.5380162,0.01


In [55]:
to_diff = (
    res_df
    .query("`ADF p-value` > 0.05 or `KPSS p-value` < 0.05")
    .groupby("Feature")["Feature"]
    .first()
    .tolist()
)

print("Will difference:", to_diff)

Will difference: ['BB_high', 'BB_low', 'BB_mid', 'EMA20', 'SMA20', 'SMA50']


In [56]:
df_stat = df.copy()

for feat in to_diff:
    # new column with suffix `_stat`
    df_stat[f"{feat}_stat"] = (
        df_stat
        .groupby("Ticker")[feat]
        .transform(lambda x: x.diff())
    )
df_stat.describe()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ret,SMA20,SMA50,EMA20,MACD,MACD_sig,MACD_diff,RSI14,BB_mid,BB_high,BB_low,BB_pct,BB_width,BB_high_stat,BB_low_stat,BB_mid_stat,EMA20_stat,SMA20_stat,SMA50_stat
count,2193,2193.0,2193.0,2193.0,2193.0,2193.0,2193.0,2193.0,2103.0,2193.0,2175.0,2151.0,2151.0,2193.0,2193.0,2193.0,2193.0,2193.0,2193.0,2190.0,2190.0,2190.0,2190.0,2190.0,2100.0
mean,2024-01-12 22:05:44.733242112,189.287037,192.287224,186.295437,189.339275,67969700.0,0.000712,188.580396,187.721972,188.548166,0.573803,0.529462,-0.033764,52.055188,188.580396,205.797252,171.363539,0.534803,17.074492,0.075165,0.09114,0.083152,0.081022,0.083152,0.064206
min,2022-08-01 00:00:00,84.890877,86.0042,82.843157,82.932625,9701400.0,-0.154262,88.329755,91.859885,88.994238,-35.583228,-32.238555,-10.028737,16.560038,88.329755,91.554984,81.929446,-0.319941,3.6759,-14.72206,-15.35106,-6.9735,-8.653568,-6.9735,-4.634
25%,2023-04-21 00:00:00,149.32446,151.630619,147.661232,149.789688,33591100.0,-0.012029,148.620343,150.562757,147.709874,-2.191566,-2.063741,-0.815339,42.506298,148.620343,158.491993,137.662299,0.25579,9.361939,-0.564929,-0.526757,-0.527696,-0.477614,-0.527696,-0.283356
50%,2024-01-12 00:00:00,178.580002,181.474029,176.076896,178.649994,55751900.0,0.000974,178.182964,178.5592,178.274638,0.750967,0.707421,-0.006631,52.269161,178.182964,191.991976,166.139199,0.564387,13.795413,0.037074,0.049281,0.090025,0.12086,0.090025,0.088256
75%,2024-10-04 00:00:00,222.669998,225.087029,218.399994,221.93985,91843300.0,0.012772,223.208,222.429178,223.425851,3.115308,3.017474,0.833456,61.446648,223.208,238.755801,200.90342,0.811793,20.399905,0.730431,0.791819,0.587108,0.587242,0.587108,0.410232
max,2025-06-30 00:00:00,475.899994,488.540009,457.51001,479.859985,318679900.0,0.2269,425.515999,402.026399,414.240657,40.538606,34.756622,10.073013,88.544267,425.515999,494.109115,381.407834,1.446327,70.375514,19.851698,18.572988,7.114999,9.745769,7.114999,4.8624
std,,62.254076,63.888251,60.320264,62.089764,44003430.0,0.027521,60.799875,58.117862,60.160166,7.456158,7.060482,2.176603,12.667441,60.799875,71.924726,51.875336,0.334712,11.44079,2.335003,2.261996,1.447889,1.434466,1.447889,0.938038


In [57]:
# require that all new `_stat` columns are non-null
stat_cols = [f + "_stat" for f in to_diff]
df_stat_clean = df_stat.dropna(subset=stat_cols).reset_index(drop=True)

print("Shape before:", df.shape)
print("Shape after dropping NaNs:", df_stat_clean.shape)


Shape before: (2193, 20)
Shape after dropping NaNs: (2100, 26)


In [58]:
stat_cols = [f + "_stat" for f in to_diff]

# 10a) Drop the originals
df_final = df_stat_clean.drop(columns=to_diff)

# 10b) Rename the differenced cols to their original names
rename_map = {f"{feat}_stat": feat for feat in to_diff}
df_final = df_final.rename(columns=rename_map)

# Sanity check
print("Final shape:", df_final.shape)
df_final.head()

Final shape: (2100, 20)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker,Ret,MACD,MACD_sig,MACD_diff,RSI14,BB_pct,BB_width,BB_high,BB_low,BB_mid,EMA20,SMA20,SMA50
0,2022-09-14,152.557473,154.834168,151.394499,153.069977,87965400.0,AAPL,0.009555,-1.583036,-0.555355,-1.027681,43.356484,0.249413,16.760661,-1.280438,-0.466008,-0.873223,-0.456788,-0.873223,0.326674
1,2022-09-15,152.419493,153.000995,149.196667,150.172379,90481100.0,AAPL,-0.01893,-1.932408,-0.830766,-1.101642,39.949951,0.166471,16.088979,-1.717541,-0.46847,-1.093005,-0.689246,-1.093005,0.190182
2,2022-09-16,149.029113,149.167093,146.230062,148.526459,162278800.0,AAPL,-0.01096,-2.315409,-1.127694,-1.187715,38.117879,0.125705,15.273396,-1.890953,-0.420225,-1.155589,-0.780358,-1.155589,0.089746
3,2022-09-19,147.156536,152.330816,146.949573,152.251968,81474200.0,AAPL,0.025083,-2.291903,-1.360536,-0.931367,44.339906,0.298132,14.038395,-1.871416,0.191991,-0.839713,-0.351227,-0.839713,0.150674
4,2022-09-20,151.187519,155.800028,150.872143,154.637039,107689800.0,AAPL,0.015665,-2.057106,-1.49985,-0.557256,47.948206,0.425851,13.209713,-1.209775,0.158164,-0.525806,-0.090628,-0.525806,0.24109


In [59]:
df_final.to_csv(
    "../data/processed/features_stationary.csv",
    index=False
)


In [60]:
# … your code that builds `df_stat` …

# 1) Drop any row that has at least one NaN in any column
df_stat = df.dropna()

# 2) (Optionally) reset the index so it’s nice and tidy
df_stat = df_stat.reset_index(drop=True)

# 3) Save out the fully non‐NaN file
df_stat.to_csv("../data/processed/features_stationary.csv", index=False)