In [11]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, acf
from statsmodels.tsa.seasonal import STL
from scipy.stats import zscore
from scipy.fft import rfft, rfftfreq
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def estimate_seasonal_period(ts, min_period=2, max_period=None):
    ts = ts.dropna().values
    n = len(ts)
    if max_period is None:
        max_period = n // 2
    if n < min_period * 2:
        return None

    freqs = rfftfreq(n)
    power = np.abs(rfft(ts))
    power[0] = 0  # Ignore DC/trend component

    search_range = power[min_period:max_period]
    if len(search_range) == 0:
        return None

    dominant_idx = search_range.argmax() + min_period
    estimated_period = int(n / dominant_idx) if dominant_idx != 0 else None
    return estimated_period if estimated_period and estimated_period < n else None

def has_significant_autocorrelation(ts, threshold=0.3, max_lag=50):
    autocorrs = acf(ts, nlags=max_lag, fft=False)
    # Ignore lag 0 and look for any significant spikes
    return np.any(np.abs(autocorrs[1:]) > threshold)

def analyze_series(ts: pd.Series):
    result = {}
    ts_clean = ts.dropna()
    if len(ts_clean) < 10:
        return {"stationary": "n/a", "seasonal": "n/a", "autocorrelated": "n/a", "noisy": "n/a"}

    # --- Stationarity ---
    try:
        adf_p = adfuller(ts_clean, autolag='AIC')[1]
        result["stationary"] = "yes" if adf_p < 0.05 else "no"
    except:
        result["stationary"] = "n/a"

    # --- Seasonality + Noise via STL ---
    try:
        period = estimate_seasonal_period(ts_clean)
        if period and period > 1:
            #result["period"] = period
            stl = STL(ts_clean, period=period, robust=True).fit()
            seasonal_strength = np.var(stl.seasonal) / np.var(ts_clean)
            noise_strength = np.var(stl.resid) / np.var(ts_clean)

            result["seasonal"] = "yes" if seasonal_strength > 0.3 else "no"
            result["noisy"] = "yes" if noise_strength > 0.5 else "no"
        else:
            result["seasonal"] = "no"
            result["noisy"] = "yes"  # if no structure found, assume it's mostly noise
    except:
        result["seasonal"] = "n/a"
        result["noisy"] = "n/a"

    # --- Autocorrelation: multiple lags ---
    try:
        result["autocorrelated"] = "yes" if has_significant_autocorrelation(ts_clean) else "no"
    except:
        result["autocorrelated"] = "n/a"

    return result


In [12]:
data_output = pd.read_csv("C:/Users/met48/Desktop/TS-Clustering/SimData/epsteinCV_outputs_active.csv", header=None)

scaler = MinMaxScaler()
data_output_scaled = scaler.fit_transform(data_output)
data_output = pd.DataFrame(data_output_scaled)

data =data_output

# --- Sample to 1280 examples if needed ---
data = data.sample(n=20000, random_state=1)

# --- Split into training and validation sets ---
train_data_pd, valid_data = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
summary = train_data_pd.apply(lambda row: pd.Series(analyze_series(row)), axis=1)
print(summary)

  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags 

      stationary seasonal noisy autocorrelated
19701        yes      yes    no            yes
27590        yes      yes    no            yes
49071        yes      yes    no            yes
26310        yes       no   yes             no
36195        yes       no   yes             no
...          ...      ...   ...            ...
22519        n/a      n/a   n/a             no
42290        yes       no    no             no
10880        n/a      n/a   n/a             no
45708        yes       no   yes             no
17258        yes      yes   yes             no

[16000 rows x 4 columns]


In [14]:
column_summary = summary.apply(lambda col: col.value_counts()).fillna(0).astype(int)
print(column_summary)

     stationary  seasonal  noisy  autocorrelated
n/a        1789      1789   1789               0
no           32      9204   2554           10884
yes       14179      5007  11657            5116


In [15]:
data_output = pd.read_csv("C:/Users/met48/Desktop/TS-Clustering/SimData/bank_reserves_outputs_poor.csv", header=None)

scaler = MinMaxScaler()
data_output_scaled = scaler.fit_transform(data_output)
data_output = pd.DataFrame(data_output_scaled)

data =data_output

# --- Sample to 1280 examples if needed ---
data = data.sample(n=20000, random_state=1)

# --- Split into training and validation sets ---
train_data_pd, valid_data = train_test_split(data, test_size=0.2, random_state=42)

In [16]:
summary = train_data_pd.apply(lambda row: pd.Series(analyze_series(row)), axis=1)
print(summary)

  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nl

      stationary seasonal noisy autocorrelated
76308         no      yes    no            yes
95793        yes      yes    no            yes
4738         yes      yes    no            yes
45998        yes      yes    no            yes
9902         yes      yes    no            yes
...          ...      ...   ...            ...
47968         no      yes    no            yes
55380        yes      yes    no            yes
9943         yes      yes    no            yes
57969         no      yes    no            yes
53162        yes      yes    no            yes

[16000 rows x 4 columns]


In [17]:
column_summary = summary.apply(lambda col: col.value_counts()).fillna(0).astype(int)
print(column_summary)

     stationary  seasonal  noisy  autocorrelated
n/a         404       404    404               0
no         4577       548  14596             431
yes       11019     15048   1000           15569


In [18]:
def import_ff_data(filename):
    expected_columns=155
    data = []
    with open(filename, 'r') as file:
        for line in file:
            row = line.strip().split(',')
            if len(row) < expected_columns:
                row += [np.nan] * (expected_columns - len(row))
            data.append(row)
    df = pd.DataFrame(data)
    def fill_last_valid(row):
        for i in range(1, len(row)):
            if pd.isna(row[i]):
                row[i] = row[i - 1]  
        return row
    df_filled = df.apply(fill_last_valid, axis=1)
    return df_filled

In [19]:
ff_burned = import_ff_data("C:/Users/met48/Desktop/TS-Clustering/SimData/forest_fire_outputs_burned.csv")

In [20]:
data_output = ff_burned
data_output_helper = data_output
new_row = [500] * data_output_helper.shape[1]
data_output_helper.loc[len(data_output_helper)] = new_row
scaler = MinMaxScaler()
scaler.fit(data_output_helper)
data_output_scaled = scaler.transform(data_output)
data_output = pd.DataFrame(data_output_scaled)


# --- Combine input + output ---
data = data_output

# --- Sample to 1280 examples if needed ---
data = data.sample(n=20000, random_state=1)

# --- Split into training and validation sets ---
train_data_pd, valid_data = train_test_split(data, test_size=0.2, random_state=42)

In [21]:
summary = train_data_pd.apply(lambda row: pd.Series(analyze_series(row)), axis=1)
print(summary)

  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: 

      stationary seasonal noisy autocorrelated
71058        yes      yes    no            yes
91267        yes       no    no            yes
47561        yes      yes    no            yes
91543        yes      yes    no            yes
5113         yes      yes    no            yes
...          ...      ...   ...            ...
36694        yes      yes    no            yes
8535         yes      yes    no            yes
92107        yes      yes    no            yes
12673        yes      yes    no            yes
74884        yes      yes    no            yes

[16000 rows x 4 columns]


In [22]:
column_summary = summary.apply(lambda col: col.value_counts()).fillna(0).astype(int)
print(column_summary)

     stationary  seasonal  noisy  autocorrelated
n/a         170       170    170               0
no          113       761  15583             319
yes       15717     15069    247           15681
