# single customer - single feature - long history

In [1]:
%pyspark


# Value of the "sn" column used below to select the single series analysed in this section.
SN_NUM = "ABB24511524"

In [2]:
%pyspark

# `col` is not imported anywhere else in this notebook; import it here so the
# paragraph also runs in a fresh interpreter session.
from pyspark.sql.functions import col

snr_data_path = "/user/ZheS//owl_anomally/capacity_records/"
#snr_data_path ="/user/ZheS//owl_anomally////capacity_pplan50127_sliced"
feature_col = "avg_4gsnr"
time_col = "hour"
columns = ["sn", time_col, feature_col]

df_snr_all = spark.read.parquet(snr_data_path)

# Keep only the rows of the selected serial number, ordered by time, and
# collect them to the driver as a pandas DataFrame.
df_cap_hour_pd = (
    df_snr_all.select(columns)
    .filter(col("sn") == SN_NUM)
    .orderBy("sn", time_col)  # use the configured time column instead of a literal
    .toPandas()
)



In [3]:
%pyspark
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from typing import Optional, Union
from pyod.models.auto_encoder_torch import AutoEncoder

class AutoencoderAnomalyDetector:
    """
    Window-based anomaly detector for a single univariate time series.

    The series is cut into windows of ``n_lags`` consecutive values, an
    outlier model (a pyod ``AutoEncoder`` unless a custom ``model`` is given)
    is fitted on those windows, and windows whose score exceeds the
    ``threshold_percentile``-th percentile of the training scores are flagged.

    Workflow: ``prepare()`` -> ``fit()`` -> ``get_anomaly_stats()`` / plots.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 time_col: str,
                 feature: str,
                 window_type: str = "sliding",
                 n_lags: int = 24,
                 model_params: Optional[dict] = None,
                 model: Optional[object] = None,
                 scaler: Union[str, object, None] = "standard",
                 threshold_percentile: float = 99
                 ):
        """
        Initialization.

        Parameters
        ----------
        df : pd.DataFrame
            Input data; a copy is stored, the caller's frame is not modified.
        time_col : str
            Name of the time column in ``df``.
        feature : str
            Name of the value column in ``df``.
        window_type : {'sliding', 'block'}
            'sliding' for overlapping windows, 'block' for non-overlapping ones.
        n_lags : int
            Window length (number of consecutive values per sample).
        model_params : dict, optional
            Overrides for the default autoencoder parameters.
        model : object, optional
            If provided, this custom model will be used instead of the default autoencoder.
            It must offer ``fit``, ``decision_scores_`` and ``decision_function``.
        scaler : {'standard', 'minmax', object, None}
            'standard' for StandardScaler, 'minmax' for MinMaxScaler,
            a custom scaler instance (must implement fit_transform), or None.
        threshold_percentile : float
            Percentile of the training scores used as the outlier threshold.
        """
        self.df_raw = df.copy()
        self.time_col = time_col
        self.feature = feature
        self.window_type = window_type
        self.n_lags = n_lags
        self.model_params = model_params
        # Honour the caller-supplied model (it used to be silently discarded).
        self.external_model = model
        self.scaler_type = scaler
        self.scaler = None
        self.model = None
        self.threshold_percentile = threshold_percentile

        self.df = None
        self.input_data = None
        self.input_data_scaled = None

        self.anomaly_scores = None
        self.threshold_scores = None

    def _format_time_series(self):
        """Return a copy of the two working columns renamed to 'ds' / 'y' plus a constant 'unique_id'."""
        df = self.df_raw[[self.time_col, self.feature]].copy()
        df = df.rename(columns={self.time_col: "ds", self.feature: "y"})
        df["unique_id"] = "series_1"
        return df

    def _segment_time_series(self, series: pd.Series) -> np.ndarray:
        """
        Generate lagged input sequences from a univariate time series.

        The windowing mode is taken from ``self.window_type``:
            - 'sliding': overlapping windows; window k covers positions
              ``k .. k + n_lags - 1`` and the final possible window is skipped,
              so ``len(series) - n_lags`` windows are produced.
            - 'block': non-overlapping segments; a trailing partial block is dropped.

        Parameters
        ----------
        series : pd.Series
            Input univariate time series.

        Returns
        -------
        np.ndarray
            2D array where each row is a lagged input sequence.

        Raises
        ------
        ValueError
            If ``self.window_type`` is neither 'sliding' nor 'block'.
        """
        if self.window_type == "sliding":
            windows = [
                series.iloc[end - self.n_lags:end].values
                for end in range(self.n_lags, len(series))
            ]
        elif self.window_type == "block":
            num_blocks = len(series) // self.n_lags
            windows = [
                series.iloc[b * self.n_lags:(b + 1) * self.n_lags].values
                for b in range(num_blocks)
            ]
        else:
            raise ValueError("Invalid window_type. Choose 'sliding' or 'block'.")

        if not windows:
            # Keep the result 2D even when the series is too short for one window.
            return np.empty((0, self.n_lags))
        return np.array(windows)

    def _apply_scaler(self, X: np.ndarray) -> np.ndarray:
        """Fit the configured scaler on ``X`` and return the scaled array (``X`` itself when scaling is off)."""
        if self.scaler_type is None:
            return X
        if self.scaler_type == "standard":
            self.scaler = StandardScaler()
        elif self.scaler_type == "minmax":
            from sklearn.preprocessing import MinMaxScaler
            self.scaler = MinMaxScaler()
        elif isinstance(self.scaler_type, str):
            # An unknown name would otherwise be used as if it were a scaler object.
            raise ValueError(
                f"Unknown scaler '{self.scaler_type}'. Use 'standard', 'minmax', a scaler instance or None."
            )
        else:
            self.scaler = self.scaler_type
        return self.scaler.fit_transform(X)

    def prepare(self):
        """Build the window matrix (``input_data``) and its scaled version (``input_data_scaled``)."""
        self.df = self._format_time_series()
        self.input_data = self._segment_time_series(self.df["y"])
        self.input_data_scaled = self._apply_scaler(self.input_data)

    def _init_model(self):
        """Return the external model if one was given, otherwise a pyod AutoEncoder with default parameters."""
        if self.external_model is not None:
            return self.external_model

        default_params = {
            "hidden_neurons": [self.n_lags, 4, 4, self.n_lags],
            "hidden_activation": "relu",
            "epochs": 20,
            "batch_norm": True,
            "learning_rate": 0.001,
            "batch_size": 32,
            "dropout_rate": 0.2,
        }
        if self.model_params:
            default_params.update(self.model_params)
        return AutoEncoder(**default_params)

    def fit(self, threshold_percentile=None):
        """
        Fit the model on the prepared windows and compute the score threshold.

        Parameters
        ----------
        threshold_percentile : float, optional
            Overrides the percentile given at construction for this fit.

        Raises
        ------
        ValueError
            If ``prepare()`` has not been called.
        """
        if self.input_data_scaled is None:
            raise ValueError("Call prepare() before fit().")
        if threshold_percentile is None:
            threshold_percentile = self.threshold_percentile

        self.model = self._init_model()
        self.model.fit(self.input_data_scaled)

        self.anomaly_scores = self.model.decision_scores_
        self.threshold_scores = np.percentile(self.anomaly_scores, threshold_percentile)

    def predict(self, input_series: pd.Series) -> np.ndarray:
        """
        Score a new series with the fitted model.

        The series is windowed exactly like the training data and, when a
        scaler was fitted, transformed with it before scoring.

        Raises
        ------
        ValueError
            If ``fit()`` has not been called.
        """
        if self.model is None:
            raise ValueError("Call fit() before predict().")

        input_matrix = self._segment_time_series(input_series)

        if self.scaler is not None:
            input_matrix = self.scaler.transform(input_matrix)

        return self.model.decision_function(input_matrix)

    def _score_positions(self) -> np.ndarray:
        """
        Row positions of ``self.df`` that the anomaly scores are attached to.

        'sliding': score k -> row ``n_lags + k`` (same rows as ``iloc[n_lags:]``).
        'block'  : score k -> last row of block k.
        """
        n_scores = len(self.anomaly_scores)
        if self.window_type == "sliding":
            return np.arange(self.n_lags, self.n_lags + n_scores)
        return (np.arange(n_scores) + 1) * self.n_lags - 1

    def plot_score_distribution(self, title_id):
        """Plot a histogram of the training anomaly scores; ``title_id`` is appended to the title."""
        if self.anomaly_scores is None:
            raise ValueError("Model not trained. Call fit() first.")
        plt.figure(figsize=(10, 4))
        plt.hist(self.anomaly_scores, bins=20, edgecolor='black')
        plt.title(f"Histogram of Anomaly Scores at {title_id}")
        plt.xlabel("Anomaly Score")
        plt.ylabel("Frequency")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def plot_series_with_anomalies(self, title_id):
        """Plot the original series together with the anomaly scores on a shared time axis."""
        if self.anomaly_scores is None:
            raise ValueError("Model not trained. Call fit() first.")

        plt.figure(figsize=(16, 6))
        plt.plot(self.df['ds'], self.df['y'], label="Original Time Series", color="blue")
        # Pick one timestamp per score; slicing with [n_lags:] only has the
        # right length for sliding windows and fails for block windows.
        plt.plot(
            self.df['ds'].iloc[self._score_positions()].values,
            self.anomaly_scores,
            color="orange",
            label="Anomaly Score",
            linewidth=2
        )
        plt.xlabel("Time")
        plt.ylabel("Value / Anomaly Score")
        plt.title(f"Time Series and Anomaly Scores at {title_id}")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def get_anomaly_stats(self):
        """
        Return the records flagged as anomalies.

        Returns
        -------
        pd.DataFrame
            Rows whose score exceeds the threshold, with columns
            ['sn' (only if present in the input), time_col, feature, 'is_outlier'].

        Raises
        ------
        ValueError
            If ``fit()`` has not been called.
        """
        if self.anomaly_scores is None:
            raise ValueError("Model not trained. Call fit() first.")

        is_outlier = self.anomaly_scores > self.threshold_scores

        # Select the rows the scores are attached to, depending on windowing type
        if self.window_type == "sliding":
            base_df = self.df_raw.iloc[self.n_lags:].copy()
        else:  # "block"
            total_windows = len(self.anomaly_scores)
            base_df = self.df_raw.iloc[:total_windows * self.n_lags].copy()
            # One representative row per block: the last value of every column.
            base_df = base_df.groupby(np.arange(len(base_df)) // self.n_lags).last().reset_index(drop=True)

        base_df["anomaly_score"] = self.anomaly_scores
        base_df["is_outlier"] = is_outlier

        # "sn" is reported when available but is not required in the input.
        id_cols = ["sn"] if "sn" in base_df.columns else []
        anomaly_df = base_df[base_df["is_outlier"]][id_cols + [self.time_col, self.feature, "is_outlier"]]

        return anomaly_df




In [4]:
%pyspark
# Detector for the single series loaded above: overlapping windows of 24
# consecutive values, and no scaling (scaler=None makes _apply_scaler return
# the window matrix unchanged).
detector = AutoencoderAnomalyDetector(  df=df_cap_hour_pd, 
                                        time_col="hour", 
                                        feature="avg_4gsnr",
                                        window_type = "sliding",
                                        n_lags = 24,
                                        scaler = None)


In [5]:
%pyspark
# Build the lagged window matrix (detector.input_data) and its scaled counterpart.
detector.prepare()

In [6]:
%pyspark
# Inspect the window matrix shape: one row per window, n_lags values per row.
detector.input_data.shape

In [7]:
%pyspark

# Fit the model on the windows and set the outlier threshold at the configured
# percentile of the training scores (99 unless overridden).
detector.fit()



In [8]:
%pyspark
# Histogram of the training anomaly scores; SN_NUM is only used in the plot title.
detector.plot_score_distribution(SN_NUM)



In [9]:
%pyspark
# Plot the raw series together with the per-window anomaly scores on one time axis.
detector.plot_series_with_anomalies(SN_NUM)

In [10]:
%pyspark
# Rows whose window score exceeds the threshold
# (columns: "sn", the time column, the feature column, "is_outlier").
result = detector.get_anomaly_stats()

# Bare expression, presumably so the notebook displays the DataFrame.
result

# multiple customer - single feature - short history


## Prepare dataframe

In [13]:
%pyspark


# Source for the multi-slice section: a parquet dataset that carries a
# "slice_id" column (selected below).
snr_data_path ="/user/ZheS//owl_anomally////capacity_pplan50127_sliced"
feature_col = "avg_4gsnr"
time_col = "hour"
# NOTE(review): `columns` is defined but not used by the select() below.
columns = ["sn", time_col, feature_col]

df_snr_all = spark.read.parquet(snr_data_path)

# Collect only slice id, time and feature columns to the driver as pandas.
df_pandas = df_snr_all.select("slice_id", "hour", "avg_4gsnr").toPandas()
# Preview the first three rows.
df_pandas.head(3)



In [14]:
%pyspark
# Number of distinct slice ids in the Spark DataFrame.
df_snr_all.select("slice_id").distinct().count()

## building Model

In [16]:
%pyspark
# Slice-based detector: uses the AutoencoderAnomalyDetector variant that accepts
# `slice_col` (defined in the "Define Class" cell), so that cell must have been
# run before this one. scaler=None turns scaling off.
detector = AutoencoderAnomalyDetector(  df=df_pandas, 
                                        time_col="hour", 
                                        feature="avg_4gsnr",
                                        slice_col = "slice_id",
                                        scaler = None)

# t = detector._build_tensor_from_slices()

In [17]:
%pyspark
# Stack the per-slice series into detector.input_tensor with shape
# (n_samples, n_timesteps, 1).
detector.prepare()

In [18]:
%pyspark
# Fit on the tensor flattened to 2D (one row per slice) and derive the
# percentile-based score threshold.
detector.fit()

In [19]:
%pyspark
import matplotlib.pyplot as plt
from types import MethodType

import numpy as np
from types import MethodType

def predict_score_and_label(self, input_array: np.ndarray, threshold: float = None):
    """
    Predict anomaly scores and labels for new data.

    Parameters
    ----------
    input_array : np.ndarray
        Array of shape (n_samples, n_lags) matching the model's input structure.
        A 1D array is treated as a single sample.
    threshold : float, optional
        Manual threshold for outlier decision. If None, uses trained `self.threshold_scores`.

    Returns
    -------
    dict
        {
            "anomaly_scores": np.ndarray,
            "is_outlier": np.ndarray (bool)
        }

    Raises
    ------
    ValueError
        If the model is not fitted, or no threshold is available.
    """
    if self.model is None:
        raise ValueError("Model is not trained. Call fit() first.")

    if threshold is None:
        if self.threshold_scores is None:
            raise ValueError("Threshold not defined. Either provide it or call fit() first.")
        threshold = self.threshold_scores

    # Accept a single series given as a 1D vector.
    input_array = np.asarray(input_array)
    if input_array.ndim == 1:
        input_array = input_array.reshape(1, -1)

    # Optionally scale input
    if self.scaler is not None:
        fitted_on_single_column = getattr(self.scaler, "n_features_in_", None) == 1
        if fitted_on_single_column and input_array.shape[1] != 1:
            # The slice-based detector fits its scaler on one flattened column,
            # so new samples have to be flattened the same way and restored.
            original_shape = input_array.shape
            input_array = self.scaler.transform(input_array.reshape(-1, 1)).reshape(original_shape)
        else:
            input_array = self.scaler.transform(input_array)

    scores = self.model.decision_function(input_array)
    is_outlier = scores > threshold

    return {
        "anomaly_scores": scores,
        "is_outlier": is_outlier
    }


# Bind the function to this detector instance only; the class itself is not modified.
detector.predict_score_and_label = MethodType(predict_score_and_label, detector)


## get_anomaly_stats

In [22]:
%pyspark
# One row per slice with its anomaly score and outlier flag.
df_outlier = detector.get_anomaly_stats()
# Show only the slices flagged as outliers.
df_outlier[ df_outlier["is_outlier"]==True ]

## plot_anomaly_score_distribution

In [24]:
%pyspark
# Histogram of the training anomaly scores with the threshold drawn as a vertical line.
detector.plot_anomaly_score_distribution()

## predict_score_and_label

In [26]:
%pyspark
import numpy as np

# Select the rows of a single slice.
df_one_slice = df_pandas[ df_pandas["slice_id"]=="ABB24700945_0" ]

# Assume df_one_slice is your 170-row DataFrame
# reshape(1, -1) turns the feature column into one row, i.e. one sample.
# NOTE(review): values are taken in DataFrame row order, while training sorts
# every slice by the time column — confirm the slice is already time-ordered.
input_array = df_one_slice["avg_4gsnr"].values.reshape(1, -1)
result = detector.predict_score_and_label(input_array)
result

## plot_series_btw_anomaly_and_normal

In [28]:
%pyspark

# Bind plot_time_series_by_category to this detector instance.
# NOTE(review): the function is not defined in the visible part of this
# notebook; it must already exist in the session when this cell runs.
detector.plot_time_series_by_category = MethodType(plot_time_series_by_category, detector)

In [29]:
%pyspark
# Plot 100 normal samples
# (relies on the instance-bound plot_time_series_by_category from the cell above).
detector.plot_time_series_by_category(category="normal", n_samples=100)



In [30]:
%pyspark
# Plot 100 abnormal samples
# (relies on the instance-bound plot_time_series_by_category).
detector.plot_time_series_by_category(category="abnormal", n_samples=100)

In [31]:
%pyspark



# Plot samples with scores in a specific range
# presumably category=(0, 6) is a (min_score, max_score) pair — confirm against the function definition.
detector.plot_time_series_by_category(category=(0, 6), n_samples=10)

In [34]:
%pyspark
# NOTE(review): plot_sample_series_by_anomaly is neither defined on the detector
# classes shown in this notebook nor bound in a visible cell — confirm its origin.
detector.plot_sample_series_by_anomaly()

## plot_mean_and_spread

In [36]:
%pyspark
# NOTE(review): plot_mean_and_spread is neither defined on the detector classes
# shown in this notebook nor bound in a visible cell — confirm its origin.
detector.plot_mean_and_spread()

## predict_and_compare_with_normal

In [38]:
%pyspark
from matplotlib import cm
def predict_and_compare_with_normal(self,
                                    input_array: np.ndarray,
                                    title_id: str = "",
                                    n_normal_samples: int = 100,
                                    normal_score_range: Optional[tuple] = None):
    """
    Predict anomaly score of an input time series and compare it with normal training samples.

    Only the first row of ``input_array`` is scored for the annotation and
    drawn; the normal samples are drawn from ``self.input_tensor``.

    Parameters
    ----------
    input_array : np.ndarray
        Shape (1, time_steps). The new time series to evaluate.
        A 1D array is reshaped to a single row.
    title_id : str
        Identifier for labeling the input series on the plot.
    n_normal_samples : int
        Number of normal training series to plot for comparison.
    normal_score_range : tuple, optional
        If provided, defines (min_score, max_score) range to select normal samples
        from the training data. Overrides threshold-based selection.

    Raises
    ------
    ValueError
        If the model is not fitted, or no training sample falls in the
        requested score range.
    """
    if self.model is None or self.anomaly_scores is None:
        raise ValueError("Model not trained. Call fit() first.")

    # Ensure 2D shape for input
    input_array = np.asarray(input_array)
    if input_array.ndim == 1:
        input_array = input_array.reshape(1, -1)

    if self.scaler is not None:
        fitted_on_single_column = getattr(self.scaler, "n_features_in_", None) == 1
        if fitted_on_single_column and input_array.shape[1] != 1:
            # The slice-based detector fits its scaler on one flattened column,
            # so the new series is flattened the same way and restored.
            input_array_scaled = self.scaler.transform(
                input_array.reshape(-1, 1)
            ).reshape(input_array.shape)
        else:
            input_array_scaled = self.scaler.transform(input_array)
    else:
        input_array_scaled = input_array

    scores = self.model.decision_function(input_array_scaled)
    labels = scores > self.threshold_scores

    # Select normal samples from training
    scores_all = self.anomaly_scores
    if normal_score_range is not None:
        min_score, max_score = normal_score_range
        normal_idx = np.where((scores_all >= min_score) & (scores_all <= max_score))[0]
    else:
        normal_idx = np.where(scores_all <= self.threshold_scores)[0]

    sample_n_normal = min(n_normal_samples, len(normal_idx))
    if sample_n_normal == 0:
        raise ValueError("No normal samples found in the specified range.")

    selected_idx = np.random.choice(normal_idx, size=sample_n_normal, replace=False)
    normal_samples = self.input_tensor[selected_idx, :, 0]

    # Plot
    plt.figure(figsize=(12, 5))
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was removed in Matplotlib 3.9.
    cmap = plt.get_cmap('viridis', sample_n_normal)
    for i, series in enumerate(normal_samples):
        plt.plot(series, color=cmap(i), alpha=0.5)

    # Plot the input series
    input_series = input_array[0]
    label = 'Abnormal' if labels[0] else 'Normal'
    color = 'red' if labels[0] else 'darkgreen'
    series_label = f"Input Series ({title_id})" if title_id else "Input Series"
    plt.plot(input_series, linewidth=2.5, color=color, label=series_label)

    # Add annotation in bottom-right
    annotation = (f"Score: {scores[0]:.4f}\n"
                  f"Thresh: {self.threshold_scores:.4f}\n"
                  f"Outlier: {label}")
    plt.annotate(annotation,
                 xy=(1.0, 0.0), xycoords='axes fraction',
                 xytext=(-10, 10), textcoords='offset points',
                 ha='right', va='bottom',
                 fontsize=9,
                 bbox=dict(boxstyle="round", fc="white", ec=color, alpha=0.8))

    title = "Comparison of Input Series with Normal Samples"
    if title_id:
        title = f"{title} at {title_id}"
    plt.title(title)
    plt.xlabel("Time Index")
    plt.ylabel(self.feature)
    # Show the input-series label; top-right keeps it clear of the annotation box.
    plt.legend(loc="upper right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()



# Bind the function to this detector instance only; the class itself is not modified.
detector.predict_and_compare_with_normal = MethodType(predict_and_compare_with_normal, detector)


In [39]:
%pyspark
import numpy as np

# Select the rows of a single slice.
df_one_slice = df_pandas[ df_pandas["slice_id"]=="ABB24700945_0" ]

# Assume df_one_slice is your 170-row DataFrame
# reshape(1, -1) turns the feature column into one row, i.e. one sample.
input_array = df_one_slice["avg_4gsnr"].values.reshape(1, -1)
# Compare against 10 training samples whose score is at or below the threshold.
detector.predict_and_compare_with_normal(input_array,
                                        n_normal_samples = 10,
                                        title_id = "ABB24700945_0"
                                            )


In [40]:
%pyspark
# Same comparison, but the reference samples are drawn from training series
# whose anomaly score lies in [0, 5] instead of using the threshold.
detector.predict_and_compare_with_normal(input_array,
                                        n_normal_samples = 10,
                                        title_id = "ABB24700945_0",
                                        normal_score_range = [0,5]
                                            )

## Define Class

In [42]:
%pyspark
class AutoencoderAnomalyDetector:
    """
    Slice-based anomaly detector: every slice is one sample, scored as a whole.

    The rows of each ``slice_col`` group are ordered by ``time_col`` and stacked
    into a tensor; an outlier model (a pyod ``AutoEncoder`` unless
    ``external_model`` is given) is fitted on the flattened tensor and slices
    whose score exceeds a percentile of the training scores are flagged.

    Workflow: ``prepare()`` -> ``fit()`` -> ``get_anomaly_stats()`` /
    ``predict_score_and_label()``.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 time_col: str,
                 feature: str,
                 slice_col: str = "slice_id",
                 model_params: Optional[dict] = None,
                 external_model: Optional[object] = None,
                 scaler: Union[str, object, None] = None,
                 threshold_percentile: float = 99):
        """
        Parameters
        ----------
        df : pd.DataFrame
            Long-format data with one row per (slice, time) pair; a copy is stored.
        time_col : str
            Column used to order the values inside every slice.
        feature : str
            Column holding the values.
        slice_col : str
            Column identifying the slice each row belongs to.
        model_params : dict, optional
            Overrides for the default autoencoder parameters.
        external_model : object, optional
            Custom model used instead of the default autoencoder.
        scaler : {'standard', 'minmax', object, None}
            Scaler name, scaler instance, or None for no scaling. The string
            "None" (the former default) is accepted as an alias of None.
        threshold_percentile : float
            Percentile of the training scores used as the outlier threshold.
        """
        self.df_raw = df.copy()
        self.time_col = time_col
        self.feature = feature
        self.slice_col = slice_col
        self.model_params = model_params
        self.external_model = external_model
        # The old default was the *string* "None", which was then used as if it
        # were a scaler object; map it to a real None.
        if isinstance(scaler, str) and scaler.strip().lower() == "none":
            scaler = None
        self.scaler_type = scaler
        self.scaler = None
        self.model = None
        self.threshold_percentile = threshold_percentile

        self.slice_ids = None  # slice ids in the same order as the tensor rows
        self.input_tensor = None
        self.input_tensor_scaled = None
        self.anomaly_scores = None
        self.threshold_scores = None

    def _build_tensor_from_slices(self):
        """
        Stack the time-ordered feature values of every slice into one tensor.

        Also records ``self.slice_ids`` so scores can be matched to slices later.

        Returns
        -------
        np.ndarray
            Array of shape (n_samples, n_timesteps, 1).

        Raises
        ------
        ValueError
            If there are no slices or the slices differ in length.
        """
        slice_ids = []
        rows = []
        for slice_id, group in self.df_raw.groupby(self.slice_col):
            slice_ids.append(slice_id)
            rows.append(group.sort_values(by=self.time_col)[self.feature].values)

        if not rows:
            raise ValueError(f"No slices found in column '{self.slice_col}'.")
        lengths = {len(row) for row in rows}
        if len(lengths) != 1:
            raise ValueError(
                "All slices must have the same number of rows; "
                f"found lengths {sorted(lengths)}."
            )

        self.slice_ids = slice_ids
        tensor_3d = np.stack(rows)[:, :, np.newaxis]  # shape: (n_samples, n_timesteps, 1)
        return tensor_3d

    def _apply_scaler(self, X: np.ndarray) -> np.ndarray:
        """Fit the configured scaler on all values of ``X`` (as one column) and return the scaled tensor."""
        if self.scaler_type is None:
            return X
        flat_X = X.reshape(-1, X.shape[-1])  # flatten across time axis
        if self.scaler_type == "standard":
            self.scaler = StandardScaler()
        elif self.scaler_type == "minmax":
            from sklearn.preprocessing import MinMaxScaler
            self.scaler = MinMaxScaler()
        elif isinstance(self.scaler_type, str):
            raise ValueError(
                f"Unknown scaler '{self.scaler_type}'. Use 'standard', 'minmax', a scaler instance or None."
            )
        else:
            self.scaler = self.scaler_type
        scaled_flat = self.scaler.fit_transform(flat_X)
        return scaled_flat.reshape(X.shape)

    def _scale_new_samples(self, X: np.ndarray) -> np.ndarray:
        """Apply the fitted scaler to a 2D (n_samples, n_timesteps) array of new data."""
        if self.scaler is None:
            return X
        # The scaler was fitted on a single flattened column (see _apply_scaler),
        # so new data must be flattened the same way and restored afterwards.
        return self.scaler.transform(X.reshape(-1, 1)).reshape(X.shape)

    def prepare(self):
        """Build ``input_tensor`` from the slices and its scaled version ``input_tensor_scaled``."""
        tensor = self._build_tensor_from_slices()
        self.input_tensor = tensor
        self.input_tensor_scaled = self._apply_scaler(tensor)

    def _init_model(self):
        """Return the external model if given, otherwise a pyod AutoEncoder sized to the slice length."""
        if self.external_model is not None:
            return self.external_model
        n_timesteps = self.input_tensor.shape[1]
        default_params = {
            "hidden_neurons": [n_timesteps, 32, 32, n_timesteps],
            "hidden_activation": "relu",
            "epochs": 20,
            "batch_norm": True,
            "learning_rate": 0.001,
            "batch_size": 32,
            "dropout_rate": 0.2,
        }
        if self.model_params:
            default_params.update(self.model_params)
        return AutoEncoder(**default_params)

    def fit(self, threshold_percentile=None):
        """
        Fit the model on the prepared slices and compute the score threshold.

        Parameters
        ----------
        threshold_percentile : float, optional
            Overrides the percentile given at construction for this fit.

        Raises
        ------
        ValueError
            If ``prepare()`` has not been called.
        """
        if self.input_tensor_scaled is None:
            raise ValueError("Call prepare() before fit().")
        if threshold_percentile is None:
            threshold_percentile = self.threshold_percentile

        n_samples = self.input_tensor_scaled.shape[0]
        X = self.input_tensor_scaled.reshape(n_samples, -1)  # flatten to 2D for sklearn-compatible model
        self.model = self._init_model()
        self.model.fit(X)

        self.anomaly_scores = self.model.decision_scores_
        self.threshold_scores = np.percentile(self.anomaly_scores, threshold_percentile)

    def get_anomaly_stats(self):
        """
        Return anomaly scores and labels per slice (1 row per slice_id).

        Returns
        -------
        pd.DataFrame
            A DataFrame with columns ['sn', slice_col, 'anomaly_score', 'is_outlier'].
            'sn' is the part of the slice id before the first underscore.

        Raises
        ------
        ValueError
            If ``fit()`` has not been called.
        """
        if self.anomaly_scores is None:
            raise ValueError("Call fit() first.")

        is_outlier = self.anomaly_scores > self.threshold_scores

        # Scores follow the tensor rows, i.e. groupby order. Using
        # drop_duplicates() (first-appearance order) here would attach scores
        # to the wrong slices whenever the input is not sorted by slice id.
        slice_ids = self.slice_ids
        if slice_ids is None:
            slice_ids = [key for key, _ in self.df_raw.groupby(self.slice_col)]

        result_df = pd.DataFrame({self.slice_col: list(slice_ids)})
        result_df["anomaly_score"] = self.anomaly_scores
        result_df["is_outlier"] = is_outlier
        result_df["sn"] = result_df[self.slice_col].apply(lambda x: str(x).split("_")[0])

        return result_df[["sn", self.slice_col, "anomaly_score", "is_outlier"]]

    def predict_score_and_label(self, input_array: np.ndarray, threshold: float = None):
        """
        Predict anomaly scores and labels for new data.

        Parameters
        ----------
        input_array : np.ndarray
            Array of shape (n_samples, n_timesteps) matching the model's input
            structure. A 1D array is treated as a single sample.
        threshold : float, optional
            Manual threshold for outlier decision. If None, uses trained `self.threshold_scores`.

        Returns
        -------
        dict
            {
                "anomaly_scores": np.ndarray,
                "is_outlier": np.ndarray (bool)
            }

        Raises
        ------
        ValueError
            If the model is not fitted, or no threshold is available.
        """
        if self.model is None:
            raise ValueError("Model is not trained. Call fit() first.")

        if threshold is None:
            if self.threshold_scores is None:
                raise ValueError("Threshold not defined. Either provide it or call fit() first.")
            threshold = self.threshold_scores

        input_array = np.asarray(input_array)
        if input_array.ndim == 1:
            input_array = input_array.reshape(1, -1)

        # Optionally scale input
        input_array = self._scale_new_samples(input_array)

        scores = self.model.decision_function(input_array)
        is_outlier = scores > threshold

        return {
            "anomaly_scores": scores,
            "is_outlier": is_outlier
        }

    def plot_anomaly_score_distribution(self, bins=30, sample_size=10000, random_state=42):
        """
        Plot the distribution of anomaly scores (with optional downsampling).

        Parameters
        ----------
        bins : int
            Number of histogram bins (default=30).
        sample_size : int
            Number of scores to sample for plotting. If the total number of scores is less than this, use all.
        random_state : int
            Seed for reproducible sampling.

        Raises
        ------
        ValueError
            If ``fit()`` has not been called.
        """
        if self.anomaly_scores is None:
            raise ValueError("Call fit() before plotting anomaly scores.")

        scores = self.anomaly_scores
        if len(scores) > sample_size:
            # A private RandomState draws the same sample as seeding the global
            # generator would, without resetting the caller's random state.
            rng = np.random.RandomState(random_state)
            scores = rng.choice(scores, size=sample_size, replace=False)

        plt.figure(figsize=(10, 5))
        plt.hist(scores, bins=bins, edgecolor='black', alpha=0.8)
        plt.axvline(self.threshold_scores, color='red', linestyle='--', label=f'Threshold = {self.threshold_scores:.4f}')
        plt.title(f"Anomaly Score Distribution (n={len(scores)} sample{'s' if len(scores) > 1 else ''})")
        plt.xlabel("Anomaly Score")
        plt.ylabel("Frequency")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

In [52]:
%pyspark

import numpy as np
import matplotlib.pyplot as plt

# Illustrate how the relative spread of distances to a reference point changes
# with dimensionality, using 1000 standard-normal points per dimension.
dims = [2, 10, 50, 100, 200, 500]
avg_ratios = []

for d in dims:
    points = np.random.randn(1000, d)
    ref = points[0]
    # Measure distances to the *other* points only: including the reference
    # itself adds a zero self-distance, which pins the minimum at 0 and turns
    # the ratio into max / mean instead of (max - min) / mean.
    dists = np.linalg.norm(points[1:] - ref, axis=1)
    ratio = (np.max(dists) - np.min(dists)) / np.mean(dists)
    avg_ratios.append(ratio)

plt.figure(figsize=(8, 5))
plt.plot(dims, avg_ratios, marker='o')
plt.title("Distance Contrast Vanishes in High Dimensions")
plt.xlabel("Dimensionality (d)")
plt.ylabel("(max - min) / mean distance")
plt.grid(True)
plt.show()


In [53]:
%pyspark
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Demo heatmap drawn from uniform random values in a 168 x 50 grid
# (rows are labelled "Hour of Week", columns "Feature Index").
errors = np.random.rand(168, 50)

plt.figure(figsize=(12, 6))
# heatmap() draws on the current axes and returns them, so the labels and
# title can be set in a single call on that axes object.
sns.heatmap(
    errors,
    cmap="YlOrRd",
    cbar_kws={'label': 'Numerical Value'},
).set(
    xlabel="Feature Index",
    ylabel="Hour of Week",
    title="Reconstruction Error Heatmap (1 Customer, 1 Week)",
)
plt.tight_layout()
plt.show()

