# Load data and imports

In [17]:
file_path = 'stock_data_for_emm.pkl'

In [18]:
import pickle
import pandas as pd
from scipy.stats import linregress
import plotly.graph_objects as go

In [19]:
with open(file_path, 'rb') as f:
    stock_data = pickle.load(f)

stock_data = pd.DataFrame(stock_data)





# Select single time series and plot it

In [20]:
time_series = stock_data.loc["SOG.PA", "target"]  # "ORO.F" --> can run different stocks
df_ts = pd.DataFrame({'window_index': range(1, len(time_series) + 1), 'value': time_series})


# Define plotting function for time series

def plot_percentage_growth(
    df, 
    x_column="window_index", 
    y_column="value", 
    x_name="Window", 
    y_name="Value", 
    percentage_growth=False, 
    slope=None, 
    intercept=None
    ):
    """
    Plots the time series value over the time series windows using Plotly.
    
    Parameters:
        df (pd.DataFrame): DataFrame containing window_index and value
    """
    fig = go.Figure()
    
    # Add percentage growth line
    fig.add_trace(go.Scatter(
        x=df[x_column],
        y=df[y_column],
        mode='lines+markers',
        name=y_name,
        line=dict(color='blue'),
        marker=dict(size=8)
    ))
    
    # Update layout and add titles
    fig.update_layout(
        title=f"Stock {y_name} Over {x_name}",
        xaxis_title=x_name,
        yaxis_title=y_name,
        template='plotly_white'
    )
    
    if percentage_growth:
        # Add horizontal line at y=0 for reference of growth percentage
        fig.add_trace(go.Scatter(
            x=df[x_column],
            y=[0]*len(df),
            mode='lines',
            name='No Growth',
            line=dict(color='black', dash='dash')
        ))
        
    if slope is not None and intercept is not None:
        fig.add_trace(go.Scatter(
            x=[df[x_column].min(), df[x_column].max()],
            y=[intercept + slope * df[x_column].min(), intercept + slope * df[x_column].max()],
            mode='lines',
            name='Trend Line',
            line=dict(color='red', dash='dash')
        ))
    
    
    
    # Show plot
    fig.show()
    
    return fig


fig = plot_percentage_growth(
    df_ts,
    x_name="Days",
    y_name="Price"
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# Create initial time series features (on original time series)

- Assume that this is just the same type of time series, but now with window size of 1 day. So then these features can be computed on any other time series, which was created by creating windows.

In [5]:
def compute_simple_features_og_ts(df):
    features = {}
    
    # Average value
    features['mean'] = round(df['value'].mean(), 5)
    # Median value
    features['median'] = round(df['value'].median(), 5)
    # Volatility in time series
    features['std_dev'] = round(df['value'].std(), 5)
    # Autocorrelation 1-lag: The correlation of the time series with its own lagged version by one time step.
    features['autocorr_lag1'] = round(df['value'].autocorr(lag=1), 5)
    # Min value
    features["min"] = round(df['value'].min(), 5)
    # Max value
    features["max"] = round(df['value'].max(), 5)
    # Value range
    features["range"] = round(features["max"] - features["min"], 5)
    # window_max
    features['window_min'] = df['window_index'][df['value'].idxmin()]
    # window_min
    features['window_max'] = df['window_index'][df['value'].idxmax()]
    # Trend slope
    slope, intercept, r_value, p_value, std_err = linregress(df['window_index'], df['value'])
    features['trend_slope'] = round(slope, 5)
    
    return features, df, slope, intercept


features, df_ts, slope, intercept = compute_simple_features_og_ts(df_ts)
features

{'mean': 25.52656,
 'median': 22.7,
 'std_dev': 10.41512,
 'autocorr_lag1': 0.95467,
 'min': 8.22,
 'max': 50.4,
 'range': 42.18,
 'window_min': 36,
 'window_max': 5,
 'trend_slope': -0.39594}

In [6]:
fig = plot_percentage_growth(
    df_ts,
    x_name="Days",
    y_name="Price",
    slope=slope,
    intercept=intercept
)

In [7]:
import pandas as pd

def compute_extra_statistics_og_ts(df):
    
    # Compute delta
    df['delta'] = df['value'].diff()
    
    # Define direction
    df['direction'] = df['delta'].apply(
        lambda x: 'increase' if x > 0 else ('decrease' if x < 0 else 'no_change')
    )
    
    # ---------------------------
    # Longest and Biggest Increase
    # ---------------------------
    
    # Mask for increasing windows
    df['is_increase'] = df['delta'] > 0
    
    # Identify streaks by cumulatively summing when the streak breaks
    df['increase_group'] = (df['is_increase'] != df['is_increase'].shift()).cumsum()
    
    # Filter only increase groups
    increase_streaks = df[df['is_increase']].groupby('increase_group')
    
    # Compute the length, sum, start_window_index, and end_window_index of each increase streak
    if increase_streaks.ngroups > 0:
        increase_stats = increase_streaks.agg(
            count=('delta', 'count'),
            sum=('delta', 'sum'),
            start_window_index=('window_index', 'min'),
            end_window_index=('window_index', 'max')
        )
        
        # Longest Continuous Increase
        longest_incr_idx = increase_stats['count'].idxmax()
        longest_continuous_increase = {
            "length": int(increase_stats.loc[longest_incr_idx, 'count']),
            "start_window_index": int(increase_stats.loc[longest_incr_idx, 'start_window_index']),
            "end_window_index": int(increase_stats.loc[longest_incr_idx, 'end_window_index'])
        }
        
        # Biggest Continuous Increase
        biggest_incr_idx = increase_stats['sum'].idxmax()
        biggest_continuous_increase = {
            "sum": round(increase_stats.loc[biggest_incr_idx, 'sum'], 5),
            "start_window_index": int(increase_stats.loc[biggest_incr_idx, 'start_window_index']),
            "end_window_index": int(increase_stats.loc[biggest_incr_idx, 'end_window_index'])
        }
    else:
        longest_continuous_increase = {
            "length": 0,
            "start_window_index": None,
            "end_window_index": None
        }
        biggest_continuous_increase = {
            "sum": 0.0,
            "start_window_index": None,
            "end_window_index": None
        }
    
    # ---------------------------
    # Longest and Biggest Decrease
    # ---------------------------
    
    # Mask for decreasing windows
    df['is_decrease'] = df['delta'] < 0
    
    # Identify streaks by cumulatively summing when the streak breaks
    df['decrease_group'] = (df['is_decrease'] != df['is_decrease'].shift()).cumsum()
    
    # Filter only decrease groups
    decrease_streaks = df[df['is_decrease']].groupby('decrease_group')
    
    # Compute the length, sum, start_window_index, and end_window_index of each decrease streak
    if decrease_streaks.ngroups > 0:
        decrease_stats = decrease_streaks.agg(
            count=('delta', 'count'),
            sum=('delta', 'sum'),
            start_window_index=('window_index', 'min'),
            end_window_index=('window_index', 'max')
        )
        
        # Longest Continuous Decrease
        longest_decr_idx = decrease_stats['count'].idxmax()
        longest_continuous_decrease = {
            "length": int(decrease_stats.loc[longest_decr_idx, 'count']),
            "start_window_index": int(decrease_stats.loc[longest_decr_idx, 'start_window_index']),
            "end_window_index": int(decrease_stats.loc[longest_decr_idx, 'end_window_index'])
        }
        
        # Biggest Continuous Decrease (use absolute value)
        biggest_decr_idx = decrease_stats['sum'].idxmin()  # Most negative sum
        biggest_continuous_decrease = {
            "sum": round(abs(decrease_stats.loc[biggest_decr_idx, 'sum']), 5) * -1,
            "start_window_index": int(decrease_stats.loc[biggest_decr_idx, 'start_window_index']),
            "end_window_index": int(decrease_stats.loc[biggest_decr_idx, 'end_window_index'])
        }
    else:
        longest_continuous_decrease = {
            "length": 0,
            "start_window_index": None,
            "end_window_index": None
        }
        biggest_continuous_decrease = {
            "sum": 0.0,
            "start_window_index": None,
            "end_window_index": None
        }
    
    # Compile the results into a dictionary
    statistics = {
        "longest_continuous_increase": longest_continuous_increase,
        "biggest_continuous_increase": biggest_continuous_increase,
        "longest_continuous_decrease": longest_continuous_decrease,
        "biggest_continuous_decrease": biggest_continuous_decrease
    }
    
    return statistics


compute_extra_statistics_og_ts(df_ts)

# longest_continuous_increase = number of consecutive days with increasing values
# biggest_continuous_increase = biggest summed increase in consecutive days, price value not number of days
# longest_continuous_decrease = number of consecutive days with decreasing values
# biggest_continuous_decrease = biggest summed decrease in consecutive days, price value not number of days

{'longest_continuous_increase': {'length': 5,
  'start_window_index': 47,
  'end_window_index': 51},
 'biggest_continuous_increase': {'sum': 12.98,
  'start_window_index': 37,
  'end_window_index': 38},
 'longest_continuous_decrease': {'length': 6,
  'start_window_index': 31,
  'end_window_index': 36},
 'biggest_continuous_decrease': {'sum': -20.05,
  'start_window_index': 11,
  'end_window_index': 13}}

# Convert to growth percentage time series

In [8]:
def convert_to_percentage_growth(df):
    """
    Converts a DataFrame with value data to include percentage growth.

    Parameters:
        df (pd.DataFrame): DataFrame containing 'window_index' and 'value' columns.

    Returns:
        pd.DataFrame: DataFrame containing 'window_index', 'value', and 'percentage_growth'.
    """
    # Create a copy to avoid modifying the original DataFrame
    df = df.copy()
    
    # Compute the difference between consecutive windows
    df['delta'] = df['value'].diff()
    
    # Calculate the percentage growth
    df['percentage_growth'] = (df['delta'] / df['value'].shift(1)) * 100
    
    # Replace NaN for the first window_index with 0% using .loc
    df.loc[df.index[0], 'percentage_growth'] = 0.0
    
    # Round the percentage growth to two decimal places
    df['percentage_growth'] = df['percentage_growth'].round(5)
    
    return df[['window_index', 'value', 'percentage_growth']]


df_ts_growth_perc = convert_to_percentage_growth(df_ts)

In [9]:
fig = plot_percentage_growth(
    df_ts_growth_perc, 
    y_column="percentage_growth", 
    x_name="Days", 
    y_name="Growth Percentage (%)", 
    percentage_growth=True
)

# Create extra time series features (on growth percentage time series)

In [10]:
def compute_simple_features_growth_perc_ts(df):
    features = {}
    
    # Average value
    features['mean'] = round(df['percentage_growth'].mean(), 5)
    # Median value
    features['median'] = round(df['percentage_growth'].median(), 5)
    # Volatility in time series
    features['std_dev'] = round(df['percentage_growth'].std(), 5)
    # Autocorrelation 1-lag: The correlation of the time series with its own lagged version by one time step.
    features['autocorr_lag1'] = round(df['value'].autocorr(lag=1), 5)
    # Min value
    features["min"] = round(df['percentage_growth'].min(), 5)
    # Max value
    features["max"] = round(df['percentage_growth'].max(), 5)
    # Value range
    features["range"] = round(features["max"] - features["min"], 5)
    # window_max
    features['window_min'] = df['window_index'][df['percentage_growth'].idxmin()]
    # window_min
    features['window_max'] = df['window_index'][df['percentage_growth'].idxmax()]
    # Trend slope
    slope, intercept, r_value, p_value, std_err = linregress(df['window_index'], df['percentage_growth'])
    features['trend_slope'] = round(slope, 5)
    
    return features, df, slope, intercept


features, df_ts_growth_perc, slope, intercept = compute_simple_features_growth_perc_ts(df_ts_growth_perc)
features

{'mean': -0.43459,
 'median': 0.0,
 'std_dev': 15.83299,
 'autocorr_lag1': 0.95467,
 'min': -34.64567,
 'max': 64.23357,
 'range': 98.87924,
 'window_min': 29,
 'window_max': 37,
 'trend_slope': 0.07065}

In [11]:
fig = plot_percentage_growth(
    df_ts_growth_perc, 
    y_column="percentage_growth", 
    x_name="Days", 
    y_name="Growth Percentage (%)", 
    percentage_growth=True, 
    slope=slope,
    intercept=intercept
)

In [12]:
def compute_extra_statistics_growth_perc_ts(df):
    """
    Computes the biggest continuous increase and decrease in percentage growth.
    
    Parameters:
        df (pd.DataFrame): DataFrame containing 'window_index' and 'percentage_growth'.
    
    Returns:
        dict: Dictionary containing statistics for the biggest continuous increase and decrease.
    """

    # Ensure required columns exist
    required_columns = {'window_index', 'percentage_growth'}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"The DataFrame must contain the columns: {required_columns}")
    
    # Create a copy to avoid modifying the original DataFrame
    df = df.copy()
    
    # ---------------------------
    # Biggest Continuous Increase Percentage
    # ---------------------------
    
    # Mask for windows with positive percentage growth
    df['is_increase'] = df['percentage_growth'] > 0
    
    # Identify increase streaks by cumulatively summing when the streak breaks
    df['increase_group'] = (df['is_increase'] != df['is_increase'].shift()).cumsum()
    
    # Filter only increase groups
    increase_streaks = df[df['is_increase']].groupby('increase_group')
    
    # Check if there are any increase streaks
    if increase_streaks.ngroups > 0:
        # Aggregate to find sum_percentage_growth, start_window_index, end_window_index for each streak
        increase_stats = increase_streaks.agg(
            sum_percentage_growth=('percentage_growth', 'sum'),
            start_window_index=('window_index', 'min'),
            end_window_index=('window_index', 'max')
        )
        
        # Find the streak with the maximum sum_percentage_growth
        biggest_incr_idx = increase_stats['sum_percentage_growth'].idxmax()
        biggest_continuous_increase_perc = {
            "sum_percentage_growth": round(increase_stats.loc[biggest_incr_idx, 'sum_percentage_growth'], 5),
            "start_window_index": int(increase_stats.loc[biggest_incr_idx, 'start_window_index']),
            "end_window_index": int(increase_stats.loc[biggest_incr_idx, 'end_window_index'])
        }
    else:
        biggest_continuous_increase_perc = {
            "sum_percentage_growth": 0.0,
            "start_window_index": None,
            "end_window_index": None
        }
    
    # ---------------------------
    # Biggest Continuous Decrease Percentage
    # ---------------------------
    
    # Mask for windows with negative percentage growth
    df['is_decrease'] = df['percentage_growth'] < 0
    
    # Identify decrease streaks by cumulatively summing when the streak breaks
    df['decrease_group'] = (df['is_decrease'] != df['is_decrease'].shift()).cumsum()
    
    # Filter only decrease groups
    decrease_streaks = df[df['is_decrease']].groupby('decrease_group')
    
    # Check if there are any decrease streaks
    if decrease_streaks.ngroups > 0:
        # Aggregate to find sum_percentage_growth, start_window_index, end_window_index for each streak
        decrease_stats = decrease_streaks.agg(
            sum_percentage_growth=('percentage_growth', 'sum'),
            start_window_index=('window_index', 'min'),
            end_window_index=('window_index', 'max')
        )
        
        # Find the streak with the minimum sum_percentage_growth (most negative)
        biggest_decr_idx = decrease_stats['sum_percentage_growth'].idxmin()
        biggest_continuous_decrease_perc = {
            "sum_percentage_growth": round(abs(decrease_stats.loc[biggest_decr_idx, 'sum_percentage_growth']), 5) * -1,
            "start_window_index": int(decrease_stats.loc[biggest_decr_idx, 'start_window_index']),
            "end_window_index": int(decrease_stats.loc[biggest_decr_idx, 'end_window_index'])
        }
    else:
        biggest_continuous_decrease_perc = {
            "sum_percentage_growth": 0.0,
            "start_window_index": None,
            "end_window_index": None
        }
    
    # Compile the results into a dictionary
    statistics = {
        "biggest_continuous_increase_perc": biggest_continuous_increase_perc,
        "biggest_continuous_decrease_perc": biggest_continuous_decrease_perc
    }
    
    return statistics


compute_extra_statistics_growth_perc_ts(df_ts_growth_perc)

# biggest_continuous_increase_perc = biggest summed increase in consecutive days, price value percentage not number of days
# biggest_continuous_decrease_perc = biggest summed decrease in consecutive days, price value percentage not number of days

{'biggest_continuous_increase_perc': {'sum_percentage_growth': 121.27061,
  'start_window_index': 37,
  'end_window_index': 38},
 'biggest_continuous_decrease_perc': {'sum_percentage_growth': -78.01069,
  'start_window_index': 31,
  'end_window_index': 36}}

# Combine all features in a single function:

In [13]:
def compute_all_features_for_timeseries(time_series, unpack_advanced_features=True):
    """
    Computes features for a given time series and its percentage growth version.

    Parameters:
        time_series (list): A list of numerical values representing the time series.

    Returns:
        tuple: 
            - dict: Contains features for both the original and percentage growth time series. All features are rounded to at most 5 decimals.
            - pd.DataFrame: The growth percentage DataFrame with 'window_index', 'value', and 'percentage_growth' columns.
    """
    df_ts = pd.DataFrame({'window_index': range(1, len(time_series) + 1), 'value': time_series})
    
    # Compute features on original time series
    basic_features, _, _, _ = compute_simple_features_og_ts(df_ts)
    advanced_features = compute_extra_statistics_og_ts(df_ts)

    if unpack_advanced_features:
        advanced_features = {
            'longest_continuous_increase': advanced_features['longest_continuous_increase']['length'],
            'biggest_continuous_increase': advanced_features['biggest_continuous_increase']['sum'],
            'longest_continuous_decrease': advanced_features['longest_continuous_decrease']['length'],
            'biggest_continuous_decrease': advanced_features['biggest_continuous_decrease']['sum']
        }

    # Convert original dataset to the growth percentage time series
    df_ts_growth_perc = convert_to_percentage_growth(df_ts)
    
    # Compute features on growth percentage time series
    basic_features_growth_perc, _, _, _ = compute_simple_features_growth_perc_ts(df_ts_growth_perc)
    advanced_features_growth_perc = compute_extra_statistics_growth_perc_ts(df_ts_growth_perc)
    
    if unpack_advanced_features:
        advanced_features_growth_perc = {
            'biggest_continuous_increase_perc': advanced_features_growth_perc['biggest_continuous_increase_perc']['sum_percentage_growth'],
            'biggest_continuous_decrease_perc': advanced_features_growth_perc['biggest_continuous_decrease_perc']['sum_percentage_growth']
        }
    
    # Compute features
    features_total_hierarchical = {}
    features_total_hierarchical["og_time_series"] = {**basic_features, **advanced_features}
    features_total_hierarchical["growth_perc_time_series"] = {**basic_features_growth_perc, **advanced_features_growth_perc}
    
    return features_total_hierarchical, df_ts_growth_perc


features, final_ts_df = compute_all_features_for_timeseries(time_series)

In [14]:
final_ts_df

Unnamed: 0,window_index,value,percentage_growth
0,1,47.200001,0.00000
1,2,47.750000,1.16525
2,3,48.900002,2.40838
3,4,49.099998,0.40899
4,5,50.400002,2.64766
...,...,...,...
56,57,21.600000,6.93069
57,58,23.000000,6.48148
58,59,16.000000,-30.43478
59,60,16.700001,4.37500


In [15]:
features

{'og_time_series': {'mean': 25.52656,
  'median': 22.7,
  'std_dev': 10.41512,
  'autocorr_lag1': 0.95467,
  'min': 8.22,
  'max': 50.4,
  'range': 42.18,
  'window_min': 36,
  'window_max': 5,
  'trend_slope': -0.39594,
  'longest_continuous_increase': 5,
  'biggest_continuous_increase': 12.98,
  'longest_continuous_decrease': 6,
  'biggest_continuous_decrease': -20.05},
 'growth_perc_time_series': {'mean': -0.43459,
  'median': 0.0,
  'std_dev': 15.83299,
  'autocorr_lag1': 0.95467,
  'min': -34.64567,
  'max': 64.23357,
  'range': 98.87924,
  'window_min': 29,
  'window_max': 37,
  'trend_slope': 0.07065,
  'biggest_continuous_increase_perc': 121.27061,
  'biggest_continuous_decrease_perc': -78.01069}}

<font color='red'>Question: Do we want to use start and end windows of the biggest and longest continues increases and decreases. It could recognize situations where companies responded similarly to similar market influencing events. Change unpack advanced features parameter to show these values.</font>

- In that case create these as new features

# Time Series Features Overview

### `og_time_series` Features
1. **Mean**: Provides a baseline value to compare the overall level of different time series.
2. **Median**: Helps to understand the central tendency and identify skewness when compared to the mean.
3. **Standard Deviation**: Indicates the variability of the time series, providing insight into its volatility.
4. **Autocorrelation Lag 1**: Measures the persistence of the values between consecutive time windows, which helps identify if a time series has a strong temporal dependency.
5. **Minimum**: Captures the lowest observed value, useful for assessing the possible downside in the time series.
6. **Maximum**: Captures the highest observed value, indicating the peak potential of the series.
7. **Range**: Shows the spread of values, offering a measure of overall variability.
8. **Window Min**: Indicates the window interval where the lowest value was observed, adding temporal context to the minimum value.
9. **Window Max**: Indicates the window interval where the highest value occurred, providing context to the maximum value.
10. **Trend Slope**: Shows the direction and rate of change, revealing the overall trend of the series over time.
11. **Longest Continuous Increase**: Describes the resilience of positive momentum, reflecting potential stability during an increasing trend.
12. **Biggest Continuous Increase**: Indicates the magnitude of value growth during consecutive time windows, helpful in assessing peak growth phases.
13. **Longest Continuous Decrease**: Indicates how long a decline trend persists, providing insight into negative resilience.
14. **Biggest Continuous Decrease**: Quantifies the maximum loss during consecutive time windows, useful for understanding risk exposure.

### `growth_perc_time_series` Features
1. **Mean**: Reflects the average rate of change, indicating whether the time series is growing or declining on average.
2. **Median**: Helps identify whether the growth is balanced around zero or is skewed, giving insight into typical performance.
3. **Standard Deviation**: Shows the volatility in growth percentages, providing a measure of the consistency of changes.
4. **Autocorrelation Lag 1**: Helps identify how growth in one time window relates to subsequent windows, revealing stability or volatility in growth rates.
5. **Minimum**: Captures the worst period of percentage decline, helping to assess risk in growth trends.
6. **Maximum**: Indicates the best period of percentage growth, providing insight into potential gains.
7. **Range**: Shows the spread between highest and lowest growth, giving a sense of the volatility in percentage terms.
8. **Window Min**: Indicates the time interval where the worst percentage decline occurred, providing temporal context to risk.
9. **Window Max**: Indicates the time interval where the best percentage growth occurred, offering context to growth peaks.
10. **Trend Slope**: Reflects the long-term trend of percentage change, revealing the growth trajectory over time.
11. **Biggest Continuous Increase Percentage**: Shows the total percentage gain during the biggest continuous growth, which highlights high-momentum phases.
12. **Biggest Continuous Decrease Percentage**: Shows the total percentage loss during the biggest continuous decline, providing a measure of downside risk in percentage terms.

These features collectively help understand the behavior, trends, and risks of time series, which is crucial for effectively comparing the similarity between different time series.


# Research literature for more interesting aggregations of time series data

<font color='red'>Continue here with more informative time series aggregated features.</font>