# Feature Engineering

Transform raw stock prices into risk features for clustering.

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../src')

from features import (
    calculate_returns,
    calculate_volatility_features,
    calculate_risk_metrics,
    calculate_technical_indicators,
    calculate_liquidity_features,
    calculate_momentum_features,
    calculate_drawdown,
    aggregate_stock_features
)

## Load Data

In [3]:
df = pd.read_csv('../Data/Processed/cleaned_nse.csv')
print(f"Loaded {len(df):,} rows for {df['Stock_code'].nunique()} stocks")

df.head(3)

Loaded 69,754 rows for 75 stocks


Unnamed: 0,Date,Stock_code,Name,12m Low,12m High,Day Low,Day High,Day Price,Previous,Change,%Change,Volume,Adjusted Price,Sector,Month,Year
0,2021-01-04,ABSA,ABSA Bank Kenya Plc,8.5,14.2,9.42,9.8,9.52,9.66,-0.14,1.45,18500.0,0.0,Banking,1,2021
1,2021-01-05,ABSA,ABSA Bank Kenya Plc,8.5,14.2,9.44,9.7,9.44,9.52,-0.08,0.84,1923300.0,0.0,Banking,1,2021
2,2021-01-06,ABSA,ABSA Bank Kenya Plc,8.5,14.2,9.4,9.68,9.44,9.44,0.0,0.0,233400.0,0.0,Banking,1,2021


## Step 1: Returns

In [4]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_returns)

print(f" Added: daily_return")


 Added: daily_return


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_returns)


## Step 2: Volatility

In [5]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_volatility_features)
print(f"Added: volatility_7d, volatility_14d, volatility_30d")


Added: volatility_7d, volatility_14d, volatility_30d


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_volatility_features)


## Step 3: Risk Metrics

In [6]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_risk_metrics)
print(f"Added: downside_deviation_30d, var_95")

Added: downside_deviation_30d, var_95


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_risk_metrics)


## Step 4: Technical Indicators

In [7]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_technical_indicators)
print(f"Added: rsi, bb_width, macd")

Added: rsi, bb_width, macd


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_technical_indicators)


## Step 5: Liquidity

In [8]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_liquidity_features)
print(f"Added: avg_volume, volume_volatility, amihud_illiquidity")

Added: avg_volume, volume_volatility, amihud_illiquidity


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_liquidity_features)


## Step 6: Momentum

In [9]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_momentum_features)
print(f"Added: momentum_7d, momentum_30d, momentum_90d, ma_7, ma_30, ma_50")

Added: momentum_7d, momentum_30d, momentum_90d, ma_7, ma_30, ma_50


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_momentum_features)


## Step 7: Drawdown

In [10]:
df = df.groupby('Stock_code', group_keys=False).apply(calculate_drawdown)
print(f"Added: current_drawdown, max_drawdown")

Added: current_drawdown, max_drawdown


  df = df.groupby('Stock_code', group_keys=False).apply(calculate_drawdown)


## Step 8: Aggregate to Stock Level

In [11]:
print("\nAggregating to stock level")
print(f"Before: {len(df):,} rows")

features_list = []
for stock_code, group in df.groupby('Stock_code'):
    stock_features = aggregate_stock_features(group)
    if stock_features is not None:
        features_list.append(stock_features)

df_features = pd.DataFrame(features_list)
print(f"After: {len(df_features)} stocks with {len(df_features.columns)} features")
df_features.head()


Aggregating to stock level
Before: 69,754 rows
After: 57 stocks with 26 features


Unnamed: 0,Stock_code,Sector,Name,volatility_mean,volatility_max,volatility_7d,downside_deviation,var_95,max_drawdown,mean_return,...,bb_width_mean,macd_volatility,avg_volume,volume_volatility,amihud_illiquidity,trading_frequency,momentum_30d,momentum_90d,trend_strength,current_price
0,ABSA,Banking,ABSA Bank Kenya Plc,0.012541,0.028603,0.011135,0.00783,-0.018083,-0.23622,0.000743,...,2.296415,0.17601,457918.016194,1206789.0,3.464928e-09,1.0,0.160772,0.280142,0.166774,18.05
1,BAMB,Construction and Allied,Bamburi Cement Ltd,0.024157,0.072992,0.021978,0.013723,-0.031429,-0.514563,0.000901,...,3.867892,1.28215,233655.667351,4414206.0,1.904103e-08,0.98583,-0.17603,-0.126984,-0.094158,55.0
2,BAT,Manufacturing and Allied,British American Tobacco Kenya Plc,0.012213,0.024126,0.010576,0.007299,-0.01628,-0.348232,0.000126,...,1.75241,5.776191,19173.611111,56489.45,1.622817e-09,0.947368,0.059155,0.082014,0.066576,376.0
3,BKG,Banking,BK Group Plc,0.028502,0.054462,0.025778,0.017173,-0.044569,-0.3925,0.001923,...,3.561992,0.57605,86221.114865,545481.1,0.0,0.59919,-0.01214,-0.013636,-0.028358,32.55
4,BOC,Manufacturing and Allied,BOC Kenya Plc,0.026053,0.052721,0.021544,0.015937,-0.035582,-0.302949,0.001758,...,3.670978,1.450247,10246.268657,75668.27,0.0,0.406883,-0.002809,0.053412,0.042829,88.75


## Inspect Features

In [13]:
print("Feature Summary:")
print(" ")
df_features.info()

Feature Summary:
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Stock_code          57 non-null     object 
 1   Sector              57 non-null     object 
 2   Name                57 non-null     object 
 3   volatility_mean     57 non-null     float64
 4   volatility_max      57 non-null     float64
 5   volatility_7d       57 non-null     float64
 6   downside_deviation  57 non-null     float64
 7   var_95              57 non-null     float64
 8   max_drawdown        57 non-null     float64
 9   mean_return         57 non-null     float64
 10  std_return          57 non-null     float64
 11  return_skew         57 non-null     float64
 12  return_kurtosis     57 non-null     float64
 13  return_consistency  57 non-null     float64
 14  sharpe_ratio        57 non-null     float64
 15  rsi_mean            57 non-null     floa

In [14]:
print("\nKey Stats:")
df_features[['volatility_mean', 'sharpe_ratio', 'max_drawdown', 'rsi_mean']].describe().round(4)


Key Stats:


Unnamed: 0,volatility_mean,sharpe_ratio,max_drawdown,rsi_mean
count,57.0,57.0,57.0,57.0
mean,0.0263,0.0329,-0.4781,47.6902
std,0.0098,0.12,0.1788,7.3268
min,0.0046,-0.0353,-0.8555,8.8447
25%,0.0197,0.0019,-0.6036,47.777
50%,0.0261,0.0137,-0.4833,48.8951
75%,0.035,0.0314,-0.3482,49.9527
max,0.0463,0.8866,-0.0698,54.1655


## Save

In [15]:
output_path = '../Data/Processed/nse_features.csv'
df_features.to_csv(output_path, index=False)

print(f"   {len(df_features)} stocks × {len(df_features.columns)} features")

   57 stocks × 26 features
