# Feature Engineering for Cryptocurrency Prediction
## Exploring Technical Indicators & Feature Creation

This notebook demonstrates the feature engineering techniques used to create predictive features from raw crypto price data.

In [None]:
import os
os.chdir('../')
%pwd

## 1. Load Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlProject import logger

# Plot appearance shared by every figure below:
# seaborn's white grid theme plus a wide default canvas.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (15, 6)})

## 2. Feature Engineering Component

### Technical Indicators Implemented:
1. **Moving Averages**: SMA-7, SMA-14, SMA-30, EMA-7, EMA-14
2. **MACD**: Moving Average Convergence Divergence
3. **RSI**: Relative Strength Index
4. **Bollinger Bands**: Upper, Middle, Lower bands
5. **Price Changes**: 1h, 24h, 7d returns
6. **Volume Indicators**: Volume SMA, Volume Ratio
7. **Volatility**: Price volatility measure
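8. **Price Position**: Relative position in 14-day range

> **Note:** The class defined in the next cell implements indicators 1–4 only. Items 5–8 appear as column names in `FEATURE_NAMES`, but their calculations are not part of this notebook.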

In [None]:
class FeatureEngineering:
    """Unified feature engineering for crypto prediction.

    Stateless helpers that turn a price history into technical-indicator
    values. Every ``calculate_*`` method expects ``prices`` to be a list of
    numbers ordered oldest to newest and returns the indicator for the most
    recent element only.
    """

    # Column names, in the order handed out by get_feature_names().
    # NOTE(review): the trailing ``target_*`` entries look like prediction
    # targets rather than model inputs -- confirm against the training code.
    FEATURE_NAMES = [
        'price', 'volume', 'market_cap', 'sma_7', 'sma_14', 'sma_30',
        'ema_7', 'ema_14', 'macd', 'macd_signal', 'macd_histogram',
        'rsi', 'bb_middle', 'bb_upper', 'bb_lower', 'price_change_1h',
        'price_change_24h', 'price_change_7d', 'volume_sma', 'volume_ratio',
        'volatility', 'high_14d', 'low_14d', 'price_position',
        'target_price_1h', 'target_price_24h', 'target_direction_5min',
        'target_direction_1h', 'target_direction_24h', 'target_change_5min',
        'target_change_1h', 'target_change_24h'
    ]

    @staticmethod
    def calculate_sma(prices, period):
        """Simple Moving Average of the last ``period`` prices.

        Args:
            prices: Price history, oldest first.
            period: Number of trailing prices to average.

        Returns:
            The mean of the last ``period`` prices. With fewer than ``period``
            prices the latest price is returned instead, or 0.0 when
            ``prices`` is empty.
        """
        if len(prices) < period:
            return prices[-1] if prices else 0.0
        return sum(prices[-period:]) / period

    @staticmethod
    def calculate_ema(prices, period):
        """Exponential Moving Average over the whole history.

        The recursion is seeded with the first price and then applied to every
        later price with smoothing factor ``2 / (period + 1)``.

        Args:
            prices: Price history, oldest first.
            period: Span that determines the smoothing factor.

        Returns:
            The EMA after the last price. With fewer than ``period`` prices
            the latest price is returned instead, or 0.0 when ``prices`` is
            empty.
        """
        if not prices or len(prices) < period:
            return prices[-1] if prices else 0.0

        alpha = 2 / (period + 1)
        ema = prices[0]
        for price in prices[1:]:
            ema = price * alpha + ema * (1 - alpha)
        return ema

    @staticmethod
    def calculate_macd(prices):
        """MACD line, signal line and histogram for the latest price.

        The MACD line is EMA-12 minus EMA-26 of the prices. The signal line is
        the 9-period EMA of the MACD line (computed with ``calculate_ema``),
        and the histogram is their difference.

        Args:
            prices: Price history, oldest first.

        Returns:
            Dict with keys ``'macd'``, ``'macd_signal'`` and
            ``'macd_histogram'``. All three are 0.0 when fewer than 26 prices
            are available. While fewer than 9 MACD values exist, the signal
            equals the MACD line and the histogram is 0.
        """
        fast_period, slow_period, signal_period = 12, 26, 9

        if len(prices) < slow_period:
            return {'macd': 0.0, 'macd_signal': 0.0, 'macd_histogram': 0.0}

        fast_alpha = 2 / (fast_period + 1)
        slow_alpha = 2 / (slow_period + 1)

        # Advance both EMAs together in one pass, using the same recursion and
        # seed as calculate_ema, and record the MACD line from the first index
        # at which this method reports a value (the 26th price).
        fast_ema = slow_ema = prices[0]
        macd_line = []
        for position, price in enumerate(prices):
            if position > 0:
                fast_ema = price * fast_alpha + fast_ema * (1 - fast_alpha)
                slow_ema = price * slow_alpha + slow_ema * (1 - slow_alpha)
            if position >= slow_period - 1:
                macd_line.append(fast_ema - slow_ema)

        macd = macd_line[-1]
        # A fixed fraction of the MACD line is not a signal line: the
        # histogram would always be 10% of MACD and crossovers could never
        # occur. Smooth the MACD history instead.
        macd_signal = FeatureEngineering.calculate_ema(macd_line, signal_period)
        macd_histogram = macd - macd_signal

        return {
            'macd': macd,
            'macd_signal': macd_signal,
            'macd_histogram': macd_histogram
        }

    @staticmethod
    def calculate_rsi(prices, period=14):
        """Relative Strength Index from simple averages of gains and losses.

        Args:
            prices: Price history, oldest first.
            period: Number of most recent price changes to average.

        Returns:
            RSI on a 0-100 scale. 50.0 (neutral) is returned when fewer than
            ``period + 1`` prices are available or when the window contains
            neither gains nor losses; 100.0 when it contains gains but no
            losses.
        """
        if len(prices) < period + 1:
            return 50.0

        gains, losses = [], []
        # Walk backwards from the newest price over the last `period` changes.
        for i in range(1, period + 1):
            change = prices[-i] - prices[-i-1]
            gains.append(max(0, change))
            losses.append(abs(min(0, change)))

        if not gains:
            # Only reachable with period <= 0: there is nothing to measure.
            return 50.0

        avg_gain = sum(gains) / len(gains)
        avg_loss = sum(losses) / len(losses)

        if avg_loss == 0:
            # No losses in the window: RS is unbounded, so RSI saturates at
            # 100 for an up-trend. A completely flat window stays neutral.
            return 100.0 if avg_gain > 0 else 50.0

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    @staticmethod
    def calculate_bollinger_bands(prices, period=20, std_dev=2):
        """Bollinger Bands around the ``period``-price moving average.

        Uses the population standard deviation (divides by ``period``).

        Args:
            prices: Price history, oldest first.
            period: Number of trailing prices in the window.
            std_dev: Band half-width in standard deviations.

        Returns:
            Dict with keys ``'bb_middle'``, ``'bb_upper'`` and ``'bb_lower'``.
            With fewer than ``period`` prices all three equal the latest price
            (0.0 when ``prices`` is empty).
        """
        if len(prices) < period:
            middle = prices[-1] if prices else 0.0
            return {'bb_middle': middle, 'bb_upper': middle, 'bb_lower': middle}

        window = prices[-period:]
        middle = sum(window) / period
        variance = sum((p - middle) ** 2 for p in window) / period
        std = variance ** 0.5

        return {
            'bb_middle': middle,
            'bb_upper': middle + (std_dev * std),
            'bb_lower': middle - (std_dev * std)
        }

    @staticmethod
    def get_feature_names():
        """Return the feature names in their canonical order.

        A new list is returned on every call so that callers who modify the
        result cannot alter ``FEATURE_NAMES`` for everyone else.
        """
        return list(FeatureEngineering.FEATURE_NAMES)

## 3. Load Sample Data

In [None]:
# Load crypto data produced by the ingestion step; fall back to synthetic
# hourly data so the rest of the notebook can still run without it.
try:
    df = pd.read_csv('artifacts/data_ingestion/crypto_data.csv')
    print(f"Loaded {len(df)} records")
    print(f"\nColumns: {list(df.columns)}")
    print("\nFirst few rows:")
    # A bare `df.head()` is only rendered when it is the last expression of a
    # cell; inside this try block it would be silently discarded.
    print(df.head())
except FileNotFoundError:
    print("Run data ingestion pipeline first to generate data")
    # Create sample data for demonstration: a seeded random walk for price,
    # with independent random volume and market-cap columns.
    np.random.seed(42)
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
    # recent pandas releases.
    dates = pd.date_range('2023-01-01', periods=1000, freq='h')
    df = pd.DataFrame({
        'price': 45000 + np.cumsum(np.random.randn(1000) * 100),
        'volume': np.random.randint(1000000, 5000000, 1000),
        'market_cap': np.random.randint(800000000, 900000000, 1000)
    })
    df['timestamp'] = dates
    print("Created sample data for demonstration")

## 4. Calculate Technical Indicators

In [None]:
# Pull the prices out as a plain list; the helpers slice it by position.
prices = df['price'].tolist()


def _on_each_prefix(indicator, *params):
    """Evaluate `indicator` on prices[:1], prices[:2], ... up to the full list.

    Each row's value is computed from that row and the rows before it only.
    """
    return [indicator(prices[:stop], *params) for stop in range(1, len(prices) + 1)]


# Simple moving averages over 7, 14 and 30 observations
sma_7 = _on_each_prefix(FeatureEngineering.calculate_sma, 7)
sma_14 = _on_each_prefix(FeatureEngineering.calculate_sma, 14)
sma_30 = _on_each_prefix(FeatureEngineering.calculate_sma, 30)
df['sma_7'], df['sma_14'], df['sma_30'] = sma_7, sma_14, sma_30

# Exponential moving averages
ema_7 = _on_each_prefix(FeatureEngineering.calculate_ema, 7)
ema_14 = _on_each_prefix(FeatureEngineering.calculate_ema, 14)
df['ema_7'], df['ema_14'] = ema_7, ema_14

# Relative Strength Index
rsi = _on_each_prefix(FeatureEngineering.calculate_rsi, 14)
df['rsi'] = rsi

# MACD: each call returns a dict, so unpack it into one column per key
macd_data = _on_each_prefix(FeatureEngineering.calculate_macd)
for key in ('macd', 'macd_signal', 'macd_histogram'):
    df[key] = [row[key] for row in macd_data]

# Bollinger Bands (20-observation window, 2 standard deviations)
bb_data = _on_each_prefix(FeatureEngineering.calculate_bollinger_bands, 20, 2)
for key in ('bb_upper', 'bb_middle', 'bb_lower'):
    df[key] = [row[key] for row in bb_data]

print("Technical indicators calculated!")
print(f"\nNew columns: {list(df.columns[-12:])}")

## 5. Visualize Technical Indicators

In [None]:
# Every panel shows the 200 most recent observations
recent = df.iloc[-200:]

fig, (ax_price, ax_rsi, ax_macd) = plt.subplots(3, 1, figsize=(15, 8))

# Panel 1: price overlaid with its simple moving averages
ax_price.plot(recent.index, recent['price'], label='Price', linewidth=2)
for column, label in (('sma_7', 'SMA-7'), ('sma_14', 'SMA-14'), ('sma_30', 'SMA-30')):
    ax_price.plot(recent.index, recent[column], label=label, alpha=0.7)
ax_price.set_title('Price with Moving Averages')
ax_price.set_ylabel('Price')
ax_price.legend()
ax_price.grid(True)

# Panel 2: RSI with the conventional 70 / 30 reference lines
ax_rsi.plot(recent.index, recent['rsi'], label='RSI', color='purple')
ax_rsi.axhline(y=70, color='r', linestyle='--', label='Overbought')
ax_rsi.axhline(y=30, color='g', linestyle='--', label='Oversold')
ax_rsi.set_title('Relative Strength Index (RSI)')
ax_rsi.set_ylabel('RSI')
ax_rsi.set_ylim(0, 100)
ax_rsi.legend()
ax_rsi.grid(True)

# Panel 3: MACD line, signal line and their difference as bars
ax_macd.plot(recent.index, recent['macd'], label='MACD', color='blue')
ax_macd.plot(recent.index, recent['macd_signal'], label='Signal', color='red')
ax_macd.bar(recent.index, recent['macd_histogram'], label='Histogram', alpha=0.3)
ax_macd.set_title('MACD (Moving Average Convergence Divergence)')
ax_macd.set_ylabel('MACD')
ax_macd.set_xlabel('Sample Index')
ax_macd.legend()
ax_macd.grid(True)

fig.tight_layout()
plt.show()

## 6. Bollinger Bands Visualization

In [None]:
# Same 200-observation window as the previous figure
sample_range = slice(-200, None)
window = df.iloc[sample_range]

fig, ax = plt.subplots(figsize=(15, 6))

# Price as a solid line, the three bands as dashed lines
ax.plot(window.index, window['price'], label='Price', linewidth=2, color='black')
band_styles = (
    ('bb_upper', 'Upper Band', 'red'),
    ('bb_middle', 'Middle Band', 'blue'),
    ('bb_lower', 'Lower Band', 'green'),
)
for column, label, colour in band_styles:
    ax.plot(window.index, window[column], label=label, linestyle='--', color=colour)

# Lightly shade the region between the outer bands
ax.fill_between(window.index, window['bb_upper'], window['bb_lower'], alpha=0.1)

ax.set_title('Bollinger Bands')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True)
plt.show()

## 7. Feature Correlation Analysis

In [None]:
# Columns whose pairwise correlation we want to inspect
tech_features = [
    'price',
    'sma_7', 'sma_14', 'sma_30',
    'ema_7', 'ema_14',
    'rsi',
    'macd', 'macd_signal',
    'bb_upper', 'bb_lower',
]

# Pairwise correlation between the selected columns
correlation_matrix = df[tech_features].corr()

# Annotated heatmap, colour scale centred on zero
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# List each strongly correlated pair once (upper triangle only)
print("\nHighly correlated features (|r| > 0.9):")
n_features = len(correlation_matrix)
for row in range(n_features):
    for col in range(row + 1, n_features):
        r_value = correlation_matrix.iloc[row, col]
        if abs(r_value) > 0.9:
            print(f"{tech_features[row]} <-> {tech_features[col]}: {r_value:.3f}")

## 8. Feature Importance Analysis

In [None]:
# Target: price change from each row to the next one (one hour ahead when the
# rows are hourly, as in the generated sample data). Recomputed from 'price'
# on every run, so re-running this cell is safe.
df['target'] = df['price'].shift(-1) - df['price']

# Prepare features
feature_cols = ['sma_7', 'sma_14', 'sma_30', 'ema_7', 'ema_14',
                'rsi', 'macd', 'macd_signal', 'macd_histogram',
                'bb_upper', 'bb_middle', 'bb_lower']

# Build the modelling frame separately instead of overwriting `df`:
# reassigning `df = df.dropna()` shrinks the shared frame by one more row on
# every re-run and also discards rows for gaps in unrelated columns.
model_df = df.dropna(subset=feature_cols + ['target'])

# The last 100 usable rows are left out of the fit.
X = model_df[feature_cols].iloc[:-100]
y = model_df['target'].iloc[:-100]

# Train simple model to get feature importance
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X, y)

# Rank the features by the forest's importance scores
importance_df = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Price Prediction')
# barh draws the first row at the bottom; flip so the top feature is on top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 5 Most Important Features:")
print(importance_df.head().to_string(index=False))

## 9. Key Insights

### Technical Indicators Explained:

1. **Moving Averages (SMA/EMA)**:
   - Smooth out price data to identify trends
   - EMA gives more weight to recent prices
   - Crossovers indicate potential buy/sell signals

2. **RSI (Relative Strength Index)**:
   - Measures momentum (0-100 scale)
   - >70: Overbought (potential sell signal)
   - <30: Oversold (potential buy signal)

3. **MACD**:
   - Shows the relationship between two EMAs of the price (12- and 26-period in this notebook)
   - Histogram shows divergence between MACD and signal
   - Crossovers indicate trend changes

4. **Bollinger Bands**:
   - Price typically stays within bands
   - Touching upper band: potential reversal down
   - Touching lower band: potential reversal up
   - Band width indicates volatility

### Feature Engineering Best Practices:
- **Consistency**: Use same calculations for training and prediction
- **No Future Data**: Only use past information
- **Scaling**: Normalize features before model training
- **Validation**: Check for feature drift in production