In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Step 1: Read the multi-ticker CSV and keep one symbol in chronological order
print("=" * 60, "LOADING DATA", "=" * 60, sep="\n")

ticker = 'AAL'
df = pd.read_csv("/content/SCOA_A5.csv")
# .copy() detaches the filtered rows so later column assignments do not
# write into a slice of the full frame.
df = df.loc[df['Name'] == ticker].copy()

# Parse the date column, sort oldest-first and renumber the index from 0
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

first_day, last_day = df['date'].min(), df['date'].max()
price_low, price_high = df['close'].min(), df['close'].max()
print(f"Loaded {len(df)} records for {ticker}")
print(f"Date range: {first_day} to {last_day}")
print(f"Price range: ${price_low:.2f} - ${price_high:.2f}")

# Step 2: Derive price, moving-average and EMA/MACD columns from raw OHLC data
print("\n" + "=" * 60, "FEATURE ENGINEERING", "=" * 60, sep="\n")

# Price-based features: close-to-close return, then intraday measures
intraday_span = df['high'] - df['low']
df['price_change'] = df['close'].pct_change()
df['high_low_range'] = intraday_span / df['close']
df['open_close_change'] = (df['close'] - df['open']) / df['open']
df['day_range'] = intraday_span

# Simple moving averages of the close; the first (window - 1) rows are NaN
for ma_window in (5, 10, 20):
    df[f'ma_{ma_window}'] = df['close'].rolling(window=ma_window).mean()
df['ma_ratio_5_20'] = df['ma_5'] / df['ma_20']

# Exponential moving averages and their difference (MACD line)
for ema_span in (12, 26):
    df[f'ema_{ema_span}'] = df['close'].ewm(span=ema_span, adjust=False).mean()
df['macd'] = df['ema_12'] - df['ema_26']

# Relative Strength Index (RSI)
def compute_rsi(series: pd.Series, period: int = 14) -> pd.Series:
    """Return the Relative Strength Index of ``series``.

    Gains and losses are averaged with a simple rolling mean over ``period``
    consecutive changes (not Wilder's exponential smoothing).

    Args:
        series: Price series in chronological order.
        period: Number of price changes in each averaging window.

    Returns:
        A series aligned with ``series``. The first ``period`` entries are
        NaN because a full window of changes is not yet available. A window
        with gains but no losses yields 100; a window with no movement at
        all yields NaN (0 / 0).
    """
    delta = series.diff()
    # clip() leaves NaN untouched, so the undefined first change (and any
    # change next to a missing price) is not counted as a zero move. The
    # rolling mean therefore stays NaN until `period` real changes exist.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = (-delta).clip(lower=0).rolling(window=period).mean()
    # loss == 0 with gain > 0 gives rs = inf, which the formula maps to 100.
    rs = gain / loss
    return 100 - (100 / (1 + rs))

df['rsi'] = compute_rsi(df['close'])
# (rsi - 50) / 50 re-centres the 0..100 RSI scale onto -1..1
df['rsi_normalized'] = (df['rsi'] - 50) / 50

# Volume features
# NOTE: pct_change() yields +/-inf on the day after a zero-volume session;
# those values are neutralised in the cleaning step below.
df['volume_change'] = df['volume'].pct_change()
df['volume_ma_5'] = df['volume'].rolling(window=5).mean()
df['volume_ratio'] = df['volume'] / df['volume_ma_5']

# Volatility features: rolling standard deviation of the close
df['volatility_10'] = df['close'].rolling(window=10).std()
df['volatility_20'] = df['close'].rolling(window=20).std()

# Momentum features: absolute price change over 5 and 10 rows
df['momentum_5'] = df['close'] - df['close'].shift(5)
df['momentum_10'] = df['close'] - df['close'].shift(10)

# Price position relative to high/low (0 = closed at the low, 1 = at the high).
# A day with high == low produces 0/0 = NaN and is dropped below.
df['price_position'] = (df['close'] - df['low']) / (df['high'] - df['low'])

# Lagged features (previous days)
df['close_lag_1'] = df['close'].shift(1)
df['close_lag_2'] = df['close'].shift(2)
df['close_lag_3'] = df['close'].shift(3)

# Target: Next day's closing price (the final row has no next day -> NaN)
df['Target'] = df['close'].shift(-1)

# Drop incomplete rows. Infinities from divide-by-zero ratios are converted
# to NaN first: dropna() alone would keep them and MinMaxScaler would later
# reject the matrix with "Input contains infinity".
df_clean = df.replace([np.inf, -np.inf], np.nan).dropna().copy()
print(f"After feature engineering: {len(df_clean)} records")

# Step 3: Choose the model inputs and pull them out as NumPy arrays.
# The concatenation order below fixes the column order of X.
feature_cols = (
    ['open', 'high', 'low', 'close', 'volume']
    + ['price_change', 'high_low_range', 'open_close_change', 'day_range']
    + ['ma_5', 'ma_10', 'ma_20', 'ma_ratio_5_20']
    + ['ema_12', 'ema_26', 'macd']
    + ['rsi', 'rsi_normalized']
    + ['volume_change', 'volume_ma_5', 'volume_ratio']
    + ['volatility_10', 'volatility_20']
    + ['momentum_5', 'momentum_10']
    + ['price_position']
    + ['close_lag_1', 'close_lag_2', 'close_lag_3']
)

X = df_clean[feature_cols].to_numpy()
y = df_clean['Target'].to_numpy()  # next day's actual closing price

target_mean, target_std = y.mean(), y.std()
print(f"\nTotal features: {len(feature_cols)}")
print(f"Target statistics:")
print(f"  Mean price: ${target_mean:.2f}")
print(f"  Std dev: ${target_std:.2f}")

# Step 4 + 5: Chronological train/test split, then scaling.
#
# The split position is computed first so that both scalers can be fitted on
# the training rows only. Fitting on the full dataset would let the test
# period's min/max leak into the training inputs and targets.
test_size = 0.2
split_idx = int(len(X) * (1 - test_size))

# Separate scalers for features and target so predictions can be mapped back
# to dollar prices with scaler_y.inverse_transform.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
scaler_X.fit(X[:split_idx])
scaler_y.fit(y[:split_idx].reshape(-1, 1))

# Apply the training-period scaling to every row. Test rows may fall outside
# [0, 1] when prices leave the range seen during training.
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y.reshape(-1, 1)).flatten()

# Time-series split: no shuffling, the most recent rows form the test set
X_train = X_scaled[:split_idx]
X_test = X_scaled[split_idx:]
y_train = y_scaled[:split_idx]
y_test = y_scaled[split_idx:]

# Keep original prices for evaluation
y_train_original = y[:split_idx]
y_test_original = y[split_idx:]

print(f"\nTrain samples: {len(X_train)} | Test samples: {len(X_test)}")

# Step 6: Assemble and compile the feed-forward regression network
print("\n" + "=" * 60, "BUILDING REGRESSION MODEL", "=" * 60, sep="\n")

# One row per ReLU block: (units, layer name, add BatchNormalization?, dropout rate)
layer_specs = [
    (128, 'input_layer', True, 0.2),
    (64, 'hidden_1', True, 0.2),
    (32, 'hidden_2', True, 0.15),
    (16, 'hidden_3', False, 0.1),
]

network_layers = []
for units, layer_name, use_batch_norm, drop_rate in layer_specs:
    # Only the very first Dense layer declares the input width
    first_layer_kwargs = {} if network_layers else {'input_dim': X_train.shape[1]}
    network_layers.append(
        Dense(units, activation='relu', name=layer_name, **first_layer_kwargs)
    )
    if use_batch_norm:
        network_layers.append(BatchNormalization())
    network_layers.append(Dropout(drop_rate))

# Single linear unit: the network outputs one (scaled) price
network_layers.append(Dense(1, activation='linear', name='output_layer'))

model = Sequential(network_layers)

# Adam optimiser, MSE as the training loss, MAE reported alongside it
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

model.summary()

# Step 7: Callbacks, both driven by the validation loss
watched_metric = 'val_loss'

# Stop after 25 epochs without improvement and restore the best weights
early_stop = EarlyStopping(
    monitor=watched_metric, patience=25, restore_best_weights=True, verbose=1
)

# Halve the learning rate after 10 stagnant epochs, never going below 1e-5
reduce_lr = ReduceLROnPlateau(
    monitor=watched_metric, factor=0.5, patience=10, min_lr=0.00001, verbose=1
)

# Step 8: Train model
print("\n" + "=" * 60, "TRAINING MODEL", "=" * 60, sep="\n")

fit_options = dict(
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# Step 9: Predict on both splits and report error metrics in dollar terms
print("\n" + "=" * 60, "MODEL EVALUATION", "=" * 60, sep="\n")


def _regression_metrics(actual, predicted):
    """Return (mse, rmse, mae, r2, mape_percent) for one data split."""
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mse, rmse, mae, r2, mape


def _direction_hits(actual, predicted):
    """Count steps where the predicted and actual series move the same way."""
    predicted_up = predicted[1:] > predicted[:-1]
    actual_up = actual[1:] > actual[:-1]
    return np.sum(predicted_up == actual_up)


# Predictions (scaled)
y_train_pred_scaled = model.predict(X_train, verbose=0).flatten()
y_test_pred_scaled = model.predict(X_test, verbose=0).flatten()

# Undo the target scaling so the metrics are in original price units
y_train_pred = scaler_y.inverse_transform(y_train_pred_scaled.reshape(-1, 1)).flatten()
y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()

# Training set metrics
train_mse, train_rmse, train_mae, train_r2, train_mape = _regression_metrics(
    y_train_original, y_train_pred
)

print("TRAINING SET METRICS:")
print(f"  RMSE: ${train_rmse:.4f}")
print(f"  MAE: ${train_mae:.4f}")
print(f"  R² Score: {train_r2:.4f}")
print(f"  MAPE: {train_mape:.2f}%")

# Test set metrics
test_mse, test_rmse, test_mae, test_r2, test_mape = _regression_metrics(
    y_test_original, y_test_pred
)

print("\nTEST SET METRICS:")
print(f"  RMSE: ${test_rmse:.4f}")
print(f"  MAE: ${test_mae:.4f}")
print(f"  R² Score: {test_r2:.4f}")
print(f"  MAPE: {test_mape:.2f}%")

# Direction accuracy (bonus metric): share of consecutive steps where the
# predicted series and the actual series both rise or both do not rise
train_direction_correct = _direction_hits(y_train_original, y_train_pred)
train_direction_acc = train_direction_correct / (len(y_train_pred) - 1) * 100

test_direction_correct = _direction_hits(y_test_original, y_test_pred)
test_direction_acc = test_direction_correct / (len(y_test_pred) - 1) * 100

print(f"\nDIRECTION ACCURACY:")
print(f"  Training: {train_direction_acc:.2f}%")
print(f"  Test: {test_direction_acc:.2f}%")

# Step 10: Six-panel diagnostic figure (3 rows x 2 columns)
print("\n" + "=" * 60, "GENERATING VISUALIZATIONS", "=" * 60, sep="\n")


def _finish_axes(ax, title, xlabel, ylabel, with_legend=True):
    """Apply the shared title, axis labels, optional legend and grid."""
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if with_legend:
        ax.legend()
    ax.grid(True, alpha=0.3)


fig = plt.figure(figsize=(16, 12))
ax1, ax2, ax3, ax4, ax5, ax6 = (fig.add_subplot(3, 2, slot) for slot in range(1, 7))

# Plots 1 and 2: training vs validation curves for the loss and for MAE
history_panels = (
    (ax1, 'loss', 'Loss', 'Model Loss (MSE) Over Epochs'),
    (ax2, 'mae', 'MAE', 'Model MAE Over Epochs'),
)
for curve_ax, history_key, short_name, panel_title in history_panels:
    curve_ax.plot(history.history[history_key], label=f'Training {short_name}', linewidth=2)
    curve_ax.plot(history.history[f'val_{history_key}'], label=f'Validation {short_name}', linewidth=2)
    _finish_axes(curve_ax, panel_title, 'Epoch', short_name)

# Plot 3: Actual vs Predicted (Test Set) with the y = x reference diagonal
price_bounds = [y_test_original.min(), y_test_original.max()]
ax3.scatter(y_test_original, y_test_pred, alpha=0.5, s=30)
ax3.plot(price_bounds, price_bounds, 'r--', lw=2, label='Perfect Prediction')
_finish_axes(ax3, 'Actual vs Predicted Prices (Test Set)',
             'Actual Price ($)', 'Predicted Price ($)')

# Plot 4: Time Series Predictions (Test Set - at most the last 100 points)
plot_points = min(100, len(y_test_original))
x_axis = range(plot_points)
ax4.plot(x_axis, y_test_original[-plot_points:], label='Actual',
         linewidth=2, marker='o', markersize=3)
ax4.plot(x_axis, y_test_pred[-plot_points:], label='Predicted',
         linewidth=2, marker='s', markersize=3)
_finish_axes(ax4, f'Price Predictions (Last {plot_points} Test Days)',
             'Time Steps', 'Price ($)')

# Plot 5: Prediction Error Distribution (error = actual - predicted)
errors = y_test_original - y_test_pred
ax5.hist(errors, bins=50, alpha=0.7, edgecolor='black', color='steelblue')
ax5.axvline(x=0, color='r', linestyle='--', linewidth=2, label='Zero Error')
_finish_axes(ax5, 'Prediction Error Distribution (Test Set)', 'Error ($)', 'Frequency')

# Plot 6: Residual Plot (no legend on this panel)
ax6.scatter(y_test_pred, errors, alpha=0.5, s=30)
ax6.axhline(y=0, color='r', linestyle='--', linewidth=2)
_finish_axes(ax6, 'Residual Plot (Test Set)', 'Predicted Price ($)', 'Residual ($)',
             with_legend=False)

# Save before show(): the figure is written to disk first
plt.tight_layout()
plt.savefig('ann_regression_performance.png', dpi=300, bbox_inches='tight')
print("Saved visualization to 'ann_regression_performance.png'")
plt.show()

# Additional Analysis: print the most recent test predictions next to the
# actual prices, with absolute and percentage error.
print("\n" + "="*60)
print("PREDICTION ANALYSIS")
print("="*60)

# Clamp the sample size: indexing the last 10 rows unconditionally raises
# IndexError when the test set holds fewer than 10 rows.
sample_count = min(10, len(y_test_original))
print(f"\nSample predictions (last {sample_count} test days):")
print(f"{'Actual':<12} {'Predicted':<12} {'Error':<12} {'Error %':<12}")
print("-" * 50)
for actual, pred in zip(y_test_original[-sample_count:], y_test_pred[-sample_count:]):
    error = actual - pred  # positive = the model under-predicted
    error_pct = (error / actual) * 100
    print(f"${actual:<11.2f} ${pred:<11.2f} ${error:<11.2f} {error_pct:<11.2f}%")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)

  df = yf.download(ticker, start='2020-01-01', end='2023-01-01')
[*********************100%***********************]  1 of 1 completed

Epoch 1/50



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5497 - loss: 0.6926
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5025 - loss: 0.6929 
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5434 - loss: 0.6922 
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5165 - loss: 0.6927 
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5069 - loss: 0.6937 
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5095 - loss: 0.6928 
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5359 - loss: 0.6893 
Epoch 8/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5053 - loss: 0.6925 
Epoch 9/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# Feature Engineering Explanation

This assignment focuses on **feature engineering for financial time series forecasting**, specifically predicting the next-day closing price of a stock using historical and derived features. Below is a detailed breakdown of the **feature engineering process**, its logic, and its role in improving model performance.

---

## 1. Dataset Overview

The dataset used is a **stock price dataset** (`SCOA_A5.csv`), which contains information for multiple tickers. The script filters it for a single ticker (e.g., `AAL`). The key columns are:

* `date`: trading day
* `open`, `high`, `low`, `close`: the day's opening, highest, lowest, and closing prices
* `volume`: number of shares traded

Feature engineering transforms these raw values into more **informative predictors** that capture trends, volatility, momentum, and market behavior.

---

## 2. Price-based Features

These are direct transformations of price columns to reflect daily dynamics.

| Feature               | Formula / Logic                         | Interpretation                                            |
| --------------------- | --------------------------------------- | --------------------------------------------------------- |
| **price_change**      | `(close_t - close_{t-1}) / close_{t-1}` | Daily percentage change — measures market movement.       |
| **high_low_range**    | `(high - low) / close`                  | Relative daily range — higher values indicate volatility. |
| **open_close_change** | `(close - open) / open`                 | Intraday gain/loss — positive if stock closed higher.     |
| **day_range**         | `high - low`                            | Raw price range for the day — magnitude of movement.      |

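The ratio-based features (`price_change`, `high_low_range`, `open_close_change`) make price movement comparable across different price levels; `day_range` keeps the raw dollar magnitude.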

---

## 3. Moving Averages (MA)

Moving averages smooth out short-term fluctuations and highlight trends.

* **ma_5, ma_10, ma_20**: Average closing prices over 5, 10, and 20 days.
* **ma_ratio_5_20 = ma_5 / ma_20**: Measures short-term vs long-term momentum.

> Example: If `ma_ratio_5_20 > 1`, short-term prices are above the long-term average → bullish signal.

---

## 4. Exponential Moving Averages (EMA) and MACD

EMAs give **more weight to recent prices**. They react faster to market changes.

* **ema_12**: Short-term EMA
* **ema_26**: Long-term EMA
* **macd = ema_12 - ema_26**: Moving Average Convergence Divergence — captures momentum shifts.

When `MACD` > 0 → bullish (momentum upward). When < 0 → bearish (momentum down).

---

## 5. Relative Strength Index (RSI)

RSI quantifies **momentum** based on average gains and losses over a period (default = 14 days).

```python
delta = close.diff()
gain = delta.where(delta > 0, 0).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
rsi = 100 - (100 / (1 + rs))
```

* RSI ranges from 0 to 100.
* RSI above 70 → overbought; RSI below 30 → oversold.

A **normalized RSI** is also created:
`rsi_normalized = (rsi - 50) / 50` → rescaling helps neural networks train better.

---

## 6. Volume-based Features

Volume often precedes price movement — large volumes indicate market interest.

| Feature                                 | Description                                                            |
| --------------------------------------- | ---------------------------------------------------------------------- |
| **volume_change**                       | Daily % change in trading volume.                                      |
| **volume_ma_5**                         | 5-day average of volume — smooths fluctuations.                        |
| **volume_ratio = volume / volume_ma_5** | Measures whether the day’s trading activity was unusually high or low. |

---

## 7. Volatility Features

Volatility reflects **uncertainty or risk**. It is quantified via rolling standard deviation:

* **volatility_10**: Std dev of `close` over 10 days.
* **volatility_20**: Std dev over 20 days.

Higher volatility indicates unstable market conditions.

---

## 8. Momentum Features

Momentum measures the rate of price change:

* **momentum_5 = close_t - close_{t-5}**
* **momentum_10 = close_t - close_{t-10}**

Positive momentum means price is increasing compared to the past; negative means it’s declining.

---

## 9. Price Position Feature

```python
price_position = (close - low) / (high - low)
```

Represents where the closing price lies within the day’s range.

* Value near 1 → the stock closed near the day’s high.
* Value near 0 → the stock closed near the day’s low.

---

## 10. Lagged Features

Lagged features bring **historical dependencies** into the model.

* **close_lag_1, close_lag_2, close_lag_3** = Previous day(s) closing prices.

These help the neural network capture autoregressive behavior — that future prices depend on recent ones.

---

## 11. Target Variable

The target (`Target`) is the **next day’s closing price**:

```python
df['Target'] = df['close'].shift(-1)
```

This turns the problem into a **supervised regression** — predicting tomorrow’s closing price from today’s features.

---

## 12. Data Cleaning and Scaling

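After feature creation, rows containing NaN are dropped. These come from the warm-up period of the rolling windows and lags at the start of the series, and from the shifted target on the final row, which has no next-day price. Then: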

* **MinMaxScaler** scales all features between 0 and 1 to help neural networks converge faster.
* Separate scalers are used for input features (`X`) and target (`y`).

---

## 13. Why This Feature Engineering Works

* Captures **temporal behavior**: via lags, moving averages, and momentum.
* Adds **market psychology**: through RSI and MACD.
* Integrates **volatility and liquidity** via standard deviation and volume.
* Provides both **short-term** (5-day) and **long-term** (20-day) signals.
* Reduces noise and helps the ANN model detect stable predictive patterns.

---

## 14. Summary of Feature Categories

| Category       | Features                                                   |
| -------------- | ---------------------------------------------------------- |
| Price Movement | price_change, high_low_range, open_close_change, day_range |
| Trend          | ma_5, ma_10, ma_20, ma_ratio_5_20, ema_12, ema_26, macd    |
| Momentum       | momentum_5, momentum_10, rsi, rsi_normalized               |
| Volatility     | volatility_10, volatility_20                               |
| Volume         | volume_change, volume_ma_5, volume_ratio                   |
| Lag            | close_lag_1, close_lag_2, close_lag_3                      |
| Price Position | price_position                                             |

---

## 15. Final Note

Feature engineering is the **most critical step** in time-series prediction. The neural network’s performance depends heavily on how well the features summarize market behavior. Here, we combined:

* Statistical indicators
* Technical analysis signals
* Rolling and lagged relationships

This comprehensive set ensures that the model learns **both local fluctuations and global trends**, which are essential in stock market forecasting.
