# 2. Price Prediction Model Tuning

This notebook focuses on tuning the final model used to predict stock prices.

## Goal
Find the best hyperparameters using **Grid Search** and **TimeSeriesSplit Cross Validation**.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

%matplotlib inline

## 1. Load Complete Data

In [None]:
# Folder holding the merged dataset, relative to this notebook.
DATA_DIR = "../data/complete"

# Read the CSV, parse `date` into datetimes, and order rows by ticker, then by date.
df = (
    pd.read_csv(os.path.join(DATA_DIR, "all_sectors_complete_10y.csv"))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['ticker', 'date'])
)

print(f"Total samples: {len(df)}")

## 2. Feature Engineering (Simplified)
Create a target variable (e.g., Next Month Price).

In [None]:
# Target = the same ticker's close on its next row. `df` is ordered by date
# within each ticker, so shift(-1) picks up the following observation.
# NOTE(review): "next month" only holds if there is one row per ticker per
# month -- confirm against the dataset.
df['target'] = df.groupby('ticker')['close'].shift(-1)

features = ['close', 'volume', 'rsi_14', 'macd', 'PE', 'ROE', 'Debt_to_Equity']

# Drop incomplete rows (this includes each ticker's last row, which has no
# next close), then put the panel in chronological order.
# TimeSeriesSplit splits purely on row position: with the ticker-major order
# of `df`, its folds would separate tickers rather than time periods, and the
# model would be trained on dates later than the ones it is validated on.
# A stable sort keeps the existing ticker order among rows sharing a date.
data = df.dropna().sort_values('date', kind='stable')

X = data[features]
y = data['target']

## 3. Time Series Cross Validation
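Standard K-Fold is NOT suitable for time series data: with or without shuffling, some folds train on observations that come *after* the validation fold, which leaks future information into the model. We must use `TimeSeriesSplit`, which always trains on earlier rows and validates on later ones. It splits by row position, so the rows must be in chronological order.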

In [None]:
# Model: standardize the features, then fit a seeded gradient-boosted regressor.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# Search space: 2 x 2 x 2 = 8 candidates. The `regressor__` prefix routes each
# setting to the pipeline step of that name.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__learning_rate=[0.01, 0.1],
    regressor__max_depth=[3, 5],
)

# Five ordered folds: each one validates on rows that follow its training rows.
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search scored by negated MAE (higher is better), using all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

print("Starting Grid Search (TimeSeriesSplit)...")
grid_search.fit(X, y)

## 4. Results Analysis

In [None]:
# Report the winning configuration. The scorer is negated MAE, so the sign is
# flipped to show a plain MAE.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best MAE: {-grid_search.best_score_:.4f}")

# One row per candidate, with its cross-validated scores.
results_df = pd.DataFrame(grid_search.cv_results_)

# Bars: mean CV score (negative MAE) per learning rate, split by tree depth.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=results_df,
    x='param_regressor__learning_rate',
    y='mean_test_score',
    hue='param_regressor__max_depth',
    ax=ax,
)
ax.set_title('Grid Search Results: Learning Rate vs Negative MAE')
plt.show()