# Modelling the Data

# # Trading Strategy Using CSV Data
# 
This notebook demonstrates how to load historical stock data from a CSV file, create lagged return features, train a linear regression model, evaluate its performance, and simulate a simple trading strategy. The approach follows best practices including:
 
 - Reading and processing CSV data with proper date parsing and sorting.
 - Creating lagged features for returns.
 - Splitting the data in a time-aware manner (80/20 split).
 - Training a linear regression model.
 - Computing performance metrics (MAE, R², RMSE) and the annualized Sharpe ratio.
 - Simulating trading signals and visualizing cumulative returns.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


## Load Data

In [2]:
def load_data(csv_file):
  """Loads data from CSV file. Handles datatypes"""
  try:
      data = pd.read_csv(csv_file, parse_dates=['Date'], index_col='Date')
      data.sort_index(inplace=True)
      return data
  except FileNotFoundError:
      print(f"Error: File not found at {csv_file}")
      return None
  except Exception as e:
      print(f"Error loading data: {e}")
      return None

# Feature Engineering

In [3]:
def engineer_features(df):
  """Engineers features."""
  try:
    # Ensure Date is a datetime type
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])

    # Convert Volume to numeric (if it contains non-numeric values, convert to NaN)
    df['Volume'] = df['Volume'].replace({',':''}, regex=True).astype('float64')

    # Basic price transformations
    df['Return'] = df['Close'].pct_change()
    df['Range'] = (df['High'] - df['Low']) / df['Close']  # Normalized daily range

    # Moving Averages
    for window in [5, 20, 50, 200]:
        df[f'SMA_{window}'] = df['Close'].rolling(window, min_periods=1).mean()
        df[f'EMA_{window}'] = df['Close'].ewm(span=window, adjust=False).mean()

    # Bollinger Bands
    df['BB_MA20'] = df['Close'].rolling(20, min_periods=1).mean()
    df['BB_Upper'] = df['BB_MA20'] + 2 * df['Close'].rolling(20).std()
    df['BB_Lower'] = df['BB_MA20'] - 2 * df['Close'].rolling(20).std()

    # RSI (14-day)
    delta = df['Close'].diff()
    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)
    avg_gain = pd.Series(gain).rolling(14, min_periods=1).mean()
    avg_loss = pd.Series(loss).rolling(14, min_periods=1).mean()
    rs = avg_gain / (avg_loss + 1e-10) #Added 1e-10 to denom
    df['RSI_14'] = 100 - (100 / (1 + rs))

    # MACD
    df['MACD'] = df['Close'].ewm(span=12, adjust=False).mean() - df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_Hist'] = df['MACD'] - df['MACD_Signal'] #Adding a MACD histogram

    # Volatility, Force to float64
    for window in [10, 20, 30]:
        df[f'Volatility_{window}'] = (df['Return'].rolling(window, min_periods=1).std() * np.sqrt(252)).astype('float64') #Annualize Vol

    # On-Balance Volume (OBV)
    df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).cumsum()

    # VWAP
    df['VWAP'] = (df['Volume'] * (df['High'] + df['Low'] + df['Close']) / 3).cumsum() / df['Volume'].cumsum()

    # Lagged Returns
    for lag in [1, 2, 3, 5, 7, 10]:
        df[f'Return_Lag_{lag}'] = df['Return'].shift(lag)

    # Lagged Volume
    df['Volume_Lag_1'] = df['Volume'].shift(1)

    # Target (Next day's return)
    df['Target'] = df['Close'].shift(-1) - df['Close']

    return df
  except Exception as e:
    print(f"Error during feature engineering: {e}")
    return None

# Data Splitting, Imputation, and Scaling Data

In [4]:
def impute_and_scale(train, test, features, scaler_type='StandardScaler'):
    """Imputes missing values and scales the features.
    Returns:
        X_train_scaled, X_test_scaled, y_train, y_test, scaler
    """
    try:
        # Impute missing values in training data using the mean of the training data
        train_means = {}  # Store means for each feature from the training set

        for feature in features:
            train_mean = train[feature].mean()
            train_means[feature] = train_mean  # Store the mean
            train[feature] = train[feature].fillna(train_mean)
            test[feature] = test[feature].fillna(train_mean)  # Use the training mean for test

        # Get features before dropping NaN
        X_train = train[features].copy()
        X_test = test[features].copy()

        # Align the targets and remove the data based on available features.
        train = train.loc[X_train.index].copy()
        test = test.loc[X_test.index].copy()

        # Scale, scale based on scaling parameter
        if scaler_type == 'StandardScaler':
            scaler = StandardScaler()
        elif scaler_type == 'MinMaxScaler':
            scaler = MinMaxScaler()
        elif scaler_type == 'RobustScaler':
            scaler = RobustScaler()
        else:
            raise ValueError("Invalid scaler_type. Choose 'StandardScaler', 'MinMaxScaler', or 'RobustScaler'.")

        # Remove NaN values AFTER setting X_train and X_test, to avoid key collisions with y_train
        X_train.dropna(inplace=True)
        X_test.dropna(inplace=True)

        # Align Target variables too after dropping NA in X
        y_train = train.loc[X_train.index, 'Target']  # Align indices
        y_test = test.loc[X_test.index, 'Target']  # Align indices

        # Validate if scaling will break due to small sample size
        if len(X_train) < 2 or len(X_test) < 2: # Require two samples to avoid scaler error
            print("Warning: Too few samples to train model. Please check test and train data set!")
            return None, None, None, None, None

        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test, scaler

    except Exception as e:
        print(f"Error during imputation and scaling: {e}")
        return None, None, None, None, None

# Model Training & Evaluation

In [5]:
def train_and_evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test):
  """Trains the linear regression model and calculates/returns the metrics"""
  try:
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    print("\nModel Performance:")
    print(f"MAE: {mae:.6f}")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAPE: {mape:.4f}")

    return model, y_pred

  except Exception as e:
    print(f"Error in model training and evaluation: {e}")
    return None, None

# Backtest & Visualisations

In [6]:
def backtest_and_visualize(data, split_index, y_pred, y_test, X_test, risk_free_rate=0):
  """Performs backtesting and generates visualizations."""
  try:
      # Get the Date index for the test set
      test_dates = data.iloc[split_index:].index

      # Recreate the dataframe for the test period, using the raw data before feature engineering
      test_data = data.iloc[split_index:].copy()
      test_data['Return'] = data['Return'] #Data must be already feature engineered

      test_data['Predicted_Return'] = y_pred  # The predicted returns
      test_data['Signal'] = np.where(test_data['Predicted_Return'] > 0, 1, -1) #Now it lines up.

      # Calculate transaction costs
      transaction_cost = 0.001  # Example: 0.1%
      test_data["Holdings"] = test_data["Signal"].diff().fillna(0).abs()
      test_data["Transaction_Cost"] = transaction_cost * test_data["Holdings"] * test_data["Close"]

      test_data['Strategy_Return_No_Cost'] = test_data['Signal'] * y_test - test_data["Transaction_Cost"]  #Now this is correct.
      test_data['Market_Return'] = (1 + test_data['Return']).cumprod() #Old return so its B+H over entire dataset
      test_data['Strategy_Return_No_Cost'] = (1 + test_data['Strategy_Return_No_Cost']).cumprod() #Strategy return with y_test and no cost

      # Calculate Sharpe Ratio (Risk-Free Rate Subtracted)
      excess_returns = test_data['Strategy_Return_No_Cost'].pct_change() - risk_free_rate / 252
      sharpe_ratio = excess_returns.mean() / excess_returns.std() * np.sqrt(252)

      # Calculate Maximum Drawdown
      cumulative_returns = test_data['Strategy_Return_No_Cost']
      peak = cumulative_returns.cummax()
      drawdown = (cumulative_returns - peak) / peak
      max_drawdown = drawdown.min()

      # Rolling Sharpe Ratio
      rolling_sharpe_ratio = excess_returns.rolling(window=252).mean() / excess_returns.rolling(window=252).std() * np.sqrt(252)

      print(f"\nFinal Strategy Return (No Transaction Costs): {test_data['Strategy_Return_No_Cost'].iloc[-1]:.2f}")
      print(f"Final Market Return: {test_data['Market_Return'].iloc[-1]:.2f}")
      print(f"Annualized Sharpe Ratio: {sharpe_ratio:.2f}")
      print(f"Maximum Drawdown: {max_drawdown:.2%}")

      # Plotting
      plt.figure(figsize=(12, 6))
      plt.plot(test_data.index, test_data['Market_Return'], label='Buy & Hold')
      plt.plot(test_data.index, test_data['Strategy_Return_No_Cost'], label='Strategy (No Transaction Costs)', alpha=0.8)

      plt.plot(test_data.index, peak, label='Peak Cumulative Return', linestyle='--', alpha=0.5)  # Show the peak
      plt.fill_between(test_data.index, cumulative_returns, peak, where=cumulative_returns < peak, color='red', alpha=0.3, label='Drawdown')

      plt.title('Trading Strategy Performance with Drawdown')
      plt.ylabel('Cumulative Returns')
      plt.legend()
      plt.show()

      # Plot Rolling Sharpe Ratio
      plt.figure(figsize=(12, 6))
      plt.plot(test_data.index, rolling_sharpe_ratio, label='Rolling Sharpe Ratio (252 days)')
      plt.title('Rolling Sharpe Ratio')
      plt.ylabel('Sharpe Ratio')
      plt.legend()
      plt.show()

  except Exception as e:
      print(f"Error during backtesting and visualization: {e}")

# MaIN Script - move to main.py

In [None]:
csv_file = "data/KO_data.csv"
features = ['Volatility_10', 'Return_Lag_1', 'Return_Lag_2', 'Return_Lag_5', 'Return_Lag_10', 'SMA_5', 'SMA_20', 'EMA_50', 'EMA_200', 'BB_MA20', 'MACD', 'MACD_Hist', 'OBV', 'VWAP', 'Volume_Lag_1', 'RSI_14']
scaler_type = 'StandardScaler'
risk_free_rate = 0
test_size=0.2 #Reduce size of test data

# Load data
data = load_data(csv_file)
if data is None:
    exit()

# Engineer return in the raw data (used later for B+H)
data = engineer_features(data)

# Time series split (80% training, 20% testing)
split_index = int((1-test_size) * len(data))
train_df = data.iloc[:split_index].copy()  # Create a copy
test_df = data.iloc[split_index:].copy()  # Create a copy

# Feature engineering
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

if train_df is None or test_df is None:
    exit()

# Imputation and Scaling
X_train_scaled, X_test_scaled, y_train, y_test, scaler = impute_and_scale(train_df, test_df, features, scaler_type)

if X_train_scaled is None:
    exit()

# Model Training and Evaluation
model, y_pred = train_and_evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test)

if model is None:
    exit()

# Backtesting and Visualization
#Make sure to pass the values for X and Y
X_test = test_df[features] #Pull in features from test data, in test dates
backtest_and_visualize(data, split_index, y_pred, y_test, X_test, risk_free_rate)

print("Train Dataframe:")
print(train_df.head())
print("\nTest Dataframe:")
print(test_df.head())

Error in model training and evaluation: This LinearRegression estimator requires y to be passed, but the target y is None.
Error during backtesting and visualization: 'Return'
Train Dataframe:
             Open   High    Low  Close  Adj Close      Volume    Return  \
Date                                                                      
2024-03-19  60.24  60.35  60.06  60.23      58.48  15030600.0       NaN   
2024-03-20  60.18  60.81  60.16  60.75      58.99  15258800.0  0.008634   
2024-03-21  60.56  60.99  60.32  60.47      58.71  13067100.0 -0.004609   
2024-03-22  60.52  60.79  60.43  60.49      58.73  11501400.0  0.000331   
2024-03-25  60.48  60.71  60.12  60.40      58.65  13144700.0 -0.001488   

               Range      SMA_5      EMA_5  ...           OBV       VWAP  \
Date                                        ...                            
2024-03-19  0.004815  60.230000  60.230000  ...  1.947988e+08  60.213333   
2024-03-20  0.010700  60.490000  60.403333  ...  1.52

: 