# Project 101 – Baseline Modeling

This notebook builds a baseline machine learning model using the engineered
features. The focus is on correct evaluation, error analysis, and interpretability
rather than maximizing performance.

---

In [None]:
# ==============================
# Imports
# ==============================
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Relative path of the engineered feature table (v1).
features_path = "../data/processed/stock_features_v1.csv"

# Read the CSV into a DataFrame; the bare expression below previews the first rows.
df = pd.read_csv(filepath_or_buffer=features_path)

df.head(5)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,return_1d,return_lag_1,return_lag_3,return_lag_5,ma_5,ma_10,ma_20,volatility_5,volatility_10,volume_change,target
0,2000-01-31,6.41,6.41,6.41,6.41,20140000.0,-1.69,-0.016871,-0.016591,0.0,-0.007474,6.568,6.639,6.667,0.008911,0.006939,1.218062,1
1,2000-02-01,6.52,6.52,6.52,6.52,6340000.0,1.72,0.017161,-0.016871,-0.001506,0.0,6.544,6.615,6.6595,0.014082,0.009663,-0.685204,1
2,2000-02-02,6.54,6.54,6.54,6.54,9010000.0,0.31,0.003067,0.017161,-0.016591,0.0,6.524,6.596,6.653,0.014341,0.009877,0.421136,0
3,2000-02-03,6.53,6.53,6.53,6.53,2410000.0,-0.15,-0.001529,0.003067,-0.016871,-0.001506,6.504,6.576,6.646,0.01434,0.00984,-0.732519,1
4,2000-02-05,6.55,6.55,6.55,6.55,10110000.0,0.31,0.003063,-0.001529,0.017161,-0.016591,6.51,6.562,6.64,0.012201,0.009949,3.195021,1


In [None]:
# Separate features and target.
# "target" is the label and "Date" is the date column, so neither is a model input.
X = df.drop(columns=["target", "Date"])

# pandas gives the name "Unnamed: 0" to a CSV column that has no header; in the
# preview it holds 0, 1, 2, ... and looks like a row counter saved with the file.
# It is not an engineered feature, and because it appears to grow with Date, a
# chronological split would put every test value beyond the range seen in
# training. errors="ignore" keeps this cell working if the CSV is re-exported
# without that column.
X = X.drop(columns=["Unnamed: 0"], errors="ignore")

# NOTE(review): raw price levels (Price/Open/High/Low, ma_*) may drift over time
# in a similar way — confirm whether they belong in the baseline feature set.

y = df["target"]

In [None]:
# Time-based hold-out: the first 80% of rows train the model, the last 20% test it.
# The cut is positional, so it assumes df rows are already in date order — TODO confirm.
n_rows = len(df)
split_index = int(n_rows * 0.8)

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]

y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

In [None]:
# Baseline model: a shallow random forest with a fixed seed.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "max_depth": 5,
}
rf = RandomForestClassifier(**rf_params)

# fit() returns the estimator, so as the last expression its repr is displayed.
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Score the held-out rows with the fitted forest.
y_pred = rf.predict(X_test)

# Headline metric, shown to three decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Per-class precision / recall / F1 and support.
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Accuracy: 0.491

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.97      0.65       487
           1       0.55      0.04      0.07       510

    accuracy                           0.49       997
   macro avg       0.52      0.50      0.36       997
weighted avg       0.52      0.49      0.35       997



In [7]:
# Rows are true classes, columns are predicted classes (labels in sorted order: 0, 1).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[472,  15],
       [492,  18]])

## Evaluation Notes

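Accuracy alone is not sufficient.  
The confusion matrix and precision/recall values are analyzed to understand
where the model's incorrect predictions are concentrated.

In the run recorded above, the model predicts class 0 for 964 of the 997 test
rows (472 + 492), so recall for class 1 is only 0.04 and the overall accuracy
of 0.491 is no better than chance.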

In [None]:
# Build an error-analysis table: the test features plus the true label, the
# predicted label, and a flag saying whether the two agree.
errors = X_test.assign(true=y_test.values, pred=y_pred)
errors["correct"] = errors["true"] == errors["pred"]

# Preview the first few misclassified rows.
misclassified = errors.loc[~errors["correct"]]
misclassified.head()

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,return_1d,return_lag_1,return_lag_3,return_lag_5,ma_5,ma_10,ma_20,volatility_5,volatility_10,volume_change,true,pred,correct
3986,27.09,27.42,28.13,27.02,15470000.0,-5.9,-0.059048,-0.005527,0.001037,-0.00999,28.548,28.792,29.5235,0.026978,0.01901,1.065421,1,0,False
3991,25.89,26.51,26.71,25.7,14380000.0,0.94,0.009357,-0.045048,-0.009758,-0.059048,26.694,27.621,28.5215,0.025866,0.024957,-0.068653,1,0,False
3992,26.08,25.9,26.2,25.8,10290000.0,0.73,0.007339,0.009357,-0.019708,0.02141,26.376,27.335,28.317,0.022281,0.024986,-0.284423,1,0,False
3993,26.62,26.1,26.81,26.1,12590000.0,2.07,0.020706,0.007339,-0.045048,-0.009758,26.22,27.1,28.1665,0.026637,0.026674,0.223518,1,0,False
3995,26.78,26.81,26.81,26.41,7970000.0,-0.63,-0.006308,0.012397,0.007339,-0.045048,26.464,26.699,27.879,0.009815,0.027387,-0.316467,1,0,False


In [9]:
# Persist the fitted baseline model to disk.
# (`os` and `joblib` are imported in the imports cell at the top of the notebook.)
model_path = "../models/rf_baseline_v1.pkl"

# Derive the directory from model_path so the two can never drift apart, and
# create it if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# joblib.dump returns the list of files written; as the last expression it is displayed.
joblib.dump(rf, model_path)


['../models/rf_baseline_v1.pkl']