NeurArk · NeurArk · May 20, 2025 · May 20, 2025
diff --git a/TODO.md b/TODO.md
@@ -58,17 +58,17 @@ After completing a milestone, create a pull request with your changes for review
 
 ## PR5: Model Training - Classification
 
-- [ ] Create feature selection interface
-- [ ] Add train/test split functionality
-- [ ] Implement cross-validation
-- [ ] Create model selection interface for classification
-- [ ] Implement Logistic Regression
-- [ ] Implement Random Forest Classifier
-- [ ] Add hyperparameter selection interface
-- [ ] Create model training progress indicators
-- [ ] Implement model caching for performance
-- [ ] Write tests for model training pipeline
-- [ ] Create test cases for classification models with sample datasets
+- [x] Create feature selection interface
+- [x] Add train/test split functionality
+- [x] Implement cross-validation
+- [x] Create model selection interface for classification
+- [x] Implement Logistic Regression
+- [x] Implement Random Forest Classifier
+- [x] Add hyperparameter selection interface
+- [x] Create model training progress indicators
+- [x] Implement model caching for performance
+- [x] Write tests for model training pipeline
+- [x] Create test cases for classification models with sample datasets
 
 ## PR6: Model Training - Regression
 

diff --git a/app.py b/app.py
@@ -4,6 +4,7 @@
 from utils import config
 from utils import data as data_utils
 from utils import eda
+from utils import model
 
 st.set_page_config(page_title="PredictStream", layout="wide")
 
@@ -70,6 +71,52 @@ def main() -> None:
         for insight in eda.data_insights_summary(data):
             st.write(f"- {insight}")
 
+        st.subheader("Model Training - Classification")
+        target = st.selectbox("Target Column", options=data.columns)
+        feature_cols = st.multiselect(
+            "Feature Columns",
+            options=[c for c in data.columns if c != target],
+            default=[c for c in data.columns if c != target],
+        )
+        test_size = st.slider("Test Size", 0.1, 0.5, 0.2, step=0.05)
+        model_name = st.selectbox("Model", ["Logistic Regression", "Random Forest"])
+
+        hyperparams = {}
+        if model_name == "Logistic Regression":
+            hyperparams["C"] = st.number_input("C", 0.01, 10.0, 1.0, step=0.01)
+        else:
+            hyperparams["n_estimators"] = st.slider("n_estimators", 10, 200, 100, step=10)
+
+        if st.button("Train Model") and feature_cols:
+            progress = st.progress(0)
+            df_model = data[feature_cols + [target]]
+            X_train, X_test, y_train, y_test = model.train_test_split_data(
+                df_model,
+                target,
+                test_size=test_size,
+                random_state=42,
+            )
+            progress.progress(25)
+            if model_name == "Logistic Regression":
+                clf = model.train_logistic_regression(
+                    X_train,
+                    y_train,
+                    C=hyperparams.get("C", 1.0),
+                    max_iter=200,
+                )
+            else:
+                clf = model.train_random_forest_classifier(
+                    X_train,
+                    y_train,
+                    n_estimators=hyperparams.get("n_estimators", 100),
+                    random_state=42,
+                )
+            progress.progress(75)
+            scores = model.cross_validate_model(clf, X_train, y_train, cv=5)
+            progress.progress(100)
+            st.write("Cross-validation scores:", scores)
+            st.write("Mean accuracy:", float(scores.mean()))
+
 
 if __name__ == "__main__":
     main()
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -0,0 +1,59 @@
+import pandas as pd
+from sklearn.datasets import make_classification
+
+from utils import model
+
+
+def sample_df():
+    """Build a seeded 50-row frame: features ``f0``..``f3`` plus a two-class ``target``."""
+    features, labels = make_classification(n_samples=50, n_features=4, n_classes=2, random_state=0)
+    frame = pd.DataFrame(features, columns=[f"f{i}" for i in range(4)])
+    return frame.assign(target=labels)
+
+
+def test_train_test_split():
+    """The split must account for every row and keep X/y lengths aligned."""
+    frame = sample_df()
+    parts = model.train_test_split_data(frame, "target", test_size=0.2, random_state=42)
+    train_X, test_X, train_y, test_y = parts
+    assert len(train_X) + len(test_X) == len(frame)
+    assert len(train_y) == len(train_X)
+    assert len(test_y) == len(test_X)
+
+
+def test_logistic_regression_training():
+    """A fitted logistic regression yields one prediction per held-out row."""
+    train_X, test_X, train_y, test_y = model.train_test_split_data(sample_df(), "target")
+    fitted = model.train_logistic_regression(train_X, train_y, max_iter=100)
+    # Only the number of predictions is checked here, not their accuracy.
+    assert len(fitted.predict(test_X)) == len(test_y)
+
+
+def test_random_forest_training():
+    """A fitted random forest yields one prediction per held-out row."""
+    train_X, test_X, train_y, test_y = model.train_test_split_data(sample_df(), "target")
+    fitted = model.train_random_forest_classifier(train_X, train_y, n_estimators=10)
+    # Only the number of predictions is checked here, not their accuracy.
+    assert len(fitted.predict(test_X)) == len(test_y)
+
+
+def test_cross_validation():
+    """Cross-validating with ``cv=3`` returns exactly three scores."""
+    frame = sample_df()
+    labels = frame.pop("target")  # ``frame`` now holds only the feature columns
+    estimator = model.train_logistic_regression(frame, labels, max_iter=100)
+    fold_scores = model.cross_validate_model(estimator, frame, labels, cv=3)
+    assert len(fold_scores) == 3
+
+
+def test_model_caching():
+    """``cache_model`` must hand back the cached object without re-running the function."""
+    X_train, _, y_train, _ = model.train_test_split_data(sample_df(), "target")
+    calls = []  # one entry per real invocation; cache hits never append
+
+    @model.cache_model
+    def custom_train(x, y):
+        calls.append(1)  # the inner trainer is cached too, so identity alone proves little
+        return model.train_logistic_regression(x, y, max_iter=50)
+
+    assert custom_train(X_train, y_train) is custom_train(X_train, y_train) and len(calls) == 1
diff --git a/utils/__init__.py b/utils/__init__.py
@@ -4,5 +4,6 @@
 from . import data
 from . import eda
 from . import viz
+from . import model
 
-__all__ = ["config", "data", "eda", "viz"]
+__all__ = ["config", "data", "eda", "viz", "model"]
diff --git a/utils/model.py b/utils/model.py
@@ -0,0 +1,96 @@
+"""Modeling utilities for classification tasks."""
+
+from __future__ import annotations
+
+from functools import wraps
+from typing import Callable, Tuple
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.base import BaseEstimator
+
+
+def cache_model(func: Callable) -> Callable:
+    """Memoise ``func`` in an unbounded per-function dict keyed on its arguments."""
+    cache = {}
+
+    def _fingerprint(arg):
+        # pandas objects are keyed by labels plus ordered row hashes, the rest by repr.
+        if not isinstance(arg, (pd.DataFrame, pd.Series)):
+            return repr(arg)
+        labels = tuple(arg.columns) if isinstance(arg, pd.DataFrame) else (arg.name,)
+        # A plain sum of row hashes ignored row order and labels, so unrelated
+        # frames could share a cached model; hash the ordered bytes instead.
+        row_hashes = pd.util.hash_pandas_object(arg, index=True).to_numpy()
+        return labels, hash(row_hashes.tobytes())
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        named = tuple(f"{k}={v}" for k, v in sorted(kwargs.items()))
+        key = (func.__name__, *map(_fingerprint, args), *named)
+        if key not in cache:
+            cache[key] = func(*args, **kwargs)
+        return cache[key]
+
+    return wrapper
+
+
+def train_test_split_data(
+    df: pd.DataFrame,
+    target: str,
+    *,
+    test_size: float = 0.2,
+    random_state: int | None = None,
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
+    """Return ``(X_train, X_test, y_train, y_test)``, stratified on ``df[target]``."""
+    X, y = df.drop(columns=[target]), df[target]
+    parts = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
+    return tuple(parts)  # sklearn returns a list; the annotation promises a tuple
+
+
+@cache_model
+def train_logistic_regression(
+    X: pd.DataFrame,
+    y: pd.Series,
+    *,
+    C: float = 1.0,
+    max_iter: int = 1000,
+) -> LogisticRegression:
+    """Fit a LogisticRegression on ``X``/``y`` and return it (memoised by ``cache_model``)."""
+    # ``C`` and ``max_iter`` are forwarded unchanged to the estimator.
+    estimator = LogisticRegression(C=C, max_iter=max_iter, n_jobs=None)
+    return estimator.fit(X, y)
+
+
+@cache_model
+def train_random_forest_classifier(
+    X: pd.DataFrame,
+    y: pd.Series,
+    *,
+    n_estimators: int = 100,
+    max_depth: int | None = None,
+    random_state: int | None = None,
+) -> RandomForestClassifier:
+    """Fit a RandomForestClassifier on ``X``/``y`` and return it.
+
+    Keyword arguments are forwarded unchanged; the call is memoised by ``cache_model``.
+    """
+    forest = RandomForestClassifier(
+        n_estimators=n_estimators, max_depth=max_depth, random_state=random_state
+    )
+    return forest.fit(X, y)
+
+
+def cross_validate_model(
+    model: BaseEstimator,
+    X: pd.DataFrame,
+    y: pd.Series,
+    *,
+    cv: int = 5,
+) -> np.ndarray:
+    """Return the scores ``cross_val_score`` computes for ``model`` on ``X``/``y``."""
+    # No ``scoring`` argument is passed, so sklearn's default scorer is used.
+    return cross_val_score(model, X, y, cv=cv)