Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,17 @@ After completing a milestone, create a pull request with your changes for review

## PR5: Model Training - Classification

- [ ] Create feature selection interface
- [ ] Add train/test split functionality
- [ ] Implement cross-validation
- [ ] Create model selection interface for classification
- [ ] Implement Logistic Regression
- [ ] Implement Random Forest Classifier
- [ ] Add hyperparameter selection interface
- [ ] Create model training progress indicators
- [ ] Implement model caching for performance
- [ ] Write tests for model training pipeline
- [ ] Create test cases for classification models with sample datasets
- [x] Create feature selection interface
- [x] Add train/test split functionality
- [x] Implement cross-validation
- [x] Create model selection interface for classification
- [x] Implement Logistic Regression
- [x] Implement Random Forest Classifier
- [x] Add hyperparameter selection interface
- [x] Create model training progress indicators
- [x] Implement model caching for performance
- [x] Write tests for model training pipeline
- [x] Create test cases for classification models with sample datasets

## PR6: Model Training - Regression

Expand Down
47 changes: 47 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from utils import config
from utils import data as data_utils
from utils import eda
from utils import model

st.set_page_config(page_title="PredictStream", layout="wide")

Expand Down Expand Up @@ -70,6 +71,52 @@ def main() -> None:
for insight in eda.data_insights_summary(data):
st.write(f"- {insight}")

st.subheader("Model Training - Classification")
target = st.selectbox("Target Column", options=data.columns)
feature_cols = st.multiselect(
"Feature Columns",
options=[c for c in data.columns if c != target],
default=[c for c in data.columns if c != target],
)
test_size = st.slider("Test Size", 0.1, 0.5, 0.2, step=0.05)
model_name = st.selectbox("Model", ["Logistic Regression", "Random Forest"])

hyperparams = {}
if model_name == "Logistic Regression":
hyperparams["C"] = st.number_input("C", 0.01, 10.0, 1.0, step=0.01)
else:
hyperparams["n_estimators"] = st.slider("n_estimators", 10, 200, 100, step=10)

if st.button("Train Model") and feature_cols:
progress = st.progress(0)
df_model = data[feature_cols + [target]]
X_train, X_test, y_train, y_test = model.train_test_split_data(
df_model,
target,
test_size=test_size,
random_state=42,
)
progress.progress(25)
if model_name == "Logistic Regression":
clf = model.train_logistic_regression(
X_train,
y_train,
C=hyperparams.get("C", 1.0),
max_iter=200,
)
else:
clf = model.train_random_forest_classifier(
X_train,
y_train,
n_estimators=hyperparams.get("n_estimators", 100),
random_state=42,
)
progress.progress(75)
scores = model.cross_validate_model(clf, X_train, y_train, cv=5)
progress.progress(100)
st.write("Cross-validation scores:", scores)
st.write("Mean accuracy:", float(scores.mean()))


if __name__ == "__main__":
main()
59 changes: 59 additions & 0 deletions tests/test_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pandas as pd
from sklearn.datasets import make_classification

from utils import model


def sample_df():
    """Build a small synthetic binary-classification frame for the tests.

    The frame has 50 rows, four feature columns named ``f0``..``f3`` and a
    ``target`` column holding the class labels. The generator is seeded, so
    every call produces the same data.
    """
    features, labels = make_classification(
        n_samples=50,
        n_features=4,
        n_classes=2,
        random_state=0,
    )
    column_names = [f"f{idx}" for idx in range(4)]
    return pd.DataFrame(features, columns=column_names).assign(target=labels)


def test_train_test_split():
    """The split must account for every row and keep X/y lengths aligned."""
    frame = sample_df()
    parts = model.train_test_split_data(
        frame,
        "target",
        test_size=0.2,
        random_state=42,
    )
    X_train, X_test, y_train, y_test = parts

    # No row may be lost or duplicated by the split.
    assert len(frame) == len(X_train) + len(X_test)
    # Features and labels of each partition must line up one-to-one.
    assert len(X_train) == len(y_train)
    assert len(X_test) == len(y_test)


def test_logistic_regression_training():
    """A fitted logistic regression yields one prediction per held-out row."""
    X_train, X_test, y_train, y_test = model.train_test_split_data(
        sample_df(), "target"
    )
    fitted = model.train_logistic_regression(X_train, y_train, max_iter=100)
    predictions = fitted.predict(X_test)
    assert len(y_test) == len(predictions)


def test_random_forest_training():
    """A fitted random forest yields one prediction per held-out row."""
    X_train, X_test, y_train, y_test = model.train_test_split_data(
        sample_df(), "target"
    )
    forest = model.train_random_forest_classifier(
        X_train, y_train, n_estimators=10
    )
    predictions = forest.predict(X_test)
    assert len(y_test) == len(predictions)


def test_cross_validation():
    """Cross-validating with ``cv=3`` must return exactly three scores."""
    frame = sample_df()
    features = frame.drop(columns=["target"])
    labels = frame["target"]
    estimator = model.train_logistic_regression(features, labels, max_iter=100)
    fold_scores = model.cross_validate_model(estimator, features, labels, cv=3)
    assert len(fold_scores) == 3


def test_model_caching():
    """Two calls with identical data must return the same cached object."""
    X_train, _, y_train, _ = model.train_test_split_data(sample_df(), "target")

    @model.cache_model
    def custom_train(x, y):
        return model.train_logistic_regression(x, y, max_iter=50)

    # Identity (not equality) proves the second call was served from cache.
    results = [custom_train(X_train, y_train) for _attempt in range(2)]
    assert results[0] is results[1]
3 changes: 2 additions & 1 deletion utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
from . import data
from . import eda
from . import viz
from . import model

__all__ = ["config", "data", "eda", "viz"]
__all__ = ["config", "data", "eda", "viz", "model"]
96 changes: 96 additions & 0 deletions utils/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Modeling utilities for classification tasks."""

from __future__ import annotations

from functools import wraps
from typing import Callable, Tuple

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator


def cache_model(func: Callable) -> Callable:
    """Memoize a model-training function on the content of its arguments.

    The wrapped function is executed only the first time it is called with a
    given combination of arguments; later calls with equal arguments return
    the very same object that the first call produced.

    Cache keys are built as follows:

    * ``pandas`` DataFrames and Series (positional *or* keyword) are keyed by
      their type name, shape, column labels / series name, and a SHA-256
      digest of the per-row hashes (values and index). The digest is
      order-sensitive, so reordering rows or renaming columns produces a
      different key.
    * Every other value is keyed by its ``repr``.

    Positional and keyword arguments are keyed separately, so passing the
    same value once positionally and once by keyword results in two entries.

    Each decorated function gets its own cache. The cache is unbounded and
    lives as long as the decorated function does.

    Args:
        func: The training function to wrap.

    Returns:
        A wrapper with the same signature and metadata as ``func``.
    """
    # Local import so the block brings its own dependency into scope.
    import hashlib

    cache: dict = {}

    def _fingerprint(value) -> object:
        """Return a hashable, content-based stand-in for ``value``."""
        if isinstance(value, (pd.DataFrame, pd.Series)):
            row_hashes = pd.util.hash_pandas_object(value, index=True)
            # Digest the ordered row hashes instead of summing them: a sum is
            # order-insensitive and lets different data collide.
            digest = hashlib.sha256(row_hashes.to_numpy().tobytes()).hexdigest()
            # hash_pandas_object only covers values and index, so the labels
            # have to be added to the key explicitly.
            if isinstance(value, pd.DataFrame):
                labels = tuple(repr(col) for col in value.columns)
            else:
                labels = (repr(value.name),)
            return (type(value).__name__, value.shape, labels, digest)
        return repr(value)

    @wraps(func)
    def wrapper(*args, **kwargs):
        key = (
            func.__name__,
            tuple(_fingerprint(arg) for arg in args),
            tuple(
                (name, _fingerprint(val)) for name, val in sorted(kwargs.items())
            ),
        )
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapper


def train_test_split_data(
    df: pd.DataFrame,
    target: str,
    *,
    test_size: float = 0.2,
    random_state: int | None = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Separate ``target`` from ``df`` and split both into train/test parts.

    The split is stratified on the target column (``stratify=y``).

    Args:
        df: Frame holding the feature columns and the target column.
        target: Name of the column in ``df`` to use as the label.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``X_train, X_test, y_train, y_test`` in that order, exactly as
        produced by ``train_test_split``.
    """
    features = df.drop(columns=[target])
    labels = df[target]
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": labels,
    }
    return train_test_split(features, labels, **split_options)


@cache_model
def train_logistic_regression(
    X: pd.DataFrame,
    y: pd.Series,
    *,
    C: float = 1.0,
    max_iter: int = 1000,
) -> LogisticRegression:
    """Fit and return a ``LogisticRegression`` on ``X`` / ``y``.

    The function is wrapped by ``cache_model``, so repeated calls with
    arguments that map to the same cache key return the cached estimator.

    Args:
        X: Training features.
        y: Training labels.
        C: Passed to ``LogisticRegression`` as ``C``.
        max_iter: Passed to ``LogisticRegression`` as ``max_iter``.

    Returns:
        The fitted estimator.
    """
    estimator = LogisticRegression(n_jobs=None, max_iter=max_iter, C=C)
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return estimator.fit(X, y)


@cache_model
def train_random_forest_classifier(
    X: pd.DataFrame,
    y: pd.Series,
    *,
    n_estimators: int = 100,
    max_depth: int | None = None,
    random_state: int | None = None,
) -> RandomForestClassifier:
    """Fit and return a ``RandomForestClassifier`` on ``X`` / ``y``.

    The function is wrapped by ``cache_model``, so repeated calls with
    arguments that map to the same cache key return the cached estimator.

    Args:
        X: Training features.
        y: Training labels.
        n_estimators: Passed to the classifier as ``n_estimators``.
        max_depth: Passed to the classifier as ``max_depth``.
        random_state: Passed to the classifier as ``random_state``.

    Returns:
        The fitted estimator.
    """
    forest_options = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return RandomForestClassifier(**forest_options).fit(X, y)


def cross_validate_model(
    model: BaseEstimator,
    X: pd.DataFrame,
    y: pd.Series,
    *,
    cv: int = 5,
) -> np.ndarray:
    """Cross-validate ``model`` on ``X`` / ``y`` and return the scores.

    This is a thin wrapper around ``cross_val_score``; no scoring metric is
    specified here, so whatever that function uses by default applies.

    Args:
        model: Estimator to evaluate.
        X: Feature data.
        y: Labels.
        cv: Forwarded to ``cross_val_score`` as ``cv``.

    Returns:
        The array of scores returned by ``cross_val_score``.
    """
    return cross_val_score(model, X, y, cv=cv)