# 05 Train Alternative Models

This notebook trains and compares a Logistic Regression and a Random Forest model.

In [13]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from ds_logger import start_logging, end_logging

notebook_description = "Trains and compares Logistic Regression and Random Forest models."
start_logging(notebook_name='05_train_alternative_model.ipynb', notebook_description=notebook_description)

In [14]:
# Find the latest feature data file
processed_data_dir = '../data/processed'
latest_feature_file = max([os.path.join(processed_data_dir, f) for f in os.listdir(processed_data_dir) if f.startswith('iris_features') and f.endswith('.csv')], key=os.path.getctime)
df = pd.read_csv(latest_feature_file)

# Prepare data for modeling
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Train and evaluate Logistic Regression
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_preds)
joblib.dump(lr_model, '../models/iris_log_reg.joblib')

# Train and evaluate Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)
joblib.dump(rf_model, '../models/iris_random_forest.joblib')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['../models/iris_random_forest_20250704_1529.joblib']

In [16]:
results = {
    'logistic_regression': {
        'hyperparameters': lr_model.get_params(),
        'accuracy': lr_accuracy
    },
    'random_forest': {
        'hyperparameters': rf_model.get_params(),
        'accuracy': rf_accuracy
    }
}
end_logging(results=results)