# Task 2: Classification with Logistic Regression

### Build a decision tree classifier to predict a categorical outcome (e.g., predict species of flowers)

- Preprocess the data (e.g., handling categorical features, feature scaling).
- Train and evaluate the logistic regression model.
- Use metrics such as accuracy, precision, recall, and the ROC curve for evaluation.
- Compare logistic regression with other classifiers like Random Forest or SVM.

In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "1"

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt

In [3]:
iris_df = pd.read_csv("data/iris.csv")
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
iris_df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [5]:
# Define features
numeric_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Create the preprocessor for X
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ])

In [6]:
# Split the data (Assuming 'df' is your Iris dataframe)
X = iris_df[numeric_features]
y = iris_df['species']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# --- EXECUTE PREPROCESSING ---
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

In [8]:
models = {
    "Logistic Regression": LogisticRegression(multi_class='ovr'),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True)
}

In [9]:
results = []

for name, model in models.items():
    # Fit
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_probs = model.predict_proba(X_test) # Returns probabilities for each class
    
    # Calculate Metrics 
    # (Using 'weighted' average to account for multi-class nature)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "AUC": roc_auc_score(y_test, y_probs, multi_class='ovr')
    })

# Compare Results
results_df = pd.DataFrame(results).set_index("Model")
print(results_df)



                     Accuracy  Precision    Recall       AUC
Model                                                       
Logistic Regression  0.966667   0.969444  0.966667  0.994878
Random Forest        1.000000   1.000000  1.000000  1.000000
SVM                  1.000000   1.000000  1.000000  1.000000
