## Logistic Regression Workshop

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# 1) Load iris datasets from iris-data-clean.csv
df = pd.read_csv('iris-data-clean.csv')

# Replace the values in the columns 'Class'
df['class'] = df['class'].map({'Setosa': 0, 'Versicolor': 1, 'Virginica': 2})

# 2) Using Logistic Regression, classify the outcome based on all four features
X = df[['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']]
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model_4features = LogisticRegression(multi_class='multinomial', max_iter=1000)
model_4features.fit(X_train, y_train)

# a) Provide some values to predict the outcome
print("\n===== 4-Feature Model =====")
y_pred_4f = model_4features.predict(X_test)

sample_data_4f = [
    [5.1, 3.5, 1.4, 0.2],  # Setosa
    [7.0, 3.2, 4.7, 1.4],  # Versicolor
    [6.3, 3.3, 6.0, 2.5],  # Virginica
]

sample_predictions = model_4features.predict(sample_data_4f)
sample_probabilities = model_4features.predict_proba(sample_data_4f)

class_names = {0: 'Setosa', 1: 'Virginica', 2: 'Versicolor'}

print("\nSample predictions:\n")
for i, sample in enumerate(sample_data_4f):
    predicted_class = sample_predictions[i]
    probabilities = sample_probabilities[i]
    print(f"Sample {i+1}: {sample}")
    print(f"Predicted class: {predicted_class} ({class_names[predicted_class]})")
    print(f"Probabilities: Setosa: {probabilities[0]:.3f}, Virginica: {probabilities[1]:.3f}, Versicolor: {probabilities[2]:.3f}")
    print()

# b) Validate the model - print the confusion matrix and the accuracy score
accuracy_4f = accuracy_score(y_test, y_pred_4f)
cm_4f = confusion_matrix(y_test, y_pred_4f)

print(f"4-Feature Model Accuracy: {accuracy_4f:.4f}")
print("\n4-Feature Model Confusion Matrix:")
print(cm_4f)

# 3) Redo the above steps with any two features
print("\n===== 2-Feature Model =====")
X_2features = df[['petal_length_cm', 'petal_width_cm']]

X_train_2f, X_test_2f, y_train_2f, y_test_2f = train_test_split(X_2features, y, test_size=0.3, random_state=42)

model_2features = LogisticRegression(multi_class='multinomial', max_iter=1000)
model_2features.fit(X_train_2f, y_train_2f)

y_pred_2f = model_2features.predict(X_test_2f)

sample_data_2f = [
    [1.4, 0.2],  # Setosa
    [4.7, 1.4],  # Versicolor
    [6.0, 2.5],  # Virginica
]

sample_predictions_2f = model_2features.predict(sample_data_2f)
sample_probabilities_2f = model_2features.predict_proba(sample_data_2f)

print("\n2-Feature Model - Sample predictions:\n")
for i, sample in enumerate(sample_data_2f):
    predicted_class = sample_predictions_2f[i]
    probabilities = sample_probabilities_2f[i]
    print(f"Sample {i+1}: {sample}")
    print(f"Predicted class: {predicted_class} ({class_names[predicted_class]})")
    print(f"Probabilities: Setosa: {probabilities[0]:.3f}, Virginica: {probabilities[1]:.3f}, Versicolor: {probabilities[2]:.3f}")
    print()

accuracy_2f = accuracy_score(y_test_2f, y_pred_2f)
cm_2f = confusion_matrix(y_test_2f, y_pred_2f)

print(f"2-Feature Model Accuracy: {accuracy_2f:.4f}")
print("\n2-Feature Model Confusion Matrix:")
print(cm_2f)

# a) Compare the accuracy score with the model built in the above with four features
print("\n===== Model Comparison =====")
print(f"4-Feature Model Accuracy: {accuracy_4f:.4f}")
print(f"2-Feature Model Accuracy: {accuracy_2f:.4f}")
print(f"Accuracy Difference: {abs(accuracy_4f - accuracy_2f):.4f}")

if accuracy_4f > accuracy_2f:
    print("4-Feature Model performs better.")
elif accuracy_4f < accuracy_2f:
    print("2-Feature Model performs better.")
else:
    print("Both models perform equally.")


===== 4-Feature Model =====

Sample predictions:

Sample 1: [5.1, 3.5, 1.4, 0.2]
Predicted class: 0 (Setosa)
Probabilities: Setosa: 0.973, Virginica: 0.027, Versicolor: 0.000

Sample 2: [7.0, 3.2, 4.7, 1.4]
Predicted class: 1 (Virginica)
Probabilities: Setosa: 0.003, Virginica: 0.823, Versicolor: 0.173

Sample 3: [6.3, 3.3, 6.0, 2.5]
Predicted class: 2 (Versicolor)
Probabilities: Setosa: 0.000, Virginica: 0.007, Versicolor: 0.993

4-Feature Model Accuracy: 0.8864

4-Feature Model Confusion Matrix:
[[14  0  0]
 [ 0 11  1]
 [ 0  4 14]]

===== 2-Feature Model =====

2-Feature Model - Sample predictions:

Sample 1: [1.4, 0.2]
Predicted class: 0 (Setosa)
Probabilities: Setosa: 0.969, Virginica: 0.031, Versicolor: 0.000

Sample 2: [4.7, 1.4]
Predicted class: 1 (Virginica)
Probabilities: Setosa: 0.004, Virginica: 0.790, Versicolor: 0.206

Sample 3: [6.0, 2.5]
Predicted class: 2 (Versicolor)
Probabilities: Setosa: 0.000, Virginica: 0.009, Versicolor: 0.991

2-Feature Model Accuracy: 0.8864

2

