## Logistic Regression Workshop

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned iris measurements (path is relative to the notebook's working directory).
df = pd.read_csv('iris-data-clean.csv')
print(df.head())

# Integer codes for the species labels.
# Note the order is not alphabetical: Virginica is 1 and Versicolor is 2.
class_mapping = {
    "Setosa": 0,
    "Virginica": 1,
    "Versicolor": 2
}

# Series.map does a plain label -> code lookup, so it does not depend on the
# implicit downcasting that Series.replace performs (deprecated in recent pandas,
# where it emits a FutureWarning). Labels absent from class_mapping come back
# as NaN, so fail loudly here instead of letting them reach the model.
class_codes = df['class'].map(class_mapping)
unmapped_rows = class_codes.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df.loc[unmapped_rows, 'class'].astype(str).unique())
    raise ValueError(f"Unmapped class labels in 'class' column: {unknown_labels}")

# map() yields floats whenever NaN was possible; store explicit integer codes.
df['class'] = class_codes.astype('int64')
print(df['class'].value_counts().sort_index())

   sepal_length_cm  sepal_width_cm  petal_length_cm  petal_width_cm   class
0              5.1             3.5              1.4             0.2  Setosa
1              4.9             3.0              1.4             0.2  Setosa
2              4.7             3.2              1.3             0.2  Setosa
3              4.6             3.1              1.5             0.2  Setosa
4              5.0             3.6              1.4             0.2  Setosa
class
0    45
1    50
2    50
Name: count, dtype: int64


  df['class'] = df['class'].replace(class_mapping)


In [6]:
# Features: all four measurements. Target: the integer-encoded species column.
X = df[['sepal_length_cm','sepal_width_cm','petal_length_cm','petal_width_cm']]
y = df['class']
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified, so class proportions may differ between train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LogisticRegression(max_iter=200, random_state=42)
model.fit(X_train, y_train)

# Predict a single hand-written flower; values are listed in the same order as X's columns.
sample_data = np.array([[6.5, 3.0, 5.2, 2.0]])
# The model was fitted on a DataFrame, so hand it a DataFrame with the same
# column names: this avoids scikit-learn's feature-name mismatch warning and
# makes the value -> feature pairing explicit.
sample_frame = pd.DataFrame(sample_data, columns=X.columns)
predictions = model.predict(sample_frame)
print(predictions)

# Validate on the held-out rows.
print("\n validate:")
y_pred = model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
print("\nconfusin_matrix：")
print(cm)

# Accuracy: fraction of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"\naccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

[1]

 validate:

confusin_matrix：
[[14  0  0]
 [ 0 14  4]
 [ 0  1 11]]

accuracy: 0.8864 (88.64%)




In [9]:
# Repeat the experiment using only the two sepal measurements, to compare
# against the four-feature model trained in the previous cell.
# Relies on `y` and `accuracy` already being defined by that cell.
sepal_columns = ['sepal_length_cm', 'sepal_width_cm']
X_2features = df[sepal_columns]

# Same test fraction and seed as the four-feature split.
split_2f = train_test_split(X_2features, y, test_size=0.3, random_state=42)
X_train_2f, X_test_2f, y_train_2f, y_test_2f = split_2f

# fit() returns the estimator itself, so construction and training can be chained.
model_2features = LogisticRegression(max_iter=200, random_state=42).fit(
    X_train_2f, y_train_2f
)

# Score the reduced model on its held-out rows.
y_pred_2f = model_2features.predict(X_test_2f)
accuracy_2f = accuracy_score(y_test_2f, y_pred_2f)

# Report both accuracies one after the other.
print(f"\n2 features: accuracy: {accuracy_2f:.4f} ({accuracy_2f*100:.2f}%)")
print(f"\n4 features: accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")


2 features: accuracy: 0.7500 (75.00%)

4 features: accuracy: 0.8864 (88.64%)
