# Binary Logistic Regression

## 1. Do an exploratory data analysis on the iris dataset from the seaborn library. (Done in the previous problem)

## 2. First use scikit-learn, then PyTorch, to build a binary logistic regression model, carrying out all the necessary steps of the machine learning process.
* Features: `sepal_length`, `sepal_width`, `petal_length` and `petal_width`
* Target: `species` is `setosa` or not

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the iris measurements bundled with seaborn.
iris = sns.load_dataset('iris')

# Binary target: 1 when the species is 'setosa', 0 for every other species.
# A vectorized comparison replaces the row-by-row ``apply`` with a lambda;
# the explicit int64 cast keeps the dtype the lambda version produced.
iris['is_setosa'] = (iris['species'] == 'setosa').astype('int64')

# Feature matrix and target vector. The column order used here is the order
# the scaler and the models are fitted on.
X = iris[['petal_length', 'petal_width', 'sepal_length', 'sepal_width']]
y = iris['is_setosa']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then reused, unchanged, on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit scikit-learn's logistic regression with its default settings.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predict class labels for the held-out rows.
y_pred = model.predict(X_test_scaled)

# Score the predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 1.0
Confusion Matrix:
[[20  0]
 [ 0 10]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [3]:
# Build float32 tensors from the scaled feature arrays.
X_train_tensor, X_test_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train_scaled, X_test_scaled)
)

# Targets become float32 column vectors of shape (n_samples, 1), matching the
# single-output layer of the PyTorch model.
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.to_numpy(), dtype=torch.float32).unsqueeze(1)
    for labels in (y_train, y_test)
)

# Define the Logistic Regression model in PyTorch
class LogisticRegressionModel(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        n_features: Number of input features per sample. Defaults to 4,
            the width that was previously hard-coded, so existing
            ``LogisticRegressionModel()`` calls and saved state dicts
            keep working.
    """

    def __init__(self, n_features: int = 4) -> None:
        super().__init__()
        # A single output unit: one score per sample for the binary target.
        self.linear = nn.Linear(n_features, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid of the linear layer's output for ``x``."""
        return torch.sigmoid(self.linear(x))

# Create the model, loss function, and optimizer.
# NOTE: no random seed is set in this cell, so the initial weights (and
# therefore the exact loss values printed during training) can differ
# between runs.
model_pytorch = LogisticRegressionModel()
criterion = nn.BCELoss()  # takes probabilities; the model's forward() applies the sigmoid
optimizer = optim.SGD(model_pytorch.parameters(), lr=0.01)

# Train the PyTorch model. Each epoch is one gradient step computed on the
# whole training tensor (no mini-batching).
epochs = 1000
model_pytorch.train()  # loop-invariant: switch to training mode once, not every epoch
for epoch in range(epochs):
    # Forward pass. NOTE: this rebinds the global ``y_pred`` that the
    # scikit-learn cell also assigns.
    y_pred = model_pytorch(X_train_tensor)

    # Compute loss
    loss = criterion(y_pred, y_train_tensor)

    # Backward pass and optimization: clear stale gradients, backpropagate,
    # then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report progress every 100 epochs.
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

# Evaluate the PyTorch model on the held-out tensors without tracking gradients.
model_pytorch.eval()
with torch.no_grad():
    y_test_pred = model_pytorch(X_test_tensor)
    # Threshold the predicted probabilities at 0.5 to get class labels.
    y_test_pred_class = (y_test_pred >= 0.5).float()
    # Hand scikit-learn explicit 1-D NumPy arrays instead of relying on its
    # implicit conversion of (n, 1) torch tensors.
    accuracy = accuracy_score(
        y_test_tensor.numpy().ravel(),
        y_test_pred_class.numpy().ravel(),
    )
    print(f'PyTorch Model - Accuracy: {accuracy:.4f}')

Epoch [100/1000], Loss: 0.3768
Epoch [200/1000], Loss: 0.2559
Epoch [300/1000], Loss: 0.1953
Epoch [400/1000], Loss: 0.1587
Epoch [500/1000], Loss: 0.1342
Epoch [600/1000], Loss: 0.1165
Epoch [700/1000], Loss: 0.1031
Epoch [800/1000], Loss: 0.0926
Epoch [900/1000], Loss: 0.0842
Epoch [1000/1000], Loss: 0.0772
PyTorch Model - Accuracy: 1.0000


In [4]:
# Save both the scikit-learn and the PyTorch models.
from pathlib import Path

import joblib

# Output directory, defined once. Previously this absolute path was repeated
# in every save call; the resulting file paths are unchanged.
MODEL_DIR = Path('/Users/mac/Desktop/Home/Year 5/NLP/NLP-Project/src/P1/PDS-Regression/model')

# Save the sklearn binary logistic regression model and the scaler it was
# trained with (the scaler is needed to preprocess inputs the same way).
joblib.dump(model, MODEL_DIR / 'sklearn_iris_binary_lgr_model.pkl')
joblib.dump(scaler, MODEL_DIR / 'scaler_binary_lgr_model.pkl')

# Save the PyTorch binary logistic regression model. Only the state_dict
# (the parameters) is written, so loading requires constructing a
# LogisticRegressionModel first and calling load_state_dict on it.
torch.save(
    obj=model_pytorch.state_dict(),
    f=MODEL_DIR / 'pytorch_iris_binary_lgr_model.pkl',
)


In [5]:
# Show the held-out feature rows, then the same rows with their true label.
display(X_test)
# assign() returns a new frame carrying the extra column. Rebinding X_test
# gives the same end state as the in-place column write, but avoids writing
# into a frame that train_test_split derived from X (which some pandas /
# scikit-learn versions may flag with SettingWithCopyWarning).
# NOTE: after this cell X_test also carries the target column.
X_test = X_test.assign(is_setosa=y_test.values)
display(X_test)

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width
73,4.7,1.2,6.1,2.8
18,1.7,0.3,5.7,3.8
118,6.9,2.3,7.7,2.6
78,4.5,1.5,6.0,2.9
76,4.8,1.4,6.8,2.8
31,1.5,0.4,5.4,3.4
64,3.6,1.3,5.6,2.9
141,5.1,2.3,6.9,3.1
68,4.5,1.5,6.2,2.2
82,3.9,1.2,5.8,2.7


Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,is_setosa
73,4.7,1.2,6.1,2.8,0
18,1.7,0.3,5.7,3.8,1
118,6.9,2.3,7.7,2.6,0
78,4.5,1.5,6.0,2.9,0
76,4.8,1.4,6.8,2.8,0
31,1.5,0.4,5.4,3.4,1
64,3.6,1.3,5.6,2.9,0
141,5.1,2.3,6.9,3.1,0
68,4.5,1.5,6.2,2.2,0
82,3.9,1.2,5.8,2.7,0


In [7]:
# Preview ten randomly drawn rows; no seed is passed, so the rows vary per run.
iris.sample(n=10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,is_setosa
38,4.4,3.0,1.3,0.2,setosa,1
131,7.9,3.8,6.4,2.0,virginica,0
36,5.5,3.5,1.3,0.2,setosa,1
87,6.3,2.3,4.4,1.3,versicolor,0
109,7.2,3.6,6.1,2.5,virginica,0
134,6.1,2.6,5.6,1.4,virginica,0
56,6.3,3.3,4.7,1.6,versicolor,0
132,6.4,2.8,5.6,2.2,virginica,0
112,6.8,3.0,5.5,2.1,virginica,0
20,5.4,3.4,1.7,0.2,setosa,1
