### Data load and preprocessing

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Pull the penguins dataset through seaborn and keep only the rows that have
# no missing value in any column (the original row index is preserved).
df = sns.load_dataset("penguins").dropna()

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [None]:
# Column dtypes first, then the (rows, columns) shape, one per line.
print(df.dtypes, df.shape, sep="\n")

# Distinct labels found in the 'species' column, in order of first appearance.
unique_species = df["species"].unique()
print(unique_species)

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object
(333, 7)
['Adelie' 'Chinstrap' 'Gentoo']


In [None]:
# Reduce the task to two classes by keeping only Adelie and Chinstrap rows.
selected_classes = ['Adelie', 'Chinstrap']
keep_rows = df['species'].isin(selected_classes)
# Work on an independent copy so the column added below does not trigger
# pandas' chained-assignment warning.
df_filtered = df.loc[keep_rows].copy()

# Turn the species strings into integer class codes and store them alongside
# the original labels.
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# Print each label next to its code as a sanity check on the encoding.
print(df_filtered[['species', 'class_encoded']])

# Features (X): everything except the label columns and the two categorical
# columns. Target (y): the integer-encoded species.
X = df_filtered.drop(columns=['species', 'island', 'sex', 'class_encoded'])
y = df_filtered['class_encoded']

       species  class_encoded
0       Adelie              0
1       Adelie              0
2       Adelie              0
4       Adelie              0
5       Adelie              0
..         ...            ...
215  Chinstrap              1
216  Chinstrap              1
217  Chinstrap              1
218  Chinstrap              1
219  Chinstrap              1

[214 rows x 2 columns]


### Training LR model

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit logistic regression with the SAGA solver on the raw (unscaled) features.
# SAGA shuffles the training samples while it optimises, so random_state is
# pinned here; without it the learned weights differ between kernel restarts.
# The features are left unscaled in this cell: the notebook later compares this
# fit against one trained on StandardScaler-transformed inputs.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)



Logistic Regression with saga solver (Unscaled)

In [None]:
# Score the unscaled SAGA model on the held-out rows.
y_pred = logreg.predict(X_test)
accuracy_saga = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy_saga)

# Fitted feature weights and bias term, for comparison with the other solvers.
print(logreg.coef_, logreg.intercept_)

Accuracy: 0.5813953488372093
[[ 2.76064396e-03 -8.35374683e-05  4.51815205e-04 -2.85979528e-04]] [-8.53947064e-06]


In [None]:
# Size of the training split as (rows, feature columns).
print(X_train.shape)


(171, 4)


Logistic Regression with liblinear solver (Unscaled)

In [None]:
# Same unscaled train/test split as before, but optimised with the liblinear
# solver. Note that this rebinds `logreg`, replacing the SAGA model.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Evaluate on the held-out rows and show the fitted weights and bias.
y_pred = logreg.predict(X_test)
accuracy_liblinear = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy_liblinear)
print(logreg.coef_, logreg.intercept_)

Accuracy: 1.0
[[ 1.45422752 -0.93943994 -0.16571368 -0.00398663]] [-0.04793176]


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that one transform to both splits so that no statistics
# from the test rows influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Logistic Regression with liblinear solver (Scaled)

In [None]:
# liblinear trained on the standardized features; scored on the standardized
# test split.
logreg_liblinear_scaled = LogisticRegression(solver='liblinear').fit(X_train_scaled, y_train)
y_pred_liblinear_scaled = logreg_liblinear_scaled.predict(X_test_scaled)
accuracy_liblinear_scaled = accuracy_score(y_true=y_test, y_pred=y_pred_liblinear_scaled)

Logistic Regression with saga solver (Scaled)

In [None]:
# SAGA trained on the standardized features; scored on the standardized test
# split. SAGA shuffles the training samples during optimisation, so
# random_state is pinned to make the fitted model repeatable across runs.
logreg_saga_scaled = LogisticRegression(solver='saga', random_state=42)
logreg_saga_scaled.fit(X_train_scaled, y_train)
y_pred_saga_scaled = logreg_saga_scaled.predict(X_test_scaled)
accuracy_saga_scaled = accuracy_score(y_test, y_pred_saga_scaled)

In [None]:
# Side-by-side summary of the four solver / scaling combinations fitted above.
accuracy_report = [
    ("Accuracy with liblinear (without scaling):", accuracy_liblinear),
    ("Accuracy with liblinear (with scaling):", accuracy_liblinear_scaled),
    ("Accuracy with saga (without scaling):", accuracy_saga),
    ("Accuracy with saga (with scaling):", accuracy_saga_scaled),
]
for caption, score in accuracy_report:
    print(caption, score)

Accuracy with liblinear (without scaling): 1.0
Accuracy with liblinear (with scaling): 0.9767441860465116
Accuracy with saga (without scaling): 0.5813953488372093
Accuracy with saga (with scaling): 0.9767441860465116


### Logistic regression with multiple features

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Start this section from a fresh copy of the penguins dataset, discarding
# every row that has a missing value.
df = sns.load_dataset("penguins").dropna()

# Keep only the two species used for the binary classifier. The copy makes
# df_filtered independent of df before new columns are attached to it.
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df.loc[df['species'].isin(selected_classes)].copy()

In [None]:
# Integer-encode the species labels and attach the codes as a
# 'class_encoded' column next to the original strings.
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered = df_filtered.assign(class_encoded=y_encoded)
df_filtered.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,class_encoded
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,0


In [None]:
# Target: the integer-encoded species.
y = df_filtered['class_encoded']

# Features: every remaining column, with the categorical 'sex' and 'island'
# columns expanded into one indicator column per category (one-hot encoding).
X = pd.get_dummies(
    df_filtered.drop(columns=['species', 'class_encoded']),
    columns=['sex', 'island'],
)
X.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_Female,sex_Male,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,False,True,False,False,True
1,39.5,17.4,186.0,3800.0,True,False,False,False,True
2,40.3,18.0,195.0,3250.0,True,False,False,False,True
4,36.7,19.3,193.0,3450.0,True,False,False,False,True
5,39.3,20.6,190.0,3650.0,False,True,False,False,True


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit SAGA logistic regression on the numeric + one-hot features.
# SAGA shuffles the training samples while it optimises, so random_state is
# pinned to make the fitted weights repeatable across kernel restarts.
# NOTE(review): the inputs are not scaled in this cell. The earlier section of
# this notebook records much lower SAGA accuracy on unscaled inputs than on
# standardized ones -- confirm that leaving them unscaled here is intended.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model and show the fitted weights and bias.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(logreg.coef_, logreg.intercept_)

Accuracy: 0.5813953488372093
[[ 2.75621550e-03 -8.12931606e-05  4.67169024e-04 -2.86847179e-04
  -1.90196940e-05  1.05611247e-05 -8.83324886e-05  1.85381302e-04
  -1.05507382e-04]] [-8.45860583e-06]


