In [None]:
import pandas as pd
import numpy as np
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw penguin table and discard the measurement columns this
# experiment does not use.
df = pd.read_csv("penguins_size.csv")
df = df.drop(columns=["island", "culmen_depth_mm", "flipper_length_mm"])

# Keep only Adelie penguins; the species column is then constant, so drop it
# together with every row that still has a missing value.
df = df.loc[df["species"] == "Adelie"]
df = df.drop(columns=["species"]).dropna()

# Encode the label numerically: FEMALE -> +1.0, MALE -> -1.0.
df["sex"] = df["sex"].replace({'FEMALE': 1, 'MALE': -1}).astype(float)

# Every column except the last is a feature; standardise the features.
# NOTE(review): the scaler is fitted on all rows before any train/test split
# is made, so statistics of the future test rows influence the features.
scaler = StandardScaler()
X = scaler.fit_transform(df.iloc[:, :-1].values)

# The last column (sex) is the label, stored as a float column vector.
Y = df.iloc[:, -1].values.reshape(-1, 1).astype(float)

# Glue features and labels together so a single in-place shuffle keeps each
# row paired with its label; the fixed seed makes the row order reproducible.
X_and_Y = np.concatenate((X, Y), axis=1)
np.random.seed(1)
np.random.shuffle(X_and_Y)

# Logistic regression

# 20% training / 80% test

In [None]:
# Divide the data points into training set and test set.
# Columns 0-1 of the shuffled matrix are the features, column 2 is the label.
X_shuffled = X_and_Y[:,:2]
Y_shuffled = X_and_Y[:,2]

# This section is the 20%-training experiment: the first 20% of the shuffled
# rows train the model and the remaining 80% are held out for testing.
# (The previous factor of 0.8 made this cell a duplicate of the 80% section.)
split_index = int(0.2 * len(X_shuffled))  # 20% for training

X_train = X_shuffled[:split_index]
Y_train = Y_shuffled[:split_index]
X_test = X_shuffled[split_index:]
Y_test = Y_shuffled[split_index:]

# Print the shapes as a quick sanity check of the split sizes.
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

In [None]:
# Sigmoid function: sigmoid(z) = 1/(1 + e^(-z))
def sigmoid(z):
    """
    Numerically stable logistic function 1 / (1 + e^(-z)).

    z may be a scalar or an array-like. A scalar input yields a NumPy scalar;
    an array input yields an array of the same shape.

    The textbook form evaluates exp(-z), which overflows (RuntimeWarning, or
    FloatingPointError under np.errstate(over='raise')) once z is below about
    -709. Here the exponential is only ever taken of a non-positive number:
        z >= 0 :  1 / (1 + e^(-|z|))
        z <  0 :  e^(-|z|) / (1 + e^(-|z|))
    Both branches are algebraically equal to 1 / (1 + e^(-z)).
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in (0, 1], so it cannot overflow
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return out[()]
# Judge function: 1(a != b).
def judge(a, b):
    """
    Indicator of disagreement between a and b.
    Gives 1 when a != b and 0 when the two compare equal.
    """
    return 1 if a != b else 0
    
def f_logistic(x, W, b):
    """
    Logistic classifier f(x, W, b): predicts a hard label for one sample.

    x is a 2-dimensional feature vector,
    W is a 2-dimensional weight vector,
    b is a scalar bias.

    Returns 1 when sigmoid(W.x + b) is at least 0.5, otherwise -1.
    """
    probability = sigmoid(W.T.dot(x) + b)
    return 1 if probability >= 0.5 else -1
    
# Calculate error given feature vectors X and labels Y.
def calc_error(X, Y, W, b):
    """
    Average misclassification rate of the logistic classifier (W, b).

    Each sample in X is classified with f_logistic() and compared with its
    label in Y through judge(); the mismatch count is divided by len(X).
    """
    n_samples = len(X)
    mistakes = sum(
        judge(label, f_logistic(sample, W, b))
        for sample, label in zip(X, Y)
    )
    # Multiply by 1.0 so the result is always a float average.
    return 1.0 * mistakes / n_samples
# Gradient of L(W, b) with respect to W and b.
def grad_L_W_b(X, Y, W, b):
    """
    Gradient of the logistic loss with respect to the weights and the bias.

    With P = sigmoid(Y * (X.W + b)), every sample contributes (1 - P) * Y;
    grad_W is minus X^T applied to those contributions and grad_b is minus
    their sum (taken as a dot product with a vector of ones).

    Returns the pair (grad_W, grad_b).
    """
    scores = np.dot(X, W) + b
    P = sigmoid(Y * scores)
    delta = 1.0 - P
    weighted = delta * Y
    grad_W = -(X.T.dot(weighted))
    grad_b = -(np.ones(delta.shape).T.dot(weighted))
    return grad_W, grad_b
# Loss L(W, b).
def L_W_b(X, Y, W, b):
    """
    Logistic loss  L(W, b) = -sum_i log(sigmoid(y_i * (x_i . W + b))).

    X holds one sample per row, Y the matching labels, W the weight vector
    and b the scalar bias. The summed (not averaged) loss is returned.

    The loss is evaluated as log(1 + e^(-m)) via np.logaddexp(0, -m), where m
    is the per-sample margin. Taking log(sigmoid(m)) directly gives
    log(0) = -inf (and an infinite loss) once a margin is strongly negative;
    logaddexp stays finite there.
    """
    margins = Y * (np.dot(X, W) + b)
    # -log(sigmoid(m)) == log(e^0 + e^(-m)) == logaddexp(0, -m)
    per_sample = np.logaddexp(0, -margins)
    # Dotting with a vector of ones sums the per-sample losses.
    loss = np.dot(np.ones_like(Y).T, per_sample)

    return loss
def logistic_regression(X_train, Y_train, learning_rate=0.001, iterations=10000):
    """
    Fit a logistic-regression classifier with plain batch gradient descent.

    Parameters
    ----------
    X_train : array of shape (n_samples, n_features)
        Training feature matrix. The weight vector is sized from its second
        dimension, so any number of features is supported.
    Y_train : array of shape (n_samples,)
        Training labels (expected to be -1 / +1, matching f_logistic).
    learning_rate : float, default 0.001
        Step size of each gradient-descent update.
    iterations : int, default 10000
        Number of gradient-descent updates to perform.

    Returns
    -------
    (W, b, losses)
        The learned weights, the learned bias, and the list of training
        losses recorded after every update.
    """
    losses = []  # Training-loss history, one entry per iteration.

    # Step 1. Initialize the parameters W, b (one weight per feature column).
    n_features = np.asarray(X_train).shape[1]
    W = np.zeros(n_features)  # Weight.
    b = 0.0                   # Bias.

    # Logistic regression learning algorithm.
    for _ in range(iterations):
        # Step 2. Compute the partial derivatives.
        grad_W, grad_b = grad_L_W_b(X_train, Y_train, W, b)
        # Step 3. Update the parameters.
        W = W - learning_rate * grad_W
        b = b - learning_rate * grad_b

        # Track the training loss at the updated parameters.
        losses.append(L_W_b(X_train, Y_train, W, b))

    return W, b, losses

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values for LogisticRegression's C parameter.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Best candidate seen so far; a candidate only wins with a strictly higher
# mean score, so ties keep the earlier (smaller) C.
best_C, best_score = None, 0

# Cross validation: score every candidate C on the current training split.
for C in C_values:
    model = LogisticRegression(C=C, solver='lbfgs')
    scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation
    avg_score = scores.mean()

    print(f"C: {C}, Average Cross-Validation Score: {avg_score}")

    if avg_score > best_score:
        best_score, best_C = avg_score, C

print(f"Best C: {best_C} with score: {best_score}")

# Refit on the whole training split with the selected C.
final_model = LogisticRegression(C=best_C, solver='lbfgs').fit(X_train, Y_train)

# Accuracy and error (1 - accuracy) on the training split.
Y_train_pred = final_model.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
training_error = 1 - training_accuracy
print(f"Training accuracy: {training_accuracy}")
print(f"Training error: {training_error}")

# Accuracy and error on the held-out test split.
Y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_error = 1 - test_accuracy
print(f"Test accuracy: {test_accuracy}")
print(f"Test error: {test_error}")

# 50% training / 50% test

In [None]:
# Divide the data points into training set and test set.
# The first two columns of the shuffled matrix hold the features, the third
# column holds the label.
X_shuffled, Y_shuffled = X_and_Y[:, :2], X_and_Y[:, 2]

# The leading half of the shuffled rows trains the model; the rest is held out.
split_index = int(0.5 * len(X_shuffled))  # 50% for training

X_train, X_test = X_shuffled[:split_index], X_shuffled[split_index:]
Y_train, Y_test = Y_shuffled[:split_index], Y_shuffled[split_index:]

# Print the shapes as a quick sanity check of the split sizes.
for subset in (X_train, Y_train, X_test, Y_test):
    print(subset.shape)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values for LogisticRegression's C parameter.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Best candidate seen so far; a candidate only wins with a strictly higher
# mean score, so ties keep the earlier (smaller) C.
best_C, best_score = None, 0

# Cross validation: score every candidate C on the current training split.
for C in C_values:
    model = LogisticRegression(C=C, solver='lbfgs')
    scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation
    avg_score = scores.mean()

    print(f"C: {C}, Average Cross-Validation Score: {avg_score}")

    if avg_score > best_score:
        best_score, best_C = avg_score, C

print(f"Best C: {best_C} with score: {best_score}")

# Refit on the whole training split with the selected C.
final_model = LogisticRegression(C=best_C, solver='lbfgs').fit(X_train, Y_train)

# Accuracy and error (1 - accuracy) on the training split.
Y_train_pred = final_model.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
training_error = 1 - training_accuracy
print(f"Training accuracy: {training_accuracy}")
print(f"Training error: {training_error}")

# Accuracy and error on the held-out test split.
Y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_error = 1 - test_accuracy
print(f"Test accuracy: {test_accuracy}")
print(f"Test error: {test_error}")

# 80% training / 20% test

In [None]:
# Divide the data points into training set and test set.
# The first two columns of the shuffled matrix hold the features, the third
# column holds the label.
X_shuffled, Y_shuffled = X_and_Y[:, :2], X_and_Y[:, 2]

# The leading 80% of the shuffled rows trains the model; the rest is held out.
split_index = int(0.8 * len(X_shuffled))  # 80% for training

X_train, X_test = X_shuffled[:split_index], X_shuffled[split_index:]
Y_train, Y_test = Y_shuffled[:split_index], Y_shuffled[split_index:]

# Print the shapes as a quick sanity check of the split sizes.
for subset in (X_train, Y_train, X_test, Y_test):
    print(subset.shape)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values for LogisticRegression's C parameter.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Best candidate seen so far; a candidate only wins with a strictly higher
# mean score, so ties keep the earlier (smaller) C.
best_C, best_score = None, 0

# Cross validation: score every candidate C on the current training split.
for C in C_values:
    model = LogisticRegression(C=C, solver='lbfgs')
    scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation
    avg_score = scores.mean()

    print(f"C: {C}, Average Cross-Validation Score: {avg_score}")

    if avg_score > best_score:
        best_score, best_C = avg_score, C

print(f"Best C: {best_C} with score: {best_score}")

# Refit on the whole training split with the selected C.
final_model = LogisticRegression(C=best_C, solver='lbfgs').fit(X_train, Y_train)

# Accuracy and error (1 - accuracy) on the training split.
Y_train_pred = final_model.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
training_error = 1 - training_accuracy
print(f"Training accuracy: {training_accuracy}")
print(f"Training error: {training_error}")

# Accuracy and error on the held-out test split.
Y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_error = 1 - test_accuracy
print(f"Test accuracy: {test_accuracy}")
print(f"Test error: {test_error}")