In [1]:
import pandas as pd
import numpy as np
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import scipy
from matplotlib.colors import ListedColormap
from functools import partial

In [2]:
# Load the raw table. The file has no header row, so pandas labels the
# columns with integers by position.
df = pd.read_csv('glass.data', header=None, delimiter=',', dtype=float)
# Keep only columns 7, 9 and 10: two feature columns plus the class label
# stored in column 10.
df = df.drop([0,1,2,3,4,5,6,8], axis=1)
# Restrict to classes 1 and 2. The explicit .copy() makes the filtered frame
# independent of the original, so the label assignment below does not raise
# pandas' SettingWithCopyWarning.
df = df[df[10].isin([1.0, 2.0])].copy()
# Re-encode the labels as +1 (class 1) and -1 (class 2), stored as floats.
df[10] = df[10].replace({1.0: 1, 2.0: -1}).astype(float)

In [3]:
from sklearn.preprocessing import StandardScaler

# Features are every column except the last; the last column is the +1/-1 label.
X = df.iloc[:, :-1].values

# NOTE(review): the scaler is fitted on ALL rows before the train/test split,
# so test-set statistics influence the training features. Consider fitting on
# the training portion only if an unbiased test estimate matters.
scaler = StandardScaler()
X = scaler.fit_transform(X)
Y = df.iloc[:, -1].values
Y = Y.reshape(-1, 1).astype(float)  # column vector so it can be stacked next to X
X_and_Y = np.hstack((X, Y))     # Stack them together for shuffling.
np.random.seed(1)               # Set the random seed.
np.random.shuffle(X_and_Y)      # Shuffle the data points in X_and_Y array

print(X.shape)
print(Y.shape)
print(X_and_Y[0])
# Divide the data points into training set and test set.
# The label is the last column of X_and_Y; indexing relative to the end keeps
# this correct if the number of feature columns changes.
X_shuffled = X_and_Y[:, :-1]
Y_shuffled = X_and_Y[:, -1]

split_index = int(0.8 * len(X_shuffled))  # 80% for training

X_train = X_shuffled[:split_index]
Y_train = Y_shuffled[:split_index]
X_test = X_shuffled[split_index:]
Y_test = Y_shuffled[split_index:]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(146, 2)
(146, 1)
[-0.27868438 -0.69899332 -1.        ]
(116, 2)
(116,)
(30, 2)
(30,)


# Logistic regression

# 20% training / 80% test split

In [4]:
# Divide the data points into training set and test set.
# The label is the last column of X_and_Y; indexing relative to the end avoids
# hard-coding the number of feature columns.
X_shuffled = X_and_Y[:, :-1]
Y_shuffled = X_and_Y[:, -1]

split_index = int(0.2 * len(X_shuffled))  # 20% for training

# Rows before split_index form the training set, the remainder the test set.
X_train = X_shuffled[:split_index]
Y_train = Y_shuffled[:split_index]
X_test = X_shuffled[split_index:]
Y_test = Y_shuffled[split_index:]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(29, 2)
(29,)
(117, 2)
(117,)


In [5]:
# Sigmoid function: sigmoid(z) = 1/(1 + e^(-z))
def sigmoid(z):
    """
    Numerically stable logistic sigmoid, 1 / (1 + e^(-z)).

    z may be a scalar or an array-like of numbers. A scalar input yields a
    NumPy scalar; an array input yields an array of the same shape.

    The naive formula evaluates e^(-z), which overflows for large negative z.
    Here only e^(-|z|) is evaluated, which is always in (0, 1].
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer/bool inputs are promoted so the division below is floating point.
        z = z.astype(float)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^(-z));  z < 0: e^(z) / (1 + e^(z)) -- the same value.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op otherwise.
    return result[()]
# Judge function: 1(a != b).
def judge(a, b):
    """
    Indicator of disagreement, 1(a != b).

    Evaluates to 1 when the two arguments compare unequal and to 0 when
    they compare equal.
    """
    return 1 if a != b else 0
    
def f_logistic(x, W, b):
    """
    Logistic classifier: f(x, W, b)
    This function returns -1 or 1.

    x should be a feature vector,
    W should be a weight vector of the same length as x,
    b should be a scalar.

    sigmoid(s) >= 0.5 holds exactly when the score s = W.x + b is >= 0, so the
    sign of the score is tested directly. This avoids evaluating an
    exponential (which can overflow for very negative scores) and avoids the
    rounding that makes sigmoid(s) equal 0.5 for tiny negative s.
    """
    score = W.T.dot(x) + b
    return 1 if score >= 0 else -1
    
# Calculate error given feature vectors X and labels Y.
def calc_error(X, Y, W, b):
    """
    Average misclassification rate of the classifier (W, b).

    Every row of X is classified with f_logistic and compared with the
    matching entry of Y via judge; the number of mismatches is divided by
    len(X).
    """
    n = len(X)
    mistakes = sum(
        judge(label, f_logistic(features, W, b))
        for features, label in zip(X, Y)
    )
    # Multiply by 1.0 so the result is a float average.
    return 1.0 * mistakes / n
# Gradient of L(W, b) with respect to W and b.
def grad_L_W_b(X, Y, W, b):
    """
    Gradient of the loss L(W, b) = -sum_i log(sigmoid(y_i * (W.x_i + b))).

    Returns the pair (grad_W, grad_b).
    Assumes X is (n, d), Y is (n,) with +1/-1 entries and W is (d,) -- TODO confirm.
    """
    margins = Y * (np.dot(X, W) + b)
    # Per-sample factor (1 - sigmoid(margin)) * y shared by both partial derivatives.
    weighted = (np.ones_like(Y) - sigmoid(margins)) * Y
    grad_W = -np.dot(X.T, weighted)
    # Dotting with a vector of ones sums the per-sample terms.
    grad_b = -np.dot(np.ones(weighted.shape).T, weighted)
    return grad_W, grad_b
# Loss L(W, b).
def L_W_b(X, Y, W, b):
    """
    Logistic loss L(W, b) = -sum_i log(sigmoid(y_i * (W.x_i + b))).

    Assumes X is (n, d), Y is (n,) with +1/-1 entries and W is (d,) -- TODO confirm.

    -log(sigmoid(m)) equals log(1 + e^(-m)), which np.logaddexp(0, -m)
    evaluates without overflow. Computing log(sigmoid(m)) directly yields
    log(0) = -inf (an infinite loss) once a sample is misclassified by a
    large margin.
    """
    margins = Y * (np.dot(X, W) + b)
    per_sample = np.logaddexp(0.0, -margins)
    # Dotting with a vector of ones sums the per-sample losses.
    loss = np.dot(np.ones_like(Y).T, per_sample)

    return loss
def logistic_regression(X_train, Y_train, learning_rate=0.001, iterations=10000):
    """
    Fit a logistic-regression classifier with batch gradient descent.

    X_train: 2-D array of feature rows.
    Y_train: labels matching the rows of X_train
             (assumed 1-D with +1/-1 entries -- TODO confirm).
    learning_rate: step size of each gradient update (default 0.001).
    iterations: number of gradient steps (default 10000).

    Returns (W, b, losses): the learned weight vector, the bias, and the
    training loss recorded after every update.
    """
    losses = []  # Training-loss history, one entry per iteration.

    # Step 1. Initialize the parameters W, b.
    # The weight vector is sized from the data rather than fixed at 2, so the
    # routine works for any number of feature columns.
    W = np.zeros(np.shape(X_train)[1])  # Weight.
    b = 0.0                             # Bias.

    # Logistic regression learning algorithm.
    for _ in range(iterations):
        # Step 2. Compute the partial derivatives.
        grad_W, grad_b = grad_L_W_b(X_train, Y_train, W, b)
        # Step 3. Update the parameters.
        W = W - learning_rate * grad_W
        b = b - learning_rate * grad_b

        # Track the training losses.
        losses.append(L_W_b(X_train, Y_train, W, b))

    return W, b, losses
# Example usage: W, b, losses = logistic_regression(X_train, Y_train)
def cross_validate_logistic_regression(X, Y, k=5):
    """
    k-fold cross-validation of logistic_regression.

    The rows are split into k shuffled folds (KFold with random_state=42).
    For each fold a model is trained on the remaining rows and its
    misclassification rate is measured on the held-out rows.

    Returns (mean validation error, list of (W, b, losses) per fold,
    list of per-fold validation errors).
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_errors = []   # validation error of each fold
    fold_models = []   # (W, b, losses) of each fold

    for fit_rows, held_out_rows in splitter.split(X):
        X_fit, X_held_out = X[fit_rows], X[held_out_rows]
        Y_fit, Y_held_out = Y[fit_rows], Y[held_out_rows]

        # Train on the k-1 remaining folds.
        W, b, losses = logistic_regression(X_fit, Y_fit)

        # Score on the held-out fold and keep the fitted model.
        fold_errors.append(calc_error(X_held_out, Y_held_out, W, b))
        fold_models.append((W, b, losses))

    # Average validation error across folds.
    return np.mean(fold_errors), fold_models, fold_errors


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values for LogisticRegression's C parameter.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

best_C = None
# Start below every attainable score so the first candidate is always accepted.
# Starting at 0 left best_C as None whenever no mean score exceeded 0, which
# made the final fit below fail.
best_score = float("-inf")
#Cross validation
for C in C_values:
    model = LogisticRegression(C=C, solver='lbfgs')
    scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation
    avg_score = scores.mean()

    print(f"C: {C}, Average Cross-Validation Score: {avg_score}")

    # Strict comparison: on ties the earliest (smallest) C is kept.
    if avg_score > best_score:
        best_score = avg_score
        best_C = C

print(f"Best C: {best_C} with score: {best_score}")

# Refit on the whole training set with the selected C, then report the
# accuracy and error on the training and test sets.
final_model = LogisticRegression(C=best_C, solver='lbfgs')
final_model.fit(X_train, Y_train)
Y_train_pred = final_model.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
training_error = 1 - training_accuracy
print(f"Training accuracy: {training_accuracy}")
print(f"Training error: {training_error}")
Y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_error = 1 - test_accuracy
print(f"Test accuracy: {test_accuracy}")
print(f"Test error: {test_error}")

C: 0.001, Average Cross-Validation Score: 0.5533333333333333
C: 0.01, Average Cross-Validation Score: 0.5533333333333333
C: 0.1, Average Cross-Validation Score: 0.4533333333333333
C: 1, Average Cross-Validation Score: 0.42000000000000004
C: 10, Average Cross-Validation Score: 0.4533333333333333
C: 100, Average Cross-Validation Score: 0.4533333333333333
Best C: 0.001 with score: 0.5533333333333333
Training accuracy: 0.5517241379310345
Training error: 0.4482758620689655
Test accuracy: 0.5128205128205128
Test error: 0.4871794871794872


# 50% training / 50% test split

In [7]:
# Divide the data points into training set and test set.
# The label is the last column of X_and_Y; indexing relative to the end avoids
# hard-coding the number of feature columns.
X_shuffled = X_and_Y[:, :-1]
Y_shuffled = X_and_Y[:, -1]

split_index = int(0.5 * len(X_shuffled))  # 50% for training

# Rows before split_index form the training set, the remainder the test set.
X_train = X_shuffled[:split_index]
Y_train = Y_shuffled[:split_index]
X_test = X_shuffled[split_index:]
Y_test = Y_shuffled[split_index:]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(73, 2)
(73,)
(73, 2)
(73,)


In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values for LogisticRegression's C parameter.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

best_C = None
# Start below every attainable score so the first candidate is always accepted.
# Starting at 0 left best_C as None whenever no mean score exceeded 0, which
# made the final fit below fail.
best_score = float("-inf")
#Cross validation
for C in C_values:
    model = LogisticRegression(C=C, solver='lbfgs')
    scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation
    avg_score = scores.mean()

    print(f"C: {C}, Average Cross-Validation Score: {avg_score}")

    # Strict comparison: on ties the earliest (smallest) C is kept.
    if avg_score > best_score:
        best_score = avg_score
        best_C = C

print(f"Best C: {best_C} with score: {best_score}")

# Refit on the whole training set with the selected C, then report the
# accuracy and error on the training and test sets.
final_model = LogisticRegression(C=best_C, solver='lbfgs')
final_model.fit(X_train, Y_train)
Y_train_pred = final_model.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
training_error = 1 - training_accuracy
print(f"Training accuracy: {training_accuracy}")
print(f"Training error: {training_error}")
Y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_error = 1 - test_accuracy
print(f"Test accuracy: {test_accuracy}")
print(f"Test error: {test_error}")

C: 0.001, Average Cross-Validation Score: 0.52
C: 0.01, Average Cross-Validation Score: 0.52
C: 0.1, Average Cross-Validation Score: 0.4647619047619048
C: 1, Average Cross-Validation Score: 0.5333333333333333
C: 10, Average Cross-Validation Score: 0.5466666666666666
C: 100, Average Cross-Validation Score: 0.5466666666666666
Best C: 10 with score: 0.5466666666666666
Training accuracy: 0.547945205479452
Training error: 0.452054794520548
Test accuracy: 0.5068493150684932
Test error: 0.4931506849315068


# 80% training / 20% test split

In [9]:
# Divide the data points into training set and test set.
# The label is the last column of X_and_Y; indexing relative to the end avoids
# hard-coding the number of feature columns.
X_shuffled = X_and_Y[:, :-1]
Y_shuffled = X_and_Y[:, -1]

split_index = int(0.8 * len(X_shuffled))  # 80% for training

# Rows before split_index form the training set, the remainder the test set.
X_train = X_shuffled[:split_index]
Y_train = Y_shuffled[:split_index]
X_test = X_shuffled[split_index:]
Y_test = Y_shuffled[split_index:]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(116, 2)
(116,)
(30, 2)
(30,)


In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values for LogisticRegression's C parameter.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

best_C = None
# Start below every attainable score so the first candidate is always accepted.
# Starting at 0 left best_C as None whenever no mean score exceeded 0, which
# made the final fit below fail.
best_score = float("-inf")
#Cross validation
for C in C_values:
    model = LogisticRegression(C=C, solver='lbfgs')
    scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation
    avg_score = scores.mean()

    print(f"C: {C}, Average Cross-Validation Score: {avg_score}")

    # Strict comparison: on ties the earliest (smallest) C is kept.
    if avg_score > best_score:
        best_score = avg_score
        best_C = C

print(f"Best C: {best_C} with score: {best_score}")

# Refit on the whole training set with the selected C, then report the
# accuracy and error on the training and test sets.
final_model = LogisticRegression(C=best_C, solver='lbfgs')
final_model.fit(X_train, Y_train)
Y_train_pred = final_model.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
training_error = 1 - training_accuracy
print(f"Training accuracy: {training_accuracy}")
print(f"Training error: {training_error}")
Y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_error = 1 - test_accuracy
print(f"Test accuracy: {test_accuracy}")
print(f"Test error: {test_error}")

C: 0.001, Average Cross-Validation Score: 0.5344202898550725
C: 0.01, Average Cross-Validation Score: 0.5344202898550725
C: 0.1, Average Cross-Validation Score: 0.4818840579710145
C: 1, Average Cross-Validation Score: 0.4909420289855073
C: 10, Average Cross-Validation Score: 0.4909420289855073
C: 100, Average Cross-Validation Score: 0.4909420289855073
Best C: 0.001 with score: 0.5344202898550725
Training accuracy: 0.5344827586206896
Training error: 0.4655172413793104
Test accuracy: 0.4666666666666667
Test error: 0.5333333333333333
