# Part 2:

In [2]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.ndimage import convolve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
#from bayes_opt import BayesianOptimization

### Section 1: Logistic Regression for Digit Classification

In [3]:
# Read the trouser/dress CSV files. Despite the `_df` suffix, the variables
# hold plain NumPy arrays because only `.values` is kept.
def _read_values(csv_path):
    """Read a CSV file with pandas and return its contents as a NumPy array."""
    return pd.read_csv(csv_path).values


features_df = _read_values("./data_trouser_dress/troudress_train_x.csv")
labels = _read_values("./data_trouser_dress/troudress_train_y.csv")
test_features_df = _read_values("./data_trouser_dress/troudress_test_x.csv")


def apply_blur_to_images(images, image_shape):
    """Smooth every flattened image with a 3x3 mean (box) filter.

    Parameters
    ----------
    images : array-like of shape (n_images, height * width)
        One flattened image per row.
    image_shape : tuple of int
        ``(height, width)`` used to un-flatten each row.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``images`` containing the blurred,
        re-flattened images. The result is always floating point: float
        inputs keep their dtype, any other dtype is converted to float64.
    """
    images = np.asarray(images)
    # scipy's convolve writes its result in the input dtype, so integer pixel
    # data would have the averaged values truncated. Work in floating point.
    if not np.issubdtype(images.dtype, np.floating):
        images = images.astype(np.float64)

    if images.size == 0:
        # Nothing to filter; return an empty array of the right shape and dtype.
        return np.empty_like(images)

    # A leading axis of length 1 keeps the filter from mixing neighbouring
    # images, so one call blurs the whole stack exactly like a per-image loop.
    kernel = np.ones((1, 3, 3)) / 9.0
    stack = images.reshape(len(images), *image_shape)
    blurred_stack = convolve(stack, kernel, mode='nearest')

    # Flatten each blurred image back to the layout of the input.
    return blurred_stack.reshape(images.shape)

# Each CSV row is assumed to be a flattened 28x28 image — adjust if the dataset differs
image_shape = (28, 28)

# Apply the same noise-reducing blur to the training and the test features
X_train_blurred = apply_blur_to_images(features_df, image_shape)
X_test_blurred = apply_blur_to_images(test_features_df, image_shape)

# Hold out 20% of the blurred training data for validation. The labels are
# flattened to 1-D first, so y_train / y_val can be passed straight to fit()
# and accuracy_score() in this and later cells.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_blurred, labels.ravel(), test_size=0.2, random_state=42
)

# Baseline classifier on the blurred features
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Accuracy on the held-out validation split
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

# You can proceed with predicting on X_test_blurred and evaluating as needed


Validation Accuracy: 0.9445833333333333


In [None]:
# Iteration budgets to compare: range(0, 100) yields every value from 0 to 99,
# and two models are fitted per value.
max_iters = range(0, 100)


def _training_accuracy(features, targets, max_iter):
    """Fit a liblinear logistic regression and return its accuracy on the data it was fitted on."""
    clf = LogisticRegression(solver='liblinear', max_iter=max_iter)
    clf.fit(features, targets)
    return accuracy_score(targets, clf.predict(features))


# Both feature sets cover the full training data, so they share the full label vector.
sweep_targets = labels.ravel()

accuracies_untransformed = []
accuracies_transformed = []

for max_iter in max_iters:
    # Model 0: raw pixel features
    accuracies_untransformed.append(_training_accuracy(features_df, sweep_targets, max_iter))
    # Model 1: blurred pixel features
    accuracies_transformed.append(_training_accuracy(X_train_blurred, sweep_targets, max_iter))

# Plot training accuracy against the iteration budget for both models
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_iters, accuracies_untransformed, label='Model 0: Untransformed Features', marker='o')
ax.plot(max_iters, accuracies_transformed, label='Model 1: Transformed Features', marker='x')
ax.set_xlabel('Maximum Iterations')
ax.set_ylabel('Accuracy on Training Data')
ax.set_title('Effect of Max Iteration on Model Accuracy (Extended Range)')
ax.legend()
ax.grid(True)
plt.show()

In [5]:
# Objective for Bayesian optimization: hold-out accuracy as a function of C
def f_score(C):
    """Return the validation accuracy of a logistic regression fitted with regularization parameter ``C``.

    The model is trained on ``X_train`` / ``y_train`` and scored on ``X_val`` /
    ``y_val``, the labelled hold-out split produced by ``train_test_split``.
    ``X_test`` / ``y_test`` are not defined anywhere in this notebook, so they
    cannot be used here.
    """
    lr = LogisticRegression(max_iter=500, C=C)
    lr.fit(X_train, np.ravel(y_train))  # ravel: fit() expects 1-D labels
    return accuracy_score(np.ravel(y_val), lr.predict(X_val))

# Search range for C; the lower bound is kept strictly positive
pbounds = {'C': (0.001, 10)}

# NOTE: BayesianOptimization comes from the bayes_opt package. Its import is
# commented out in the imports cell and must be enabled before this cell can run.
opt = BayesianOptimization(
    f=f_score,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

# Run the search: random exploration first, then guided optimization steps
opt.maximize(
    init_points=2,  # Number of random starting points
    n_iter=10,  # Number of Bayesian optimization iterations
)

C1 = opt.max['params']["C"]     # Best C found by the optimizer
print("best C:",C1)


NameError: name 'BayesianOptimization' is not defined

In [None]:
# Grid-search alternative for choosing C, using 5-fold cross-validation on the
# training split (X_train, y_train).

# Candidate regularization values: 20 points log-spaced between 1e-4 and 1e4
c_values = np.logspace(-4, 4, 20)

y_train_flat = np.ravel(y_train)  # cross_val_score / fit expect 1-D labels

best_score = -np.inf  # any real accuracy, including 0, beats this sentinel
best_c = None

for c in c_values:
    # A separate name keeps the already fitted `model` from earlier cells intact
    candidate_model = LogisticRegression(C=c, max_iter=500)
    scores = cross_val_score(candidate_model, X_train, y_train_flat, cv=5, scoring='accuracy')
    mean_score = np.mean(scores)

    if mean_score > best_score:
        best_score = mean_score
        best_c = c

print("Best C:", best_c)
print("Best cross-validation accuracy:", best_score)

# Refit on the whole training split with the selected C
final_model = LogisticRegression(C=best_c, max_iter=500)
final_model.fit(X_train, y_train_flat)

In [None]:
# Fit the final model with the C chosen by the optimizer. max_iter matches the
# 500 iterations used while C was being scored, so the tuned setting is reproduced.
lr1 = LogisticRegression(max_iter=500,C=C1)
lr1.fit(X_train,np.ravel(y_train))


# Score on the labelled hold-out split (X_val, y_val); X_test / y_test are not
# defined in this notebook. C was selected on this same split, so the figure
# is an optimistic estimate.
pre = lr1.predict(X_val)
acc = accuracy_score(np.ravel(y_val),pre)
print("Accuracy on the validation set:",acc)

Next, the training data is binarized (each image becomes a black-and-white image): every pixel with a value less than or equal to 0.4 is set to 0, and every pixel with a value greater than 0.4 is set to 1. A Bayesian optimizer is then used again to select the regularization parameter C that gives the logistic regression classifier the highest accuracy.

In [None]:
# Binarize all training images: pixels <= 0.4 become 0, every other pixel becomes 1.
# np.where thresholds the whole array in one vectorized pass and builds a new
# array, so X_train itself is left untouched; the original dtype is kept.
# NOTE(review): the 0.4 threshold presumably expects pixels scaled to [0, 1] — confirm.
X_train_copy = np.where(X_train <= 0.4, 0, 1).astype(X_train.dtype)
X_train_copy

In [None]:
# Binarize the hold-out images with the same rule as the training images:
# pixels <= 0.4 become 0, every other pixel becomes 1.
# X_test is not defined anywhere in this notebook; the labelled hold-out split
# is X_val, so the binarized evaluation features are derived from it.
X_test_copy = np.where(X_val <= 0.4, 0, 1).astype(X_val.dtype)
X_test_copy

In [None]:
# Select the regularization parameter C for the binarized images with a Bayesian optimizer
def f_score(C):
    """Return the hold-out accuracy of a logistic regression trained on the binarized images.

    Trains on ``X_train_copy`` / ``y_train`` and scores on ``X_test_copy``
    against ``y_val``.
    NOTE(review): ``y_test`` is not defined in this notebook; the labelled
    hold-out split is (X_val, y_val), so ``X_test_copy`` must hold the
    binarized hold-out features with one row per entry of ``y_val``.
    """
    lr = LogisticRegression(max_iter=200,C=C)
    lr.fit(X_train_copy,np.ravel(y_train))
    acc = accuracy_score(np.ravel(y_val),lr.predict(X_test_copy))
    return acc     # The optimizer maximizes this accuracy


# Search range for C. The lower bound must be strictly positive: the optimizer
# can propose the boundary itself, and LogisticRegression rejects C=0.
pbounds = {'C': (0.001,1)}

# Build the optimizer.
# NOTE: BayesianOptimization comes from the bayes_opt package. Its import is
# commented out in the imports cell and must be enabled before this cell can run.
opt = BayesianOptimization(
    f=f_score,
    pbounds=pbounds,
    verbose=2,  # verbose = 2 prints all, verbose = 1 prints the maximum value found in the run, verbose = 0 prints nothing
    random_state=1
)

# Run the search
opt.maximize(
    init_points=10,  # Steps of random search
    n_iter=130  # Number of iterations to perform Bayesian optimization
)

C2 = opt.max['params']["C"]     # Best C found by the optimizer
print("best C:",C2)


In [None]:
# Fit the binarized-image model with the C chosen by the optimizer
lr2 = LogisticRegression(max_iter=200,C=C2)
lr2.fit(X_train_copy,np.ravel(y_train))


# Score on the binarized hold-out images against y_val; y_test is not defined
# in this notebook. C was selected on this same split, so the figure is an
# optimistic estimate.
pre = lr2.predict(X_test_copy)
acc = accuracy_score(np.ravel(y_val),pre)
print("Accuracy on the validation set:",acc)

In [None]:
# Augment the training set with a vertically flipped copy of every image.
# The flip is done with NumPy (reversing the row axis of each 28x28 image), so
# no OpenCV import is needed; the flipped images are appended after the
# originals and their labels are repeated in the same order.
# NOTE: this cell overwrites X_train / y_train, so running it again doubles the
# data once more. It no longer assigns X_train_copy, which keeps the binarized
# array stored under that name intact.
n_images = len(X_train)
flipped_images = X_train.reshape(n_images, 28, 28)[:, ::-1, :].reshape(n_images, -1)
X_train = np.concatenate([X_train, flipped_images], axis=0)
y_train = np.concatenate([y_train, y_train], axis=0)
print(X_train.shape)  # Print the shape of X_train
print(y_train.shape)  # Print the shape of y_train


In [None]:
# The shapes of both X_train and y_train are now twice the original: every image gained a vertically flipped copy.

In [None]:
# Select the regularization parameter C for the flip-augmented training set with a Bayesian optimizer
def f_score(C):
    """Return the validation accuracy of a logistic regression fitted with regularization parameter ``C``.

    Trains on the current ``X_train`` / ``y_train`` and scores on ``X_val`` /
    ``y_val``, the labelled hold-out split produced by ``train_test_split``.
    ``X_test`` / ``y_test`` are not defined anywhere in this notebook.
    """
    lr = LogisticRegression(max_iter=200,C=C)
    lr.fit(X_train,np.ravel(y_train))
    acc = accuracy_score(np.ravel(y_val),lr.predict(X_val))
    return acc     # The optimizer maximizes this accuracy

# Search range for C. The lower bound must be strictly positive: the optimizer
# can propose the boundary itself, and LogisticRegression rejects C=0.
pbounds = {'C': (0.001,1)}

# Build the optimizer.
# NOTE: BayesianOptimization comes from the bayes_opt package. Its import is
# commented out in the imports cell and must be enabled before this cell can run.
opt = BayesianOptimization(
    f=f_score,
    pbounds=pbounds,
    verbose=2,  # verbose = 2 prints all, verbose = 1 prints the maximum value found in the run, verbose = 0 prints nothing
    random_state=1
)

# Run the search
opt.maximize(
    init_points=10,  # Steps of random search
    n_iter=130  # Number of iterations to perform Bayesian optimization
)

C3 = opt.max['params']["C"]     # Best C found by the optimizer
print("best C:",C3)

In [None]:
# Fit the model on the current training set with the C chosen by the optimizer
lr3 = LogisticRegression(max_iter=200,C=C3)
lr3.fit(X_train,np.ravel(y_train))


# Score on the labelled hold-out split (X_val, y_val); X_test / y_test are not
# defined in this notebook. C was selected on this same split, so the figure
# is an optimistic estimate.
pre = lr3.predict(X_val)
acc = accuracy_score(np.ravel(y_val),pre)
print("Accuracy on the validation set:",acc)

In [None]:
# Persist the lr1 classifier. The target directory is created first because
# joblib.dump does not create missing parent directories.
MODEL_PATH = "./model/logistic_regression_classifier.m"
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lr1, MODEL_PATH)