## 0. Dependencies

If you have not installed OpenCV yet, uncomment and run the following cell.

In [None]:
# Optional one-off setup: uncomment the line below only if `import cv2` fails.
# ! pip install opencv-python

## 1. Subsetting and Loading

In [None]:
import cv2
import os
import random
import numpy as np

In [None]:
# set a seed so the random subset below is repeatable
random.seed(328)

# define dir
data_dir = "./crop_part1"

# random sampling
n = 1000
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# listing is sorted first; otherwise the same seed could select different files
# on different machines.
fnames_subset = random.sample(sorted(os.listdir(data_dir)), n)

Construct y

In [None]:
# The label is the second "_"-separated field of each file name:
# 0 (male) or 1 (female). It is kept as the raw string token.
y = np.array([fname.split("_")[1] for fname in fnames_subset])

Construct X as np arrays

In [None]:
# X_rgb: the sampled images in RGB channel order, stacked into one np array
# X_origin_dict: original version; key: index, value: 3-D np array
X_rgb = list()
X_origin_dict = dict()
for i, fname in enumerate(fnames_subset):
    # build the path of the image file (avoids shadowing the builtin `dir`)
    img_path = os.path.join(data_dir, fname)

    # read the data; OpenCV loads images in BGR channel order
    dat = cv2.imread(img_path)

    # cv2.imread does not raise on a missing/unreadable file, it returns None.
    # Fail loudly here: silently skipping would misalign X_rgb with y.
    if dat is None:
        raise FileNotFoundError("Could not read image: {}".format(img_path))

    # convert to rgb
    dat = cv2.cvtColor(dat, cv2.COLOR_BGR2RGB)

    # store the original data, keyed by its position in fnames_subset
    X_origin_dict[i] = dat

    # store the data
    X_rgb.append(dat)

# convert to np array
X_rgb = np.array(X_rgb)
print("The shape of the X_rgb is:", X_rgb.shape)


## 2. Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import copy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import time
from sklearn.linear_model import LogisticRegression 
import matplotlib
from sklearn import metrics
from sklearn.metrics import precision_recall_curve

In [None]:
# First cut: hold out 20% of the data as the test set.
X_rgb_trainval, X_rgb_test, y_trainval, y_test = train_test_split(
    X_rgb, y, test_size=0.2, random_state=329
)

# Second cut: hold out 20% of the remainder as the validation set.
X_rgb_train, X_rgb_val, y_train, y_val = train_test_split(
    X_rgb_trainval, y_trainval, test_size=0.2, random_state=525
)


In [None]:
# Turn every image of each split into a single 1-D feature row.
X_rgb_flattened_train, X_rgb_flattened_val, X_rgb_flattened_test = (
    np.array([img.flatten() for img in split])
    for split in (X_rgb_train, X_rgb_val, X_rgb_test)
)

In [None]:
# feature scaling: the scaler is fitted on the training split only, and the
# same fitted transform is then applied to the train, val and test splits
scaler = StandardScaler()
scaler.fit(X_rgb_flattened_train)

# (the former `.copy()` assignment was overwritten immediately, so it only
# cost an extra full-size copy of the training matrix)
X_rgb_flattened_train_scaled = scaler.transform(X_rgb_flattened_train)
X_rgb_flattened_val_scaled = scaler.transform(X_rgb_flattened_val)
X_rgb_flattened_test_scaled = scaler.transform(X_rgb_flattened_test)


In [None]:
# construct a list of C
# C_log holds 20 exponents evenly spaced over [-4, 4]. np.linspace always
# returns exactly the requested number of points and hits both endpoints,
# whereas accumulating a float step in a `while temp <= 4` loop can produce
# 19 or 20 values depending on rounding.
C_log = np.linspace(-4, 4, 20).tolist()

# C itself is 10 raised to each exponent
C = np.power(10.0, np.array(C_log))
print("There are {} C values.".format(len(C)))
print("The C values are:", C)

In [None]:
# result accumulators: one entry per candidate C, in the same order as C
num_parameters_l1 = []
auc_l1 = []

# fit one L1-penalised logistic regression per C value
for c in C:
    model_lr = LogisticRegression(
        penalty="l1", C=c, solver="liblinear", random_state=214
    )
    model_lr.fit(X_rgb_flattened_train_scaled, y_train)

    # how many coefficients are non-zero for this C
    cnt = int(np.sum(model_lr.coef_ != 0))
    num_parameters_l1.append(cnt)

    # validation-set AUC, scored with the second column of predict_proba
    y_val_prob = model_lr.predict_proba(X_rgb_flattened_val_scaled)[:, 1]
    auc_model_lr = metrics.roc_auc_score(y_val, y_val_prob)
    auc_l1.append(auc_model_lr)

In [None]:
# set format as retina
%config InlineBackend.figure_format = "retina"

# set theme
sns.set_theme(style="whitegrid")

# set subplots to 2*2
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# data for the subplots
dat = [num_parameters_l1, auc_l1]

y_lab = ["Numbers of Non-zero Parameters for l1",
         "Area under the ROC Curve for l1"]

title = [
    "Numbers of Non-zero Parameters vs. log(C) for l1",
    "Area under the ROC Curve vs. log(c) for l1"
]

# make a loop for each plot
for i in range(2):
    axs[i].plot(C_log, dat[i],
                    label=y_lab[i])

    # set the title for each subplot
    axs[i].set_title(
        title[i], fontsize=10
    )

    # set labels
    axs[i].set_xlabel("log(C) (base=10)", fontsize=9)
    axs[i].set_ylabel(y_lab[i], fontsize=9)

    # set ticks
    axs[i].tick_params(labelsize=7)

    # set legends
    legend = axs[i].legend(prop={"size": 8}, loc="lower right")


# use tight layout to save space
plt.tight_layout()

plt.show()

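$C=0.0886$ is the optimal choice: among the candidate values it gives the highest AUC on the validation set. The next cell finds it programmatically.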

In [None]:
# Position and value of the highest validation AUC (first one wins on ties).
ind, max_auc = max(enumerate(auc_l1), key=lambda pair: pair[1], default=(0, 0))

# Only a score strictly above 0 counts as an improvement; otherwise keep index 0.
if not max_auc > 0:
    ind, max_auc = 0, 0

c_rgb = C[ind]
print("The best C is: {:.4f}".format(c_rgb))

In [None]:
# Refit with the C selected on the validation set (c_rgb) rather than a
# hand-copied constant, so this cell stays correct if the sweep result changes.
model_lr_rgb = LogisticRegression(
    penalty="l1", C=c_rgb, solver="liblinear", random_state=214
)

model_lr_rgb.fit(X_rgb_flattened_train_scaled, y_train)

# evaluate on the held-out test set
y_test_prob = model_lr_rgb.predict_proba(X_rgb_flattened_test_scaled)[:, 1]

# calculating the auc
auc_model_lr_rgb = metrics.roc_auc_score(y_test, y_test_prob)
print("The AUC is {:.5f}".format(auc_model_lr_rgb))

The grey-scale input is tried next.

In [None]:
# The images were converted from BGR to RGB when they were loaded, so the
# grey-scale conversion must use the RGB code; COLOR_BGR2GRAY would apply the
# red and blue channel weights the wrong way round.
X_grey_train = np.array([cv2.cvtColor(dt, cv2.COLOR_RGB2GRAY) for dt in X_rgb_train])
X_grey_val = np.array([cv2.cvtColor(dt, cv2.COLOR_RGB2GRAY) for dt in X_rgb_val])
X_grey_test = np.array([cv2.cvtColor(dt, cv2.COLOR_RGB2GRAY) for dt in X_rgb_test])

In [None]:
# Unroll every grey-scale image of each split into a single 1-D feature row.
X_grey_flattened_train = np.array(list(map(np.ravel, X_grey_train)))
X_grey_flattened_val = np.array(list(map(np.ravel, X_grey_val)))
X_grey_flattened_test = np.array(list(map(np.ravel, X_grey_test)))

In [None]:
# feature scaling for the grey-scale features: fit on the training split only,
# then apply the same fitted transform to the train, val and test splits
scaler = StandardScaler()
scaler.fit(X_grey_flattened_train)

# (the former `.copy()` assignment was overwritten immediately, so it only
# cost an extra full-size copy of the training matrix)
X_grey_flattened_train_scaled = scaler.transform(X_grey_flattened_train)
X_grey_flattened_val_scaled = scaler.transform(X_grey_flattened_val)
X_grey_flattened_test_scaled = scaler.transform(X_grey_flattened_test)

In [None]:
# result accumulators for the grey-scale sweep, one entry per candidate C
num_parameters_l1_grey = []
auc_l1_grey = []

# fit one L1-penalised logistic regression per C value
for c in C:
    model_lr = LogisticRegression(
        penalty="l1", C=c, solver="liblinear", random_state=214
    )
    model_lr.fit(X_grey_flattened_train_scaled, y_train)

    # tally the coefficients that are non-zero for this C
    cnt = sum(1 for weight in model_lr.coef_.ravel() if weight != 0)
    num_parameters_l1_grey.append(cnt)

    # validation-set AUC, scored with the second column of predict_proba
    y_val_prob = model_lr.predict_proba(X_grey_flattened_val_scaled)[:, 1]
    auc_model_lr = metrics.roc_auc_score(y_val, y_val_prob)
    auc_l1_grey.append(auc_model_lr)

In [None]:
# set format as retina
%config InlineBackend.figure_format = "retina"

# set theme
sns.set_theme(style="whitegrid")

# set subplots to 2*2
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# data for the subplots
dat = [num_parameters_l1_grey, auc_l1_grey]

y_lab = ["Numbers of Non-zero Parameters for Data in Grey-scale",
         "Area under the ROC Curve for Data in Grey-scale"]

title = [
    "Numbers of Non-zero Parameters vs. log(C) for Data in Grey-scale",
    "Area under the ROC Curve vs. log(c) for Data in Grey-scale"
]

# make a loop for each plot
for i in range(2):
    axs[i].plot(C_log, dat[i],
                    label=y_lab[i])

    # set the title for each subplot
    axs[i].set_title(
        title[i], fontsize=10
    )

    # set labels
    axs[i].set_xlabel("log(C) (base=10)", fontsize=9)
    axs[i].set_ylabel(y_lab[i], fontsize=9)

    # set ticks
    axs[i].tick_params(labelsize=7)

    # set legends
    legend = axs[i].legend(prop={"size": 8}, loc="lower right")


# use tight layout to save space
plt.tight_layout()

plt.show()

In [None]:
# Position and value of the highest grey-scale validation AUC
# (first one wins on ties).
ind, max_auc = max(
    enumerate(auc_l1_grey), key=lambda pair: pair[1], default=(0, 0)
)

# Only a score strictly above 0 counts as an improvement; otherwise keep index 0.
if not max_auc > 0:
    ind, max_auc = 0, 0

c_grey = C[ind]
print("The best C is: {:.4f}".format(c_grey))

In [None]:
# Final grey-scale model. It gets its own names (model_lr_grey /
# auc_model_lr_grey) so the RGB model and its AUC are not overwritten, and it
# uses the C selected on the validation set (c_grey) rather than a hand-copied
# constant.
model_lr_grey = LogisticRegression(
    penalty="l1", C=c_grey, solver="liblinear", random_state=214
)

model_lr_grey.fit(X_grey_flattened_train_scaled, y_train)

# evaluate on the held-out test set; the model was fitted on grey-scale
# features, so it must be scored on the grey-scale test matrix (the RGB matrix
# has a different number of columns)
y_test_prob = model_lr_grey.predict_proba(X_grey_flattened_test_scaled)[:, 1]

# calculating the auc
auc_model_lr_grey = metrics.roc_auc_score(y_test, y_test_prob)
print("The AUC is {:.5f}".format(auc_model_lr_grey))

The grey-scale input performs worse in this case. This may be because some information is lost when the colour channels are collapsed into one.