In [None]:
#######################################################
# Oracle model with l1 penalty without feature extraction
#######################################################
import pandas as pd
import statsmodels.api as sm
import numpy as np
from scipy.spatial import distance
from sklearn import metrics
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss


In [2]:
#=============================function to calculate the inverse logit link============================
def invLink(lp):
    pv = 1/(1 + math.exp(-lp))
    return pv

# vectorize the function
invLinkVec = np.vectorize(invLink)
#========================Define the log-likelihood for evaluation========================
# logistic regression log-likelihood
def loglikeli(YEval, predCombined):
    ll = np.mean(YEval * np.log(predCombined) + (1-YEval) * np.log(1 - predCombined))
    return ll

In [3]:
#=============================read data=====================================
import pickle
fname = "data_splitted_without_feature_extraction_dic.p"
infile = open(fname, 'rb')
new_dict = pickle.load(infile)
infile.close()

In [5]:
Y = new_dict["Y"]
YTest = new_dict["YTest"]

XA = new_dict["XA"]
XB = new_dict["XB"]

XATest = new_dict["XATest"]
XBTest = new_dict["XBTest"]

X = np.concatenate((XA, XB),axis = 1)
XTest = np.concatenate((XATest, XBTest),axis = 1)

################################################
# To fit logistic regression with l1 penalty, standardize the training set predictor observations, then apply the same transformation to the test set.
################################################

# 1. Fit separate scalers for XA and XB
scaler_A = StandardScaler().fit(XA)
scaler_B = StandardScaler().fit(XB)

# 2. Transform train+val (XA, XB)
XA_std = scaler_A.transform(XA)
XB_std = scaler_B.transform(XB)

# 3. Transform test (XATest, XBTest) using the same fitted scalers
XATest_std = scaler_A.transform(XATest)
XBTest_std = scaler_B.transform(XBTest)

# 4. Concatenate for full model input
X_std = np.concatenate([XA_std, XB_std], axis=1)
XTest_std = np.concatenate([XATest_std, XBTest_std], axis=1)

In [6]:
# the training set size, which is 2223
N = len(Y)

eps = 0.001  # λ_min = eps * λ_max

# the length of the sequence of lambda values
K = 20

# Compute lambda_max
dot_products = np.abs(X_std.T @ (Y-0.5)) / N  
lambda_max = np.max(dot_products) 
lambda_min = eps * lambda_max


# Construct lambda path (log-spaced)
# in np.logspace, the starting value is 10^{x_start} and ending value is 10^{x_end}
# In this way, if we express this sequence as 10^{z_1}, 10^{z_2}, the difference between two adjacent expontials should be the same
## e.g., np.log10(lambdas[1]) - np.log10(lambdas[0] = np.log10(lambdas[3])- np.log10(lambdas[2])
lambdas = np.logspace(np.log10(lambda_max), np.log10(lambda_min), K)
Cs = 1 /lambdas  # sklearn uses C = 1 / (alpha * lambda)


In [8]:
# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

mean_cv_losses = []
for i in range(len(Cs)):
    C = Cs[i]
    print(str(i + 1) + "th C")  # +1 to print the 1-based index
    fold = 1
    losses = []
    for train_idx, val_idx in kf.split(X_std, Y):
        print(str(fold) + "th fold")
        fold += 1
        
        X_train, X_val = X_std[train_idx], X_std[val_idx]
        y_train, y_val = Y[train_idx], Y[val_idx]

        model = LogisticRegression(penalty='l1', solver='liblinear', C=C, max_iter=1000)
        model.fit(X_train, y_train)
        prob_val = model.predict_proba(X_val)[:, 1]
        loss = log_loss(y_val, prob_val)
        losses.append(loss)
    mean_cv_losses.append(np.mean(losses))

# Select best lambda
best_idx = np.argmin(mean_cv_losses)
best_lambda = lambdas[best_idx]
best_C = Cs[best_idx]

1th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
2th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
3th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
4th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
5th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
6th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
7th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
8th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
9th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
10th C
1th fold
2th fold
3th fold
4th fold
5th fold
6th fold
7th fold
8th fold
9th fold
10th fold
11th C
1th fold
2th fold
3th 

In [10]:
Selected_model = LogisticRegression(penalty='l1', solver='liblinear', C=best_C, max_iter=1000)
Selected_model.fit(X_std, Y)

In [12]:
prob_test = model.predict_proba(XTest_std)[:, 1]

In [None]:
################################
# log_loss from sklearn.metrics computes the negative loglikelihood
################################

# print(loglikeli(YTest, prob_test))
# print(log_loss(YTest, prob_test))
# -2.9637245006358275
# 2.9637245006358275


-2.9637245006358275
2.9637245006358275


In [None]:
# Compute AUC

fpr, tpr, _ = metrics.roc_curve(YTest, prob_test)
AUC = metrics.auc(fpr, tpr)
print(AUC)
# 0.5476666666666666

0.5476666666666666
