In [12]:
# -*- coding: utf-8 -*-
import numpy as np

# 基础函数
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def calculate_loss(y, tx, w, lambda_):
    """
    Compute the regularized loss for logistic regression.
    """
    pred = sigmoid(np.dot(tx, w)).reshape(-1, 1)  # Ensure pred is a column vector
    regularization_term = (lambda_ / 2) * np.sum(w**2)
    loss = -np.mean(y * np.log(pred + 1e-15) + (1 - y) * np.log(1 - pred + 1e-15)) + regularization_term
    return loss

def calculate_gradient(y, tx, w, lambda_):
    """
    Compute the regularized gradient for logistic regression.
    """
    pred = sigmoid(np.dot(tx, w)).reshape(-1, 1)
    if y.shape != pred.shape:
        y = y.reshape(-1, 1)  # Transform y to a column vector
    gradient = (np.dot(tx.T, (pred - y)) / len(y)) + (lambda_ * w).reshape(-1, 1) / len(tx)
    return gradient.flatten()  # Ensure gradient has the same shape as w

def logistic_regression(y, tx, initial_w, max_iters, gamma, lambda_, batch_size=64):
    """
    Regularized logistic regression using mini-batch gradient descent.
    """
    w = initial_w
    n_samples = len(y)

    for i in range(max_iters):
        # Shuffle the data
        indices = np.random.permutation(n_samples)
        tx = tx[indices]
        y = y[indices]

        # Mini-batch gradient descent
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_tx = tx[start:end]
            batch_y = y[start:end]
            
            # Compute loss and gradient for the mini-batch
            loss = calculate_loss(batch_y, batch_tx, w, lambda_)
            gradient = calculate_gradient(batch_y, batch_tx, w, lambda_)
            
            # Update weights
            w = w - gamma * gradient
        
        # Optionally print progress
        if i % 100 == 0:
            print(f"Current iteration number: {i}, loss: {loss}")
    
    return w, loss

def predict(tx, w):
    """
    Predict the class labels using the logistic regression model.
    """
    y_pred = sigmoid(np.dot(tx, w))
    y_pred[y_pred > 0.3] = 1
    y_pred[y_pred <= 0.3] = 0
    return y_pred

# 加载数据
x_train_pre = np.loadtxt("x_train_encoded.csv", delimiter=",", skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=",", skiprows=1)

# 提取标签
y_train_pre = y_train[:, 1]

x_test_pre = np.loadtxt("x_test_encoded.csv", delimiter=",", skiprows=1)

# 修改标签，将 -1 改为 0
y_train_pre[y_train_pre == -1] = 0
y_train_pre = y_train_pre.astype(int)

# 初始化参数
initial_w = np.zeros(x_train_pre.shape[1])  # 权重初始化为零向量
max_iters = 500  # 最大迭代次数
gamma = 0.01  # 学习率
lambda_ = 0.1  # 正则化强度
batch_size = 64  # 小批量的大小

# 训练正则化逻辑回归模型
w, loss = logistic_regression(y_train_pre, x_train_pre, initial_w, max_iters, gamma, lambda_, batch_size)
print(f"Final loss after training: {loss}")


Current iteration number: 0, loss: 0.5287843803108359
Current iteration number: 100, loss: 0.13857868630382153
Current iteration number: 200, loss: 0.6292596056283293
Current iteration number: 300, loss: 0.26031581800597725
Current iteration number: 400, loss: 0.30973012334098293
Final loss after training: 1.046137076915924


In [21]:
# 加载sample-submission中的Id列
sample_submission = np.loadtxt('sample-submission.csv', delimiter=',', skiprows=1, usecols=0, dtype=int)

# 在测试集上进行预测
y_pred = predict(x_test_pre, w)

#y_pred=np.where(y_pred == 0, -1, y_pred)

# 查找 y_pred 中 NaN 的个数
num_nans = np.sum(np.isnan(y_pred))
print(f"y_pred 中有 {num_nans} 个 NaN")

# 将 y_pred 中的 NaN 替换为 -1
y_pred_test = np.nan_to_num(y_pred, nan=-1)
# save as .csv
header = "Id,Prediction"
results = np.hstack((sample_submission.reshape(-1, 1), y_pred_test.reshape(-1, 1)))

np.savetxt('C:/Users/y/Documents/ml_exercise/ML_project_1/logistic_regression_with_l2.csv', 
           results, delimiter=',', header=header, comments='', fmt='%d')

y_pred 中有 46116 个 NaN


test on the 10% of x_train_pre

In [16]:
# Randomly select 10% of the data
np.random.seed(42)  # For reproducibility
num_samples = x_train_pre.shape[0]
sample_size = int(num_samples * 0.1)
indices = np.random.choice(num_samples, sample_size, replace=False)

# Extract the sampled training data and labels
x_train_sample = x_train_pre[indices]
y_train_sample = y_train_pre[indices]

print(x_train_pre.shape)
print(x_train_sample.shape)
print(y_train_sample.shape)
print(y_train_sample[:10])

(328135, 424)
(32813, 424)
(32813,)
[0 0 1 0 0 0 0 0 0 0]


In [19]:
from basic_functions import *
# The prediction results of logistic regression
y_pred_sample = predict(x_train_sample, w)
f1_score_logiregression=calculate_f1_score(y_pred_sample,y_train_sample)
accuracy_logiregression=calculate_accuracy(y_pred_sample,y_train_sample)
print(f1_score_logiregression,accuracy_logiregression)


0.4020530367835757 0.893487337335812


When lambda=0.1, threshold=0.3