In [2]:
import numpy as np
from logistic_regression import *
from basic_functions import *

# Read the preprocessed training features and the label file (header row skipped).
x_train_pre = np.loadtxt("x_train_processed_0_missing_1029.csv", delimiter=",", skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=",", skiprows=1)

# Keep every row but drop the first column of the label file.
# This is a view, so the in-place recoding below also writes into y_train.
y_train_pre = y_train[:, 1:]
x_test_pre = np.loadtxt("x_test_processed_0_missing_1029.csv", delimiter=",", skiprows=1)

# Recode labels: -1 becomes 0.
y_train_pre[y_train_pre == -1] = 0

# Prepend a column of ones (bias term) to the train and test feature matrices.
tx = np.concatenate((np.ones((len(x_train_pre), 1)), x_train_pre), axis=1)
tx_test = np.concatenate((np.ones((len(x_test_pre), 1)), x_test_pre), axis=1)

# Read the first column of the sample submission file as integers.
sample_submission = np.loadtxt(
    'sample-submission.csv', delimiter=',', skiprows=1, usecols=0, dtype=int
)

# Training hyper-parameters and zero-initialised weights (one row per column of tx).
initial_w = np.zeros((tx.shape[1], 1))
max_iters = 1000
gamma = 0.01
batch_size = 64

# Sanity-check output: the design matrix, the initial weights, and their dimensions.
print(
    tx,
    initial_w,
    tx.shape[1],
    tx.shape[0],
    initial_w.shape[1],
    initial_w.shape[0],
    sep="\n",
)

def mini_batch_logistic_regression(y, tx, initial_w, max_iters, gamma, batch_size, log_every=100):
    """Fit logistic-regression weights with sequential mini-batch gradient descent.

    Every pass walks the data in its stored order, in consecutive slices of
    ``batch_size`` rows (the final slice may be shorter), and applies one
    gradient step per slice. The data is not shuffled between passes.

    Args:
        y: Labels; sliced along the first axis in lock-step with ``tx``.
        tx: Feature matrix; sliced along the first axis in lock-step with ``y``.
        initial_w: Starting weights. This function never assigns into it;
            each update rebinds ``w`` to a new value.
        max_iters: Number of full passes over the data.
        gamma: Step size applied to each gradient.
        batch_size: Number of rows per mini-batch; must be positive.
        log_every: Print progress on every pass whose index is a multiple of
            this value. ``0`` or ``None`` disables progress output.

    Returns:
        Tuple ``(w, loss)``. ``loss`` is the value ``calculate_loss`` returned
        for the last mini-batch processed, evaluated *before* that batch's
        weight update; it is not a loss over the whole data set. ``loss`` is
        ``None`` when no mini-batch was processed (``max_iters`` is 0 or ``y``
        is empty).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make the batch range empty and silently skip training.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    w = initial_w
    # Defined up front so the progress print and the return below are valid
    # even when no mini-batch is ever processed.
    loss = None
    for i in range(max_iters):
        for batch_start in range(0, len(y), batch_size):
            batch_end = batch_start + batch_size
            y_batch = y[batch_start:batch_end]
            tx_batch = tx[batch_start:batch_end]

            # Loss and gradient on the current mini-batch, at the current weights.
            loss = calculate_loss(y_batch, tx_batch, w)
            gradient = calculate_gradient(y_batch, tx_batch, w)

            # Gradient-descent step.
            w = w - gamma * gradient

        if log_every and i % log_every == 0:
            print(f"当前迭代次数：{i}, 损失：{loss}")
    return w, loss

# Train the logistic-regression model on the full training set.
w, loss = mini_batch_logistic_regression(
    y_train_pre, tx, initial_w, max_iters, gamma, batch_size
)

print(w)

# Report the loss value returned by training.
print(f"最终损失: {loss}")

# Predict on the test set with the trained weights.
y_pred_test = predict(tx_test, w)

# Predictions equal to 0 are written as -1; every other value passes through.
y_pred_test = np.where(y_pred_test == 0, -1, y_pred_test)

# Save as .csv: one column of ids, one column of predictions.
header = "Id,Prediction"
results = np.concatenate(
    (sample_submission.reshape(-1, 1), y_pred_test.reshape(-1, 1)),
    axis=1,
)

np.savetxt(
    'C:/Users/y/Documents/ml_exercise/ML_project_1/result_logistic_regression.csv',
    results,
    delimiter=',',
    header=header,
    comments='',
    fmt='%d',
)

[[ 1.          1.4362964   0.         ...  0.          1.
   0.        ]
 [ 1.          0.18877218  0.         ...  0.          1.
   0.        ]
 [ 1.         -0.62211856  0.         ...  0.          1.
   0.        ]
 ...
 [ 1.          0.56302944  0.         ...  0.          1.
   0.        ]
 [ 1.          0.18877218  0.         ...  0.          1.
   0.        ]
 [ 1.          0.12639597  0.         ...  0.          1.
   0.        ]]
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
1414
328135
1
1414
当前迭代次数：0, 损失：0.02751289796667825
当前迭代次数：100, 损失：0.02394028829116565
当前迭代次数：200, 损失：0.023661144363907215
当前迭代次数：300, 损失：0.02352038267771307
当前迭代次数：400, 损失：0.023431769958925482
当前迭代次数：500, 损失：0.023371008737592274
当前迭代次数：600, 损失：0.02332767969887362
当前迭代次数：700, 损失：0.023295892396115847
当前迭代次数：800, 损失：0.023271929554719396
当前迭代次数：900, 损失：0.023253354812806683
[[ 0.00609276]
 [ 0.01591066]
 [-0.20514364]
 ...
 [-0.01897061]
 [ 0.00117247]
 [ 0.02389091]]
最终损失: 0.02323868012973748


In [34]:
# Predict on the test set with the trained weights.
y_pred_test = predict(tx_test, w)

# Count the NaN entries in y_pred_test and show the first ten predictions.
num_nans = np.sum(np.isnan(y_pred_test))
print(y_pred_test[:10])
print(f"y_pred_test 中有 {num_nans} 个 NaN")

# Training recoded the label -1 as 0, so map any 0 prediction back to -1
# before writing the submission. NaN entries are unaffected by this step.
y_pred_test = np.where(y_pred_test == 0, -1, y_pred_test)

# Replace NaN predictions with -1 so they can be written with the integer format.
# NOTE(review): NaN predictions presumably indicate an upstream problem in the
# test features or in predict(); confirm before relying on this fallback.
y_pred_test = np.nan_to_num(y_pred_test, nan=-1)

# Save as .csv: one column of ids, one column of predictions.
header = "Id,Prediction"
results = np.hstack((sample_submission.reshape(-1, 1), y_pred_test.reshape(-1, 1)))

np.savetxt('C:/Users/y/Documents/ml_exercise/ML_project_1/result_logistic_regression.csv', 
           results, delimiter=',', header=header, comments='', fmt='%d')



[[ 1.]
 [nan]
 [ 1.]
 [ 1.]
 [ 1.]
 [nan]
 [nan]
 [ 1.]
 [ 1.]
 [ 1.]]
y_pred_test 中有 46116 个 NaN


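The following cells evaluate the trained model on a random 10% sample of the training set. Because the model was fit on the full training set, these scores are in-sample and do not come from a held-out test set.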

In [3]:
# Draw a random 10% subset of the training rows, without replacement.
np.random.seed(42)  # fixed seed so the same rows are drawn on every run
num_samples = len(tx)
sample_size = int(num_samples * 0.1)
indices = np.random.choice(a=num_samples, size=sample_size, replace=False)

# Pull the chosen rows out of the design matrix and the label array.
x_train_sample = np.take(tx, indices, axis=0)
y_train_sample = np.take(y_train_pre, indices, axis=0)

# Show the full and sampled shapes, then the first ten sampled labels.
print(
    tx.shape,
    x_train_sample.shape,
    y_train_sample.shape,
    y_train_sample[:10],
    sep="\n",
)

(328135, 1414)
(32813, 1414)
(32813, 1)
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [5]:
# Predict on the sampled training rows with the trained weights, then score
# those predictions against the sampled labels.
y_pred_sample = predict(x_train_sample, w)
f1_score_logiregression = calculate_f1_score(y_pred_sample, y_train_sample)
accuracy_logiregression = calculate_accuracy(y_pred_sample, y_train_sample)

# F1 score first, accuracy second, on one line.
print(f1_score_logiregression, accuracy_logiregression, sep=" ")


0.24736699972994872 0.9150641514034071
