In [12]:
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as Data
import numpy as np
from sklearn.preprocessing import StandardScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#读取数据
train1=pd.read_csv('C:/Users/31665/Downloads/boston-housing/train.csv')
test1=pd.read_csv('C:/Users/31665/Downloads/boston-housing/test.csv')
#删除一下ID列
train = train1.iloc[:, 1:]
test = test1.iloc[:, 1:] 
#查看数据的基本信息
train.shape
test.shape
#看看数据是否有缺失值
print(train.isnull().sum())
print(test.isnull().sum())

#检测异常值，使用IQR方法
Q1 = train.quantile(0.25)
Q3 = train.quantile(0.75)
IQR = Q3 - Q1

outliers = ((train < (Q1 - 1.5 * IQR)) | (train > (Q3 + 1.5* IQR))).any(axis=0)#any只要检测到异常便会false，axis=0表示对列检测
num_outliers = outliers.sum()

# 计算异常值比例
total_samples = train.shape[0]
outlier_ratio = (num_outliers / total_samples) * 100

print(f"异常值数量: {num_outliers}")
print(f"异常值比例: {outlier_ratio:.2f}%")

train1 = train.copy()
for col in train.columns[:-1]:# 遍历每个特征列除去最后一列
    Q1, Q3 = train[col].quantile(0.25), train[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # 用中位数替换超出范围的异常值，使用np.where，满足条件将异常值替换为中位数
    median = train[col].median()
    train1[col] = np.where(
        (train[col] < lower_bound) | (train[col] > upper_bound),
        median,  
        train[col]
    )

#进行数据划分，x1将最后一列去除
x1_data=train.iloc[:, :-1].values
y1_data=train.medv.values
x2_data=test.values

#进行数据标准化
scaler=StandardScaler()

x1_data=scaler.fit_transform(x1_data)#只对测试集进行fit操作保证标准一样
x2_data=scaler.transform(x2_data)
#将数据转换为张量
x1_data=torch.tensor(x1_data,dtype=torch.float32)
x2_data=torch.tensor(x2_data,dtype=torch.float32)
y1_data=torch.tensor(y1_data,dtype=torch.float32).view(-1,1)
#将数据移到device上
x1_data, x2_data, y1_data = x1_data.to(device), x2_data.to(device), y1_data.to(device)
#使用dataset和dataloader
train_data=Data.TensorDataset(x1_data,y1_data)
train_loader=Data.DataLoader(dataset=train_data,
                             batch_size=32,
                             shuffle=True,
                             drop_last=True,
                             num_workers=0)
#定义神经网络,定义了一个3层全连接
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1=nn.Linear(13, 64)
        self.bn1=nn.BatchNorm1d(64)#进行批归一化，加速收敛
        self.fc2=nn.Linear(64, 32)
        self.bn2=nn.BatchNorm1d(32)
        self.fc3=nn.Linear(32, 1)
        self.relu=nn.ReLU()
        self.dropout = nn.Dropout(0.3) 

    def forward(self, x):
        x=self.bn1(self.relu(self.fc1(x)))
        x=self.dropout(x)
        x=self.bn2(self.relu(self.fc2(x)))
        x=self.fc3(x)
        return x
        

model = Model().to(device)

criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

for epoch in range(500):
    model.train()
    running_loss = 0.0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        y_pred = model(batch_x)
        loss = criterion(y_pred, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    
    if epoch%10==0:
        
         print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader):.4f}')


model.eval()
with torch.no_grad():
    
    y_test_pred = model(x2_data)
    y_test_pred = y_test_pred.cpu().numpy()
    first_column = test1.iloc[:, 0].values

    predictions_df = pd.DataFrame({
        'ID': first_column,
        'medv': y_test_pred.flatten()  
    })

    print(predictions_df)

model.train()

predictions_df.to_csv('predictions.csv', index=False)



crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
medv       0
dtype: int64
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
dtype: int64
异常值数量: 9
异常值比例: 2.70%
Epoch 1, Loss: 611.7552
Epoch 11, Loss: 525.6938
Epoch 21, Loss: 432.8469
Epoch 31, Loss: 327.1600
Epoch 41, Loss: 200.8491
Epoch 51, Loss: 104.0177
Epoch 61, Loss: 42.8302
Epoch 71, Loss: 19.6672
Epoch 81, Loss: 11.5043
Epoch 91, Loss: 11.1980
Epoch 101, Loss: 10.9779
Epoch 111, Loss: 7.6141
Epoch 121, Loss: 9.4869
Epoch 131, Loss: 9.3116
Epoch 141, Loss: 9.4097
Epoch 151, Loss: 6.2439
Epoch 161, Loss: 9.3153
Epoch 171, Loss: 8.0015
Epoch 181, Loss: 9.0845
Epoch 191, Loss: 7.1990
Epoch 201, Loss: 7.5598
Epoch 211, Loss: 7.8753
Epoch 221, Loss: 7.0776
Epoch 231, Loss: 10.4523
Epoch 241, Loss: 7