## 使用深度学习方法进行建模

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the listings and discard every row that has at least one missing value.
data = pd.read_csv("dubai_properties.csv").dropna()
# Plain-text preview of the first three rows, followed by the rich display of the frame.
print(data.head(3))
data

                                             Address    Rent  Beds  Baths  \
0  The Gate Tower 2, The Gate Tower, Shams Gate D...  124000     3      4   
1                Water's Edge, Yas Island, Abu Dhabi  140000     3      4   
2            Al Raha Lofts, Al Raha Beach, Abu Dhabi   99000     2      3   

        Type  Area_in_sqft  Rent_per_sqft Rent_category Frequency  \
0  Apartment          1785      69.467787        Medium    Yearly   
1  Apartment          1422      98.452883        Medium    Yearly   
2  Apartment          1314      75.342466        Medium    Yearly   

    Furnishing   Purpose Posted_date  Age_of_listing_in_days        Location  \
0  Unfurnished  For Rent  2024-03-07                      45  Al Reem Island   
1  Unfurnished  For Rent  2024-03-08                      44      Yas Island   
2    Furnished  For Rent  2024-03-21                      31   Al Raha Beach   

        City   Latitude  Longitude  
0  Abu Dhabi  24.493598  54.407841  
1  Abu Dhabi  24.49

Unnamed: 0,Address,Rent,Beds,Baths,Type,Area_in_sqft,Rent_per_sqft,Rent_category,Frequency,Furnishing,Purpose,Posted_date,Age_of_listing_in_days,Location,City,Latitude,Longitude
0,"The Gate Tower 2, The Gate Tower, Shams Gate D...",124000,3,4,Apartment,1785,69.467787,Medium,Yearly,Unfurnished,For Rent,2024-03-07,45,Al Reem Island,Abu Dhabi,24.493598,54.407841
1,"Water's Edge, Yas Island, Abu Dhabi",140000,3,4,Apartment,1422,98.452883,Medium,Yearly,Unfurnished,For Rent,2024-03-08,44,Yas Island,Abu Dhabi,24.494022,54.607372
2,"Al Raha Lofts, Al Raha Beach, Abu Dhabi",99000,2,3,Apartment,1314,75.342466,Medium,Yearly,Furnished,For Rent,2024-03-21,31,Al Raha Beach,Abu Dhabi,24.485931,54.600939
3,"Marina Heights, Marina Square, Al Reem Island,...",220000,3,4,Penthouse,3843,57.246942,High,Yearly,Unfurnished,For Rent,2024-02-24,57,Al Reem Island,Abu Dhabi,24.493598,54.407841
4,"West Yas, Yas Island, Abu Dhabi",350000,5,7,Villa,6860,51.020408,High,Yearly,Unfurnished,For Rent,2024-02-16,65,Yas Island,Abu Dhabi,24.494022,54.607372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73729,"Al Salamah, Umm Al Quwain",10000,0,1,Apartment,249,40.160643,Low,Yearly,Unfurnished,For Rent,2023-12-09,134,Al Salamah,Umm Al Quwain,25.493412,55.575994
73734,"Al Ramlah, Umm Al Quwain",25000,1,1,Villa,500,50.000000,Low,Yearly,Unfurnished,For Rent,2023-12-01,142,Al Ramlah,Umm Al Quwain,25.511461,55.578804
73735,"Umm Al Quwain Marina, Umm Al Quwain",22000,2,2,Apartment,1000,22.000000,Low,Yearly,Unfurnished,For Rent,2024-03-26,26,Umm Al Quwain Marina,Umm Al Quwain,25.527959,55.606527
73737,"Al Huboob 1, Al Salamah, Umm Al Quwain",14000,0,1,Apartment,419,33.412888,Low,Yearly,Unfurnished,For Rent,2023-12-14,129,Al Salamah,Umm Al Quwain,25.493412,55.575994


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Build the feature matrix and the regression target.
# 'Address' is free text and 'Rent' is the target itself.
# 'Rent_per_sqft' is excluded because Rent_per_sqft * Area_in_sqft reproduces
# Rent exactly (e.g. 69.467787 * 1785 ~= 124000 in the first row), which would
# leak the target into the features.
# 'Rent_category' (Low/Medium/High) appears to be a binning of Rent as well --
# NOTE(review): confirm against the dataset description; it is dropped here to
# avoid the same leakage.
LEAKY_OR_UNUSED_COLUMNS = ['Rent', 'Address', 'Rent_per_sqft', 'Rent_category']
X = data.drop(columns=LEAKY_OR_UNUSED_COLUMNS)
y = data['Rent']

ModuleNotFoundError: No module named 'torch'

In [None]:
# Integer-encode every object-typed (categorical) column of X in place.
# One fitted LabelEncoder is kept per column so the mapping can be inspected later.
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    X[name] = encoder.fit_transform(X[name])

# Preview the encoded frame and report how many distinct classes each column has.
print(X.head(2))
for column, le in label_encoders.items():
    num_classes = len(le.classes_)
    print(f"列 '{column}' 有 {num_classes} 种分类。")


In [None]:
# Remove the columns that were found to contain a single category only;
# a constant feature carries no information for the model.
X = X.drop(columns=['Frequency', 'Purpose'])

In [None]:
# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split performed later in the notebook.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

# Sanity checks: first rows of the scaled matrix and the shapes of features/target.
print(X_scaled[:3])
print(X_scaled.shape)
print(y.shape)

In [None]:
# Feature scaling worked, so min-max scale the target as well.
# MinMaxScaler expects 2-D input, hence the Series -> single-column DataFrame step.
y_from_to_frame = y.to_frame()
print(y_from_to_frame.shape)
# Use a dedicated scaler for the target: calling fit_transform on the feature
# `scaler` again would overwrite its fitted state, making it unusable for
# transforming new feature rows, and there would be no separate object for
# mapping predictions back to rent values.
y_scaler = MinMaxScaler()
y_scaled = y_scaler.fit_transform(y_from_to_frame)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

# Convert the NumPy arrays into float32 PyTorch tensors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Quick look at a few training rows and the resulting tensor sizes.
print(X_train_tensor[:3].numpy())
print(X_train_tensor.size())
print(y_test_tensor.size())

In [None]:
# 3 Build the neural network
class RentPredictor(nn.Module):
    """Fully connected regressor: input_dim -> 64 -> 128 -> 32 -> 1.

    ReLU is applied after each of the three hidden layers; the final layer is
    linear and yields one value per input row.
    """

    def __init__(self, input_dim):
        """Create the four linear layers.

        Args:
            input_dim: Number of features in each input row.
        """
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 128)
        self.layer3 = nn.Linear(128, 32)
        self.layer4 = nn.Linear(32, 1)

    def forward(self, x):
        """Run a forward pass and return the output of the last linear layer."""
        hidden = x
        for hidden_layer in (self.layer1, self.layer2, self.layer3):
            hidden = torch.relu(hidden_layer(hidden))
        return self.layer4(hidden)

In [None]:
from torch.utils.data import DataLoader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Instantiate the network with one input per feature column and move it to the device.
model = RentPredictor(X.shape[1])
model = model.to(device)

# Move the train/test tensors to the same device as the model.
X_train_tensor_gpu, X_test_tensor_gpu, y_train_tensor_gpu, y_test_tensor_gpu = (
    tensor.to(device)
    for tensor in (X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor)
)

# Wrap the tensors in datasets and batch them with data loaders.
batch_size = 256
train_dataset = torch.utils.data.TensorDataset(X_train_tensor_gpu, y_train_tensor_gpu)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor_gpu, y_test_tensor_gpu)
# Training batches are reshuffled every epoch.
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation batches keep their original order (shuffling disabled).
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training configuration: number of epochs, mean-squared-error loss, Adam optimizer.
epochs = 1
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

# Train the model and evaluate it on the held-out split after every epoch.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum = 0.0
    train_count = 0
    for X_batch, y_batch in train_data_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        # Give the target exactly the prediction's shape. The targets come from
        # a 2-D scaled array, so they are already (batch, 1); an extra
        # unsqueeze(1) would make them (batch, 1, 1) and MSELoss would silently
        # broadcast to (batch, batch, 1), comparing every prediction with every
        # target instead of its own.
        loss = criterion(y_pred, y_batch.reshape(y_pred.shape))
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch mean is exact even when the last
        # batch is smaller than the others.
        train_loss_sum += loss.item() * X_batch.size(0)
        train_count += X_batch.size(0)

    # ---- evaluation pass ----
    model.eval()
    val_loss_sum = 0.0
    val_count = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in test_data_loader:
            y_val_pred = model(X_val_batch)
            val_loss = criterion(y_val_pred, y_val_batch.reshape(y_val_pred.shape))
            val_loss_sum += val_loss.item() * X_val_batch.size(0)
            val_count += X_val_batch.size(0)

    # Report the mean loss over the whole epoch rather than only the last batch.
    epoch_train_loss = train_loss_sum / max(train_count, 1)
    epoch_val_loss = val_loss_sum / max(val_count, 1)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_train_loss}, Val Loss: {epoch_val_loss}')


In [None]:
# 5. Compute the evaluation metrics on the test split.
# All values are computed on the min-max-scaled target, not on raw rent values.
model.eval()
with torch.no_grad():
    y_test_pred = model(X_test_tensor_gpu)
    # Use the on-device targets with exactly the prediction's shape:
    #  * an extra unsqueeze(1) on the already 2-D targets would broadcast
    #    (N, 1) against (N, 1, 1) into an N x N comparison and give wrong values;
    #  * mixing the CPU tensor with on-device predictions fails when running on CUDA.
    y_test_true = y_test_tensor_gpu.reshape(y_test_pred.shape)
    test_loss = criterion(y_test_pred, y_test_true)
    mse = test_loss.item()
    rmse = torch.sqrt(test_loss).item()
    # R^2 = 1 - SS_res / SS_total
    y_mean = torch.mean(y_test_true)
    ss_total = torch.sum((y_test_true - y_mean) ** 2)
    ss_res = torch.sum((y_test_true - y_test_pred) ** 2)
    r2 = 1 - (ss_res / ss_total).item()

# Print the metrics
print(f'Test MSE: {mse}')
print(f'Test RMSE: {rmse}')
print(f'Test R²: {r2}')

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Cross-check the metrics with scikit-learn on the same test split.
model.eval()
with torch.no_grad():
    y_val_pred = model(X_test_tensor.to(device))

# Bring the predictions back to host memory and convert them to a NumPy array
# so they can be passed to the scikit-learn metric functions.
y_val_pred_cpu = y_val_pred.cpu()
y_val_pred_numpy = y_val_pred_cpu.numpy()

# MSE, RMSE and R^2 of the predictions against the test targets.
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_val_pred_numpy)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_true=y_test, y_pred=y_val_pred_numpy)

# Print the metrics
print(f'Test MSE: {mse_lr}')
print(f'Test RMSE: {rmse_lr}')
print(f'Test R²: {r2_lr}')