In [13]:
import pandas as pd
import pickle
# # 从文件中读取DataFrame对象
# with open('jianhuatest.pkl', 'rb') as f:
#     encoded_df = pickle.load(f)
# Load the two training sources: the numeric table (CSV) and the report texts (Excel).
df_a = pd.read_csv(r"D:\桌面\大数据竞赛\训练集竞赛2\cwtz_train.csv")
df_b = pd.read_excel(r"D:\桌面\大数据竞赛\文本连接.xlsx")

# Numeric side: skip the first column, use the middle columns as features
# and the last column as the target.
X_numerical = df_a.iloc[:, 1:-1].to_numpy()
y = df_a.iloc[:, -1].values

# Text side: the "Report Texts" column (presumably one report per sample).
# NOTE(review): this assumes df_b rows are in the same order as df_a rows;
# nothing in this cell checks that alignment.
X_text = df_b["Report Texts"]

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim

In [3]:
# df_a = pd.read_csv('train.csv').sample(frac = 0.1, random_state=42)
# X_numerical = df_a.iloc[:, 1:-2].values
# y = df_a.iloc[:, -2].values
# X_text = df_a.iloc[:, -1]

In [15]:
# Bag-of-words encoding of the report texts, densified into a NumPy array.
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(X_text).toarray()

# Standardise the numerical features with a fitted StandardScaler.
scaler = StandardScaler().fit(X_numerical)
X_numerical = scaler.transform(X_numerical)

# NOTE(review): both transformers are fitted on the full data before the
# split below, so hold-out rows influence the vocabulary and scaling statistics.

# 80/20 train/test split with a fixed seed; the three arrays are split together.
(
    X_text_train, X_text_test,
    X_numerical_train, X_numerical_test,
    y_train, y_test,
) = train_test_split(X_text, X_numerical, y, test_size=0.2, random_state=0)

In [16]:
# Number of bag-of-words columns (vocabulary size) in the dense text matrix.
X_text.shape[1]

42668

In [17]:
# Number of samples (rows) in the dense text matrix.
X_text.shape[0]

7236

In [18]:
# Convert the NumPy splits into PyTorch tensors.
def _to_float32(array):
    """Copy ``array`` into a new float32 tensor."""
    return torch.tensor(array, dtype=torch.float32)


# The text counts keep the dtype inferred from the NumPy array
# (presumably an integer dtype, which nn.Embedding requires for its indices).
X_text_train = torch.tensor(X_text_train)
X_text_test = torch.tensor(X_text_test)
X_numerical_train, X_numerical_test = _to_float32(X_numerical_train), _to_float32(X_numerical_test)
y_train, y_test = _to_float32(y_train), _to_float32(y_test)

# Bundle (text, numerical, label) triples into datasets.
train_dataset = TensorDataset(X_text_train, X_numerical_train, y_train)
test_dataset = TensorDataset(X_text_test, X_numerical_test, y_test)

In [19]:
# Batch size and data loaders.
batch_size = 16
# Reshuffle the training samples at the start of every epoch so the
# mini-batches differ between epochs; without shuffle=True each epoch
# replays exactly the same batches in the same order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation keeps a fixed order so predictions line up with the targets.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [21]:
# Model definition
class MyModel(nn.Module):
    """Binary classifier that fuses an LSTM text encoding with numerical features.

    The text input is embedded, run through a single-layer LSTM, and the LSTM
    output at the last time step is concatenated with the numerical features
    before a two-layer fully connected head with a sigmoid output.

    Args:
        vocab_size: Number of rows in the embedding table; every value in
            ``text`` must be an integer index in ``[0, vocab_size)``.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        numerical_dim: Number of numerical features per sample.
        fc_dim: Width of the hidden fully connected layer.

    The defaults reproduce the sizes that used to be hard-coded; the old
    ``in_features=116`` of ``linear1`` is ``hidden_size + numerical_dim``
    (64 + 52), not the vocabulary size.

    NOTE(review): in this notebook ``text`` is the dense CountVectorizer count
    matrix, so word counts are looked up as embedding indices and the LSTM
    runs one time step per vocabulary column. That is presumably why training
    takes hours; a real token-id sequence (or a linear layer on the counts)
    would be needed to fix it.
    """

    def __init__(self, vocab_size=116, embedding_dim=100, hidden_size=64,
                 numerical_dim=52, fc_dim=32):
        super().__init__()
        # Layers are created in the original order so seeded initialisation is unchanged.
        self.text_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.text_rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        # The head consumes the last LSTM output concatenated with the numerical features.
        self.linear1 = nn.Linear(in_features=hidden_size + numerical_dim, out_features=fc_dim)
        self.linear2 = nn.Linear(in_features=fc_dim, out_features=1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, numerical):
        """Return one probability per sample, shape ``(batch, 1)``.

        Args:
            text: Integer index tensor of shape ``(batch, seq_len)``.
            numerical: Float tensor of shape ``(batch, numerical_dim)``.
        """
        text = self.text_embedding(text)       # (batch, seq_len, embedding_dim)
        text_output, _ = self.text_rnn(text)   # (batch, seq_len, hidden_size)
        text_output = text_output[:, -1, :]    # keep only the last time step
        x = torch.cat([text_output, numerical], dim=1)
        x = self.relu(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        return x

In [22]:
# Use the GPU when CUDA is available, otherwise the CPU, and place the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = MyModel()
model = model.to(device)

In [23]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam
# using its default hyper-parameters.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
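# Note: because of a flaw in the model design, the training code below takes several hours to run and still performs poorly; working out the cause is left to the reader.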

In [24]:
# Train the model, evaluating on the held-out split after every epoch.
num_epochs = 4
train_losses = []      # mean training loss per epoch
test_losses = []       # mean held-out loss per epoch
test_auc_scores = []   # held-out ROC AUC per epoch
for epoch in range(num_epochs):
    # --- Training pass ---
    train_loss = 0.0
    model.train()
    for batch in train_dataloader:
        text, numerical, label = batch
        text = text.to(device)
        numerical = numerical.to(device)
        label = label.to(device)
        optimizer.zero_grad()
        output = model(text, numerical)
        # squeeze(1) removes only the feature axis, so a final batch of size 1
        # keeps shape (1,) and still matches `label`.
        loss = criterion(output.squeeze(1), label)
        loss.backward()
        optimizer.step()
        # BCELoss returns the batch mean, so weight it by the number of samples.
        # len(batch) was the tuple length (always 3), not the batch size.
        train_loss += loss.item() * label.size(0)
    train_loss /= len(train_dataset)
    train_losses.append(train_loss)

    # --- Evaluation pass ---
    test_loss = 0.0
    predictions = []
    targets = []
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            text, numerical, label = batch
            text = text.to(device)
            numerical = numerical.to(device)
            label = label.to(device)
            output = model(text, numerical)
            loss = criterion(output.squeeze(1), label)
            test_loss += loss.item() * label.size(0)
            predictions.append(output.cpu().numpy())
            targets.append(label.cpu().numpy())
    test_loss /= len(test_dataset)
    test_losses.append(test_loss)
    # Stack the per-batch arrays before scoring.
    predictions = np.concatenate(predictions)
    targets = np.concatenate(targets)
    test_auc_score = roc_auc_score(targets, predictions)
    test_auc_scores.append(test_auc_score)

    print(f"Epoch {epoch + 1:02d}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Test Loss: {test_loss:.4f}, "
          f"Test AUC: {test_auc_score:.4f}")

KeyboardInterrupt: 

In [None]:
# Plot the per-epoch loss curves (left) and the test AUC (right).
import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: training vs. test loss.
ax[0].plot(train_losses, label="Train Loss")
ax[0].plot(test_losses, label="Test Loss")
ax[0].set(xlabel="Epoch", ylabel="Loss")
ax[0].legend()

# Right panel: test AUC.
ax[1].plot(test_auc_scores)
ax[1].set(xlabel="Epoch", ylabel="AUC")

plt.show()

In [None]:
# Sum of the predicted probabilities left in `predictions` by the training cell
# (the array from the most recently completed evaluation pass).
sum(predictions)

In [None]:
# Sum of the labels left in `targets` by the training cell
# (presumably the number of positive samples, if labels are 0/1 — TODO confirm).
sum(targets)

##  测试集给出结果

In [None]:
# Load the competition test files; this rebinds df_a / df_b from the training data.
df_a = pd.read_csv(r"D:\桌面\cwtz_test.csv")
df_b = pd.read_excel(r"D:\桌面\连接文本test.xlsx")

# Split into numerical features and text.
# NOTE(review): training used columns 1:-1 of the CSV and the "Report Texts"
# column of the workbook; here it is columns :-1 and the second column by
# position. Confirm against the test file layout that these select the same features.
X_numerical_test = df_a.iloc[:, :-1].to_numpy()
X_text_test = df_b.iloc[:, 1].values


In [None]:
# Encode the test texts with the vocabulary learned from the training texts.
# Fitting a fresh CountVectorizer here would build a different vocabulary, so
# column j would no longer mean the same word as during training. This reuses
# the `vectorizer` fitted in the training preprocessing cell.
X_text_test = vectorizer.transform(X_text_test).toarray()

# Scale with the statistics fitted on the training features (the `scaler`
# from the training preprocessing cell); re-fitting on the test set would
# shift the features relative to what the model was trained on.
X_numerical_test = scaler.transform(X_numerical_test)

In [None]:
# Convert the preprocessed test features into PyTorch tensors.
X_text_test = torch.tensor(X_text_test)
X_numerical_test = torch.tensor(X_numerical_test, dtype=torch.float32)

# Wrap them in a label-free dataset and a loader (batch size 32 for inference).
test_dataset = TensorDataset(X_text_test, X_numerical_test)
batch_size = 32
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Run the trained model over the test set.
predictions = []
model.eval()  # evaluation mode
with torch.no_grad():  # no gradients are needed for inference
    for text, numerical in test_dataloader:
        text = text.to(device)
        numerical = numerical.to(device)
        output = model(text, numerical)
        # Each output row becomes a one-element list [probability].
        predictions.extend(output.cpu().tolist())


In [None]:
# Write the predicted probabilities into the results workbook.
RESULT_PATH = r"D:\桌面\竞赛二结果.xlsx"

# Every entry of `predictions` is a one-element list; unwrap the value.
jieguo = [row[0] for row in predictions]

# Add the prediction column to the existing workbook and save it back in place.
df_test = pd.read_excel(RESULT_PATH)
df_test['预测结果'] = jieguo
df_test.to_excel(RESULT_PATH, index=False)

In [None]:
# Number of predictions produced for the test set.
len(predictions)

In [None]:
# Line plot of the predicted probabilities in test-set row order.
# Relies on `plt` having been imported in the loss/AUC plotting cell.
plt.plot(jieguo)

## 模块展开详解

In [None]:
class MyModel(nn.Module):
    """Binary classifier that fuses an LSTM text encoding with numerical features.

    The text input is embedded, run through a single-layer LSTM, and the LSTM
    output at the last time step is concatenated with the numerical features
    before a two-layer fully connected head with a sigmoid output.

    Args:
        vocab_size: Number of rows in the embedding table; every value in
            ``text`` must be an integer index in ``[0, vocab_size)``.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        numerical_dim: Number of numerical features per sample.
        fc_dim: Width of the hidden fully connected layer.

    The defaults reproduce the sizes that used to be hard-coded; the old
    ``in_features=116`` of ``linear1`` is ``hidden_size + numerical_dim``
    (64 + 52), not the vocabulary size.

    NOTE(review): in this notebook ``text`` is the dense CountVectorizer count
    matrix, so word counts are looked up as embedding indices and the LSTM
    runs one time step per vocabulary column. That is presumably why training
    takes hours; a real token-id sequence (or a linear layer on the counts)
    would be needed to fix it.
    """

    def __init__(self, vocab_size=116, embedding_dim=100, hidden_size=64,
                 numerical_dim=52, fc_dim=32):
        super().__init__()
        # Layers are created in the original order so seeded initialisation is unchanged.
        self.text_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.text_rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        # The head consumes the last LSTM output concatenated with the numerical features.
        self.linear1 = nn.Linear(in_features=hidden_size + numerical_dim, out_features=fc_dim)
        self.linear2 = nn.Linear(in_features=fc_dim, out_features=1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, numerical):
        """Return one probability per sample, shape ``(batch, 1)``.

        Args:
            text: Integer index tensor of shape ``(batch, seq_len)``.
            numerical: Float tensor of shape ``(batch, numerical_dim)``.
        """
        text = self.text_embedding(text)       # (batch, seq_len, embedding_dim)
        text_output, _ = self.text_rnn(text)   # (batch, seq_len, hidden_size)
        text_output = text_output[:, -1, :]    # keep only the last time step
        x = torch.cat([text_output, numerical], dim=1)
        x = self.relu(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        return x

In [None]:
# Toy-sized copies of the model's layers for a step-by-step walkthrough.
text_embedding = nn.Embedding(10, 5)           # 10 indices, 5-dimensional vectors
text_rnn = nn.LSTM(5, 4, batch_first=True)     # 5 input features, 4 hidden units
linear1 = nn.Linear(7, 5)                      # 7 inputs = 4 LSTM features + 3 numerical features
linear2 = nn.Linear(5, 1)                      # single output unit
relu = nn.ReLU()
sigmoid = nn.Sigmoid()

In [None]:
# Two token indices as a 1-D tensor (no batch axis).
text = torch.tensor([1, 2])

In [None]:
# Look up one 5-dimensional embedding vector per index.
# NOTE: `text` is rebound here, so this cell is not idempotent: re-running it
# without re-running the previous cell would feed floats to the embedding.
text=text_embedding(text)

In [None]:
# Display the embedded vectors (one row per index).
text

In [None]:
# Run the LSTM over the embedded input; `_` takes the second return value
# (the final hidden/cell states), which is not used.
# NOTE(review): `text` is 2-D here, so the LSTM appears to treat it as a single
# unbatched 2-step sequence rather than a batch of two samples. Unlike
# MyModel.forward, no last-time-step slice `[:, -1, :]` is applied afterwards.
text_output, _ = text_rnn(text)

In [None]:
# Display the LSTM outputs (4 hidden features per row).
text_output

In [None]:
# Two rows of three numerical features. They are created as float32, like the
# model's real numerical input, so the concatenation below does not depend on
# implicit integer-to-float promotion inside torch.cat.
numer = torch.tensor([[1, 2, 3], [2, 3, 4]], dtype=torch.float32)
# 4 LSTM features + 3 numerical features = 7 columns, matching linear1's in_features.
x = torch.cat([text_output, numer], dim=1)

In [None]:
# Display the concatenated feature matrix.
x

In [None]:
# Hidden layer: affine projection from 7 to 5 features, then ReLU.
x = linear1(x)
x = relu(x)
x

In [None]:
# Output layer: project to a single value per row and squash it with a sigmoid.
x = linear2(x)
x = sigmoid(x)
x