In [None]:
# Imports
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [10]:
# Data preparation: read the CSV and separate predictor columns from the target.
df = pd.read_csv("cwurData.csv")
features = [
    "quality_of_education",
    "alumni_employment",
    "quality_of_faculty",
    "publications",
    "influence",
    "citations",
    "patents",
    "score",
]
X = df[features].to_numpy()  # 2-D array with one column per entry in `features`
y = df["world_rank"].to_numpy()  # regression target

In [11]:
# Fill missing values with means.
# NOTE(review): the means are taken over the full data set, i.e. before the
# train/test split that happens later in the notebook.
column_means = np.nanmean(X, axis=0)  # per-column mean of X, ignoring NaNs
X = np.nan_to_num(X, nan=column_means)
target_mean = np.nanmean(y)  # mean of y, ignoring NaNs
y = np.nan_to_num(y, nan=target_mean)

In [12]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Check whether the data contains NaN or inf values.
# Each split is reported separately; the NaN and inf checks are independent so
# the message says which kind of invalid value was found.
for split_name, split_arrays in (
    ("Training", (X_train, y_train)),
    ("Test", (X_test, y_test)),
):
    if any(np.isnan(arr).any() for arr in split_arrays):
        print(f"{split_name} data contains NaN values.")
    if any(np.isinf(arr).any() for arr in split_arrays):
        print(f"{split_name} data contains inf values.")

In [14]:
# Move the data to the GPU when CUDA is available, otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torch.as_tensor accepts NumPy arrays as well as existing tensors, so this cell
# can be re-run after the names already hold tensors without triggering the
# copy-construct warning that torch.tensor(<tensor>) emits. dtype and device are
# applied in the same call, producing float32 tensors on `device`.
X_train = torch.as_tensor(X_train, dtype=torch.float32, device=device)
y_train = torch.as_tensor(y_train, dtype=torch.float32, device=device)
X_test = torch.as_tensor(X_test, dtype=torch.float32, device=device)
y_test = torch.as_tensor(y_test, dtype=torch.float32, device=device)

In [15]:
# Define the linear regression model
class LinearRegressionModel(torch.nn.Module):
    """Linear regression: one ``torch.nn.Linear`` layer with a single output."""

    def __init__(self, input_dim):
        """Build the layer.

        Args:
            input_dim: number of input features per sample.
        """
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        output = self.linear(x)
        return output

In [17]:
# Initialise the model.
# Seed torch's RNG first so the randomly initialised weights, and therefore the
# training results, are the same on every Restart & Run All.
torch.manual_seed(42)
input_dim = X_train.shape[1]  # number of feature columns
model = LinearRegressionModel(input_dim).to(device)

In [18]:
# Define the loss function and the optimizer
criterion = torch.nn.MSELoss(reduction="mean")  # "mean" is the default reduction
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=0.01,  # lower learning rate
)

In [19]:
# Train the model: every epoch runs one gradient step on the whole of X_train.
epochs = 1000

# Nothing inside the loop changes the module's mode, so switch to training mode once.
model.train()

for epoch in range(epochs):
    optimizer.zero_grad()

    # Forward pass
    predictions = model(X_train)

    # Compute the loss. squeeze(-1) removes only the trailing output dimension,
    # so the predictions keep the same shape as y_train even for a single row.
    loss = criterion(predictions.squeeze(-1), y_train)

    # Backward pass
    loss.backward()

    # Gradient clipping: guard against exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # Update the parameters
    optimizer.step()

    # Report progress every 100 epochs
    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [100/1000], Loss: 7918.7554
Epoch [200/1000], Loss: 7523.8237
Epoch [300/1000], Loss: 7418.2339
Epoch [400/1000], Loss: 7363.1689
Epoch [500/1000], Loss: 7330.1587
Epoch [600/1000], Loss: 7309.8521
Epoch [700/1000], Loss: 7297.1948
Epoch [800/1000], Loss: 7289.1885
Epoch [900/1000], Loss: 7284.0254
Epoch [1000/1000], Loss: 7280.6152


In [20]:
# Evaluate the model on the test split
model.eval()
with torch.no_grad():
    # squeeze(-1) drops only the output dimension, so a single-row test set
    # still yields a 1-D tensor.
    y_pred = model(X_test).squeeze(-1)

# Copy targets and predictions to host NumPy arrays once and reuse them below,
# instead of repeating .cpu().numpy() for every metric.
y_true_np = y_test.cpu().numpy()
y_pred_np = y_pred.cpu().numpy()

mse = mean_squared_error(y_true_np, y_pred_np)
print(f"Mean Squared Error: {mse}")

# "Accuracy": fraction of predictions whose absolute error is at most `threshold`
threshold = 10
accuracy = np.mean(np.abs(y_true_np - y_pred_np) <= threshold)
print(f"Accuracy (within ±{threshold} ranks): {accuracy * 100:.2f}%")

# Show the first 10 predictions next to the actual values
output_df = pd.DataFrame({"Actual Rank": y_true_np[:10], "Predicted Rank": y_pred_np[:10]})
print(output_df)

Mean Squared Error: 9101.1962890625
Accuracy (within ±10 ranks): 13.18%
   Actual Rank  Predicted Rank
0        252.0      222.264008
1        135.0      268.736572
2        562.0      484.814026
3        536.0      591.082581
4        377.0      456.468353
5        911.0      894.368652
6        395.0      594.057434
7        331.0      353.020813
8        452.0      569.058411
9        620.0      573.238098


In [None]:
# Predict the rank of a "new" university (it is not actually new).
# The model was fitted on the columns listed in `features`, so the eight values
# must be supplied in that same column order.
new_university = torch.tensor(
    [[355, 423, 210, 558, 558, 363, 737, 44.77]],
    dtype=torch.float32,
    device=device,
)

# Inference only: use eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    predicted_rank = model(new_university).item()
print("Predicted World Rank for University of Salamanca:", predicted_rank)

Predicted World Rank for University of Salamanca: 573.2380981445312
