In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import os
import pandas as pd
import numpy as np
import requests
import time

In [4]:
def remove_duplicate_links(input_csv, output_csv):
    # 读取原始 CSV 文件
    df = pd.read_csv(input_csv)
    
    # 提取 project_a 和 project_b 列的所有链接
    links_a = df['project_a'].tolist()
    links_b = df['project_b'].tolist()

    # 合并所有链接，去重
    all_links = list(set(links_a + links_b))

    # 将结果保存为新的 CSV 文件
    result_df = pd.DataFrame(all_links, columns=['project_url'])
    result_df.to_csv(output_csv, index=False)
    print(f"Processed {len(all_links)} unique project URLs and saved to {output_csv}")



Processed 117 unique project URLs and saved to unique_urls.csv


In [None]:
# 示例使用
input_file = 'all.csv'  # 输入文件路径
output_file = 'urls_unique.csv'  # 输出文件路径
remove_duplicate_links(input_file, output_file)

In [None]:
# 获取 GitHub 项目特征（stars、forks、open issues、open pull requests）
def get_github_features(project_url, headers):
    """
    获取 GitHub 项目的特征：stars、forks、open issues 和 open pull requests
    返回：stars, forks, open_issues, open_pulls
    """
    repo_name = project_url.split('/')[-1]
    owner_name = project_url.split('/')[-2]

    # GitHub API 请求 URL
    repo_url = f"https://api.github.com/repos/{owner_name}/{repo_name}"
    issues_url = f"https://api.github.com/repos/{owner_name}/{repo_name}/issues?state=open"
    pulls_url = f"https://api.github.com/repos/{owner_name}/{repo_name}/pulls?state=open"

    # 请求项目的基本信息
    response = requests.get(repo_url, headers=headers)
    if response.status_code != 200:
        return None, None, None, None  # 请求失败时返回 None

    data = response.json()

    stars = data.get('stargazers_count', 0)  # 获取 stars 数量
    forks = data.get('forks_count', 0)      # 获取 forks 数量

    # 请求 open issues 和 open pull requests
    open_issues_response = requests.get(issues_url, headers=headers)
    open_pulls_response = requests.get(pulls_url, headers=headers)

    if open_issues_response.status_code != 200 or open_pulls_response.status_code != 200:
        return stars, forks, None, None  # 获取 issues 和 pull requests 失败时返回 None

    open_issues = len(open_issues_response.json())  # 获取 open issues 数量
    open_pulls = len(open_pulls_response.json())    # 获取 open pull requests 数量

    return stars, forks, open_issues, open_pulls

In [12]:
# 处理 urls_unique 中的项目，获取其 GitHub 信息并保存到新的 CSV 文件
def update_github_info(input_csv, output_csv, access_token):
    """
    更新 CSV 数据，添加 stars、forks、open issues 和 open pull requests 信息
    """
    # 读取原始数据
    df = pd.read_csv(input_csv)

    # 设置 GitHub 请求头
    headers = {'Authorization': f'token {access_token}'}

    # 初始化新的列
    stars = []
    forks = []
    open_issues = []
    open_pulls = []

    # 遍历每个项目 URL，获取其 GitHub 信息
    for i, row in df.iterrows():
        project_url = row['project_url']  # 获取 project_url

        # 获取 GitHub 项目的特征
        stars_val, forks_val, open_issues_val, open_pulls_val = get_github_features(project_url, headers)

        # 将结果添加到列表中
        stars.append(stars_val)
        forks.append(forks_val)
        open_issues.append(open_issues_val)
        open_pulls.append(open_pulls_val)

        # 每处理完一行就保存一次
        df.at[i, 'stars'] = stars_val
        df.at[i, 'forks'] = forks_val
        df.at[i, 'open_issues'] = open_issues_val
        df.at[i, 'open_pulls'] = open_pulls_val

        # 每行数据处理完后保存到输出文件
        df.to_csv(output_csv, index=False)
        print(f"Processed row {i + 1} and saved to {output_csv}")

    print(f"Finished processing. Data saved to {output_csv}")

In [14]:
input_file = 'unique_urls.csv'  # 输入文件路径，包含 GitHub 项目 URL
output_file = 'urls_cache.csv'  # 输出文件路径
access_token = "xxx" # GitHub 访问令牌 需要自行设置
update_github_info(input_file, output_file, access_token)

Processed row 1 and saved to urls_cache.csv
Processed row 2 and saved to urls_cache.csv
Processed row 3 and saved to urls_cache.csv
Processed row 4 and saved to urls_cache.csv
Processed row 5 and saved to urls_cache.csv
Processed row 6 and saved to urls_cache.csv
Processed row 7 and saved to urls_cache.csv
Processed row 8 and saved to urls_cache.csv
Processed row 9 and saved to urls_cache.csv
Processed row 10 and saved to urls_cache.csv
Processed row 11 and saved to urls_cache.csv
Processed row 12 and saved to urls_cache.csv
Processed row 13 and saved to urls_cache.csv
Processed row 14 and saved to urls_cache.csv
Processed row 15 and saved to urls_cache.csv
Processed row 16 and saved to urls_cache.csv
Processed row 17 and saved to urls_cache.csv
Processed row 18 and saved to urls_cache.csv
Processed row 19 and saved to urls_cache.csv
Processed row 20 and saved to urls_cache.csv
Processed row 21 and saved to urls_cache.csv
Processed row 22 and saved to urls_cache.csv
Processed row 23 an

In [None]:
# 函数：根据 project_url 获取相关特征
def get_project_features(dataset_file,project_url):
    # 读取 dataset.csv
    dataset_df = pd.read_csv(dataset_file)
    # 读取 urls_cache.csv
    cache_df = pd.read_csv(cache_file)
    # 将 urls_cache.csv 转换为字典，方便根据 project_url 查找特征
    cache_dict = cache_df.set_index('project_url').to_dict(orient='index')
    if project_url in cache_dict:
        return cache_dict[project_url]
    else:
        return None  # 如果没有找到该 URL 对应的信息，返回 None
    
def add_github_features(dataset_file, cache_file, output_file):
    # 为 dataset.csv 添加新的列
    stars_a, forks_a, open_issues_a, open_pulls_a = [], [], [], []
    stars_b, forks_b, open_issues_b, open_pulls_b = [], [], [], []

    # 读取 dataset.csv
    dataset_df = pd.read_csv(dataset_file)
    
    # 遍历 dataset.csv 中的每一行，添加相应的特征
    for _, row in dataset_df.iterrows():
        # 获取 project_a 和 project_b 的特征
        features_a = get_project_features(dataset_file, row['project_a'])
        features_b = get_project_features(dataset_file, row['project_b'])

        if features_a:
            stars_a.append(features_a['stars'])
            forks_a.append(features_a['forks'])
            open_issues_a.append(features_a['open_issues'])
            open_pulls_a.append(features_a['open_pulls'])
        else:
            stars_a.append(None)
            forks_a.append(None)
            open_issues_a.append(None)
            open_pulls_a.append(None)

        if features_b:
            stars_b.append(features_b['stars'])
            forks_b.append(features_b['forks'])
            open_issues_b.append(features_b['open_issues'])
            open_pulls_b.append(features_b['open_pulls'])
        else:
            stars_b.append(None)
            forks_b.append(None)
            open_issues_b.append(None)
            open_pulls_b.append(None)

    # 将新列添加到 dataset_df 中
    dataset_df['stars_a'] = stars_a
    dataset_df['forks_a'] = forks_a
    dataset_df['open_issues_a'] = open_issues_a
    dataset_df['open_pulls_a'] = open_pulls_a
    dataset_df['stars_b'] = stars_b
    dataset_df['forks_b'] = forks_b
    dataset_df['open_issues_b'] = open_issues_b
    dataset_df['open_pulls_b'] = open_pulls_b

    # 将更新后的 DataFrame 保存到新的 CSV 文件
    dataset_df.to_csv(output_file, index=False)
    print(f"Updated data saved to {output_file}")


In [None]:
# 示例使用
dataset_file = 'dataset.csv'  # 输入文件路径，包含项目 URL 和权重
cache_file = 'urls_cache.csv'  # 输入文件路径，包含 GitHub 项目的特征
output_file = 'dataset_update.csv'  # 输出文件路径，保存更新后的数据
add_github_features(dataset_file, cache_file, output_file)

In [3]:
# Step 1: 数据加载与预处理
def load_and_preprocess_data(file_path):
    # 加载数据
    df = pd.read_csv(file_path)

    # id,project_a,project_b,weight_a,weight_b,stars_a,forks_a,open_issues_a,open_pulls_a,stars_b,forks_b,open_issues_b,open_pulls_b
    
    # 提取特征和目标
    features = df[['stars_a', 'forks_a', 'open_issues_a', 'open_pulls_a', 'stars_b', 'forks_b', 'open_issues_b', 'open_pulls_b']].values
    targets = df[['weight_a']].values

    # 特征标准化
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    return features_scaled, targets

In [4]:
# Step 2: 构建神经网络模型
class FundingPredictionModel(nn.Module):
    def __init__(self, input_dim):
        super(FundingPredictionModel, self).__init__()
        # 定义神经网络结构
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.pool(F.relu(self.fc2(x)), 2)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [16]:
class FundingPredictionModel(nn.Module):
    def __init__(self, input_dim):
        super(FundingPredictionModel, self).__init__()
        # 定义神经网络结构
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.dropout = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(32 * (input_dim // 4), 128)  # 输入维度减半后
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
    def forward(self, x):
        x = x.unsqueeze(1)# ([1909, 1, 8])
        x = self.conv1(x) # [1909, 16, 8]
        x = F.relu(x)
        x = self.pool(x) # [1909, 16, 4]
        
        x = self.conv2(x)# [1909, 32, 4]
        x = F.relu(x)
        x = self.pool(x) # [1909, 32, 2]
        
        x = x.view(x.size(0), -1)# [1909, 64]
        x = self.dropout(x)
        x = F.relu(self.fc1(x)) # [1909, 128]
        x = F.relu(self.fc2(x)) # [1909, 64]
        x = F.relu(self.fc3(x)) # [1909, 32]
        x = self.fc4(x) # [1909, 1]
        return x

In [6]:
# Step 3: 模型训练
def train_model(model, X_train, y_train, X_val, y_val, epochs=100, batch_size=32, lr=0.001):
    # 损失函数与优化器
    criterion = nn.MSELoss()  # 使用均方误差损失函数
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # 将数据转换为 Tensor
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

    # 训练过程
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        # 前向传播
        outputs = model(X_train_tensor)
        # 计算损失
        loss = criterion(outputs, y_train_tensor)
        # 反向传播
        loss.backward()
        optimizer.step()

        # 每十分之一个epoch打印一次损失
        if (epoch + 1) % (epochs//10) == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
            # 每10个epoch验证
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val_tensor)
                val_loss = mean_squared_error(y_val_tensor.numpy(), val_outputs.numpy())
                print(f"Validation MSE at epoch {epoch+1}: {val_loss:.4f}")

    return model

In [7]:

# Step 4: 模型评估
def evaluate_model(model, X_test):
    model.eval()
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    with torch.no_grad():
        predictions  = model(X_test_tensor).squeeze()

    submission = pd.DataFrame({'id': test_data['id'], 'pred': predictions.numpy()})
    submission.to_csv('sample_submission.csv', index=False, float_format='%.11f')
    print("Predictions saved to sample_submission.csv")

In [None]:
device= "cpu"
if torch.cuda.is_available():
    device = "cuda:3"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device:{device}")

device_type = "cuda" if device.startswith("cuda") else "cpu"
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Step 1: 数据加载与预处理
file_path = 'dataset_update.csv'  # 数据集文件路径
X, y = load_and_preprocess_data(file_path)

# Step 2: 划分数据集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: 模型初始化
input_dim = X_train.shape[1]  # 特征数量
print(f"Input dimension: {input_dim}")
model = FundingPredictionModel(input_dim)

# Step 4: 训练模型
trained_model = train_model(model, X_train, y_train, X_val, y_val, epochs=600, batch_size=512)

# Step 5: 评估模型
# 读取测试数据
test_data = pd.read_csv('test.csv')
cache_file = 'urls_cache.csv'  # 输入文件路径，包含 GitHub 项目的特征
test_data[['stars_a', 'forks_a', 'open_issues_a', 'open_pulls_a']] = test_data['project_a'].apply(lambda x: pd.Series(get_project_features(cache_file,x)))
test_data[['stars_b', 'forks_b', 'open_issues_b', 'open_pulls_b']] = test_data['project_b'].apply(lambda x: pd.Series(get_project_features(cache_file,x)))

# 选择特征
features = ['stars_a', 'forks_a', 'open_issues_a', 'open_pulls_a', 
            'stars_b', 'forks_b', 'open_issues_b', 'open_pulls_b']
# 标准化测试数据
X_test = test_data[features].values
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)
print("Evaluating model on test data...")
evaluate_model(trained_model, X_test_scaled)

using device:cpu
Input dimension: 8
Epoch [60/600], Loss: 0.1155
Validation MSE at epoch 60: 0.1180
Epoch [120/600], Loss: 0.1098
Validation MSE at epoch 120: 0.1114
Epoch [180/600], Loss: 0.1018
Validation MSE at epoch 180: 0.1056
Epoch [240/600], Loss: 0.0953
Validation MSE at epoch 240: 0.0999
Epoch [300/600], Loss: 0.0893
Validation MSE at epoch 300: 0.0945
Epoch [360/600], Loss: 0.0841
Validation MSE at epoch 360: 0.0896
Epoch [420/600], Loss: 0.0795
Validation MSE at epoch 420: 0.0873
Epoch [480/600], Loss: 0.0766
Validation MSE at epoch 480: 0.0852
Epoch [540/600], Loss: 0.0733
Validation MSE at epoch 540: 0.0825
Epoch [600/600], Loss: 0.0682
Validation MSE at epoch 600: 0.0823
Evaluating model on test data...
Predictions saved to sample_submission.csv


In [8]:
# 实例使用
test_data = pd.read_csv('test.csv')
cache_file = 'urls_cache.csv'  # 输入文件路径，包含 GitHub 项目的特征
test_data[['stars_a', 'forks_a', 'open_issues_a', 'open_pulls_a']] = test_data['project_a'].apply(lambda x: pd.Series(get_project_features(cache_file, x)))
print(test_data[['stars_a', 'forks_a', 'open_issues_a', 'open_pulls_a']])

      stars_a  forks_a  open_issues_a  open_pulls_a
0       731.0     75.0           18.0           9.0
1       731.0     75.0           18.0           9.0
2       731.0     75.0           18.0           9.0
3       731.0     75.0           18.0           9.0
4      3499.0   1045.0           30.0          30.0
...       ...      ...            ...           ...
1018    181.0     32.0           30.0          14.0
1019    181.0     32.0           30.0          14.0
1020   1466.0    478.0           30.0          14.0
1021   1466.0    478.0           30.0          14.0
1022   3199.0   1166.0           30.0          30.0

[1023 rows x 4 columns]
