In [53]:
from dotenv import load_dotenv
load_dotenv()
from typing import Dict, List
import polars as pl
import httpx
import os

In [54]:
repository_url = (
    "https://raw.githubusercontent.com/deepfunding/mini-contest/refs/heads/main/"
)

df_train = pl.read_csv(f"{repository_url}/dataset.csv")
df_test = pl.read_csv(f"{repository_url}/test.csv")

# Light preprocessing to get project IDs instead of full URLs
df_train = df_train.with_columns(
    pl.col("project_a").str.split("github.com/").list.last().alias("project_a"),
    pl.col("project_b").str.split("github.com/").list.last().alias("project_b"),
)
df_test = df_test.with_columns(
    pl.col("project_a").str.split("github.com/").list.last().alias("project_a"),
    pl.col("project_b").str.split("github.com/").list.last().alias("project_b"),
)

In [55]:
df_train = pl.concat(
    [
        df_train,
        df_train.select(
            "id",
            pl.col("project_b").alias("project_a"),
            pl.col("project_a").alias("project_b"),
            pl.col("weight_b").alias("weight_a"),
            pl.col("weight_a").alias("weight_b"),
        ),
    ]
)

In [56]:
def get_repository_info(repository_id: str, client: httpx.Client) -> Dict:
    """
    Fetch repository information from GitHub API for a given repo URL.
    """
    api_url = f"https://api.github.com/repos/{repository_id}"

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    try:
        response = client.get(api_url, headers=headers)
        response.raise_for_status()
        return response.json()
    except httpx.HTTPError:
        print(f"Error fetching data for {repository_id}")
        print(response.text)
        return {}

In [57]:
def get_projects_info(projects: List[str]) -> pl.DataFrame:
    """
    Fetch project information from GitHub API for a list of project IDs and return as a Polars DataFrame.
    """
    data = []
    with httpx.Client(
        transport=httpx.HTTPTransport(retries=5, verify=False),
        follow_redirects=True,
        limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
    ) as client:
        for project_id in projects:
            info = get_repository_info(project_id, client)
            if info:
                data.append(info)

    df = pl.DataFrame(data)
    return df


In [58]:
projects = (
    pl.concat(
        [
            df_train.get_column("project_a"),
            df_train.get_column("project_b"),
            df_test.get_column("project_a"),
            df_test.get_column("project_b"),
        ]
    )
    .unique()
    .to_list()
)

In [59]:
df_projects = get_projects_info(projects)

In [60]:
def add_github_projects_data(
    df: pl.DataFrame, df_projects: pl.DataFrame
) -> pl.DataFrame:
    """
    Add GitHub projects data to both projects in the DataFrame.
    """

    df_projects = df_projects.select(
        pl.col("full_name").str.to_lowercase().alias("project_id"),
        pl.col("created_at"),
        pl.col("updated_at"),
        pl.col("size"),
        pl.col("stargazers_count").alias("stars"),
        pl.col("watchers_count").alias("watchers"),
        pl.col("forks_count").alias("forks"),
        pl.col("open_issues_count").alias("open_issues"),
        pl.col("subscribers_count"),
    )

    df = df.join(
        df_projects,
        left_on="project_a",
        right_on="project_id",
        how="left",
        suffix="_a",
    )

    df = df.join(
        df_projects,
        left_on="project_b",
        right_on="project_id",
        how="left",
        suffix="_b",
    )

    return df

In [61]:
def extract_ratio_features(df: pl.DataFrame) -> pl.DataFrame:
    """
    Extract ratio-based features from repository data.
    """
    features = df.clone()

    # Basic ratios
    features = features.with_columns(
        [
            (pl.col("stars") / (pl.col("stars") + pl.col("stars_b"))).alias(
                "stars_ratio"
            ),
            (pl.col("watchers") / (pl.col("watchers") + pl.col("watchers_b"))).alias(
                "watchers_ratio"
            ),
            (pl.col("forks") / (pl.col("forks") + pl.col("forks_b"))).alias(
                "forks_ratio"
            ),
            (pl.col("size") / (pl.col("size") + pl.col("size_b"))).alias("size_ratio"),
            (
                pl.col("open_issues")
                / (pl.col("open_issues") + pl.col("open_issues_b"))
            ).alias("issues_ratio"),
            (
                pl.col("subscribers_count")
                / (pl.col("subscribers_count") + pl.col("subscribers_count_b"))
            ).alias("subscribers_count_ratio"),
        ]
    )

    return features

In [62]:
from datetime import datetime
import numpy as np

def extract_temporal_features(df: pl.DataFrame) -> pl.DataFrame:
    """
    Extract temporal features from repository data.
    """
    features = df.clone()

    if "created_at" in features.columns and "updated_at" in features.columns:
        features = features.with_columns(
            [
                pl.col("created_at")
                .str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%SZ")
                .alias("created_dt"),
                pl.col("updated_at")
                .str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%SZ")
                .alias("updated_dt"),
                pl.col("created_at_b")
                .str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%SZ")
                .alias("created_dt_b"),
                pl.col("updated_at_b")
                .str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%SZ")
                .alias("updated_dt_b"),
            ]
        )

        # Calculate days since last update
        now = pl.lit(datetime.now())
        features = features.with_columns(
            [
                ((now - pl.col("updated_dt")).dt.total_days()).alias(
                    "days_since_update"
                ),
                ((now - pl.col("updated_dt_b")).dt.total_days()).alias(
                    "days_since_update_b"
                ),
                (
                    (
                        pl.col("updated_dt").cast(pl.Int64)
                        - pl.col("created_dt").cast(pl.Int64)
                    )
                    / (24 * 3600)
                ).alias("age_days"),
                (
                    (
                        pl.col("updated_dt_b").cast(pl.Int64)
                        - pl.col("created_dt_b").cast(pl.Int64)
                    )
                    / (24 * 3600)
                ).alias("age_days_b"),
            ]
        )

    return features


In [63]:
def calculate_v_index(df_dependent: pl.DataFrame, df_repo: pl.DataFrame) -> Dict[str, int]:
  """
  Calculate V-Index of a software package.
  V-Index is N where N is the number of first-order dependencies that have
  at least N second-order dependencies.
  """
  data = {}
  for i in range(len(df_repo)):
    repo_url = df_repo['repo_url'][i]
    dependents = df_repo['list_of_dependents_in_oso'][i]

    data[repo_url] = 0

    # Filter the DataFrame for rows where 'package_artifact_name' is in the 'dependents' list
    df_dependents = df_dependent.filter(df_dependent['package_artifact_name'].is_in(dependents))

    # Count the number of rows in the filtered DataFrame
    first_order_counts = df_dependents.height  # Alternatively, df_dependents.shape[0]

    # Get the list of values in the 'num_dependents' column
    second_order_counts = df_dependents['num_dependents'].to_list()

    for j in range(first_order_counts):
      if j + 1 > second_order_counts[j]:
        data[repo_url] = j
  return data

In [64]:
df_dependent = pl.read_csv("data/dependent-metrics.csv")
df_repo = pl.read_parquet("data/repo_metrics_and_metadata.parquet")

In [16]:
def add_v_index_features(df: pl.DataFrame, df_v_index: pl.DataFrame) -> pl.DataFrame:
    """
    Add v_index to the DataFrame.
    """
    df = df.join(
        df_v_index,
        left_on="project_a",
        right_on="repo_url",
        how="left",
        suffix="_a",
    )
    df = df.join(
        df_v_index,
        left_on="project_b",
        right_on="repo_url",
        how="left",
        suffix="_b",
    )
    
    eps = 1e-6
    df = df.with_columns(
        (pl.col("v_index") / (pl.col("v_index") + pl.col("v_index_b") + eps)).alias("v_index_ratio"),
    )
    
    df = df.with_columns(
        (pl.col("stars") * (pl.col("v_index") + eps)).alias("stars_intersection_v_index"),
        (pl.col("stars_b") * (pl.col("v_index_b") + eps)).alias("stars_b_intersection_v_index_b"),
        (pl.col("stars_ratio") * (pl.col("v_index_ratio") + eps)).alias("stars_ratio_intersection_v_index_ratio"),
    )

    return df

In [17]:
df_train_full = add_github_projects_data(df_train, df_projects)
df_train_full = extract_ratio_features(df_train_full)
df_train_full = extract_temporal_features(df_train_full)

df_test_full = add_github_projects_data(df_test, df_projects)
df_test_full = extract_ratio_features(df_test_full)
df_test_full = extract_temporal_features(df_test_full)

In [18]:
v_index = calculate_v_index(df_dependent, df_repo)

In [19]:
df_v_index = pl.DataFrame({"repo_url": list(v_index.keys()), "v_index": list(v_index.values())})
df_v_index = df_v_index.with_columns(
        pl.col("repo_url").str.split("github.com/").list.last().alias("repo_url"),
    )

In [20]:
df_train_full = add_v_index_features(df_train_full, df_v_index)
df_test_full = add_v_index_features(df_test_full, df_v_index)

In [21]:
len(df_train_full)

4774

In [30]:
features = [
   "age_days", 
   "age_days_b", 
   "days_since_update", 
   "days_since_update_b",    
   "stars", 
   "stars_b", 
   "stars_ratio", 
   "forks", 
   "forks_b", 
   "forks_ratio", 
   "open_issues", 
   "open_issues_b", 
   "issues_ratio",
   "size", 
   "size_b", 
   "size_ratio", 
   "subscribers_count", 
   "subscribers_count_b",  
   "subscribers_count_ratio",
   "v_index", 
   "v_index_b", 
   "v_index_ratio", 
   "stars_intersection_v_index",
   "stars_b_intersection_v_index_b",
   "stars_ratio_intersection_v_index_ratio",
]

In [31]:
X = df_train_full.select(features).to_numpy()

y = df_train_full.get_column("weight_a").to_numpy()

In [32]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import KFold

lgb_train_data = lgb.Dataset(X, label=y)

# Define parameters
params = {
    "objective": "regression",
    "metric": "mse",
    "force_col_wise": True,
    "num_leaves": 100,
}

# Perform 5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create training and validation datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # Train model
    model = lgb.train(params, train_data, valid_sets=[val_data])

    # Make predictions and calculate MSE
    y_pred = model.predict(X_val)
    mse = np.mean((y_val - y_pred) ** 2)
    cv_scores.append(mse)

# Calculate mean and std of MSE scores
cv_scores = np.array(cv_scores)
mean_mse = cv_scores.mean()
std_mse = cv_scores.std()

print(f"Cross-validation MSE: {mean_mse:.4f} (+/- {std_mse:.4f})")

[LightGBM] [Info] Total Bins 3470
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 25
[LightGBM] [Info] Start training from score 0.495267
[LightGBM] [Info] Total Bins 3469
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 25
[LightGBM] [Info] Start training from score 0.503440
[LightGBM] [Info] Total Bins 3469
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 25
[LightGBM] [Info] Start training from score 0.494855
[LightGBM] [Info] Total Bins 3470
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 25
[LightGBM] [Info] Start training from score 0.505230
[LightGBM] [Info] Total Bins 3470
[LightGBM] [Info] Number of data points in the train set: 3820, number of used features: 25
[LightGBM] [Info] Start training from score 0.501208
Cross-validation MSE: 0.0191 (+/- 0.0011)


In [33]:
# Train model on the entire dataset
model = lgb.train(
    params,
    lgb_train_data,
)

[LightGBM] [Info] Total Bins 3470
[LightGBM] [Info] Number of data points in the train set: 4774, number of used features: 25
[LightGBM] [Info] Start training from score 0.500000


In [34]:
X_test = df_test_full.select(features).to_numpy()

lgb_test_data = lgb.Dataset(X_test)

test_predictions = model.predict(X_test)
test_predictions = pl.Series(test_predictions.tolist()).round(6).clip(0)

In [35]:
importance = model.feature_importance()

feature_importance = pl.DataFrame({"feature": features, "importance": importance.tolist()}).sort(
    "importance", descending=True
)

feature_importance.plot.bar(x="importance", y="feature")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.scatterplot(x='weight_a', y='v_index_ratio', data=df_train_full)
plt.title('Scatter Plot')
plt.show()

In [37]:
import datetime
df_test.select(pl.col("id"), pl.Series(test_predictions).alias("pred")).write_csv(
    f"data/submission_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}-mse_{mean_mse:.6f}.csv"
)
