In [1]:
import pandas as pd
from pathlib import Path

# Directory layout for this stage; every path is relative to the working directory.
part_path = Path("part-2")
raw_path = part_path / "raw"                # input data files
processed_path = part_path / "processed"    # feature tables written by this notebook
submission_path = part_path / "submission"


In [2]:
# Load the two pair tables that every later cell enriches with features.
df_train = pd.read_csv(part_path / "train.csv")
df_test = pd.read_csv(part_path / "test.csv")

In [3]:
# Every distinct project URL seen on either side of a pair, in train or test,
# listed in order of first appearance.
projects = pd.concat(
    [
        frame[side]
        for frame in (df_train, df_test)
        for side in ("project_a", "project_b")
    ]
).unique().tolist()

In [4]:
# Sanity check: how many distinct projects there are, plus a sample of the raw URLs.
len(projects), projects[:5]

(117,
 ['https://github.com/mochajs/mocha',
  'https://github.com/chzyer/readline',
  'https://github.com/gulpjs/gulp',
  'https://github.com/webpack/webpack',
  'https://github.com/redux-saga/redux-saga'])

In [5]:
def remove_str(projects):
    """Return a new list with 'https://github.com/' removed from each entry.

    Every occurrence of the prefix text is removed, wherever it appears in
    the string. The input iterable is not modified.
    """
    github_prefix = 'https://github.com/'
    stripped = []
    for name in projects:
        stripped.append(name.replace(github_prefix, ''))
    return stripped

def remove_str_df(df):
    """Return a copy of ``df`` with 'https://github.com/' stripped from both project columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'project_a' and 'project_b'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; the caller's
        frame is left untouched, so re-running a cell cannot corrupt it.
    """
    github_prefix = 'https://github.com/'
    result = df.copy()
    for column in ('project_a', 'project_b'):
        # regex=False: treat the prefix as literal text. Older pandas versions
        # default to regex matching, where the '.' would match any character.
        result[column] = result[column].str.replace(github_prefix, '', regex=False)
    return result


In [6]:
def check_na(df):
    """Return the rows of ``df`` that have at least one missing value."""
    missing_per_row = df.isna().sum(axis=1)
    rows_with_missing = missing_per_row > 0
    return df[rows_with_missing]

In [7]:
# remove https://github.com/ from project names
# Applied to both splits and to the project list, so all three use the same
# short project names in the later joins and set comparisons.
df_train = remove_str_df(df_train)
df_test = remove_str_df(df_test)
projects = remove_str(projects)

In [8]:
# Load the GitHub repository metadata; add_repo_matrics selects the columns it needs.
df_projects = pd.read_csv(f"{raw_path}/github-projects.csv")

In [9]:
def add_repo_matrics(df_projects, df):
    """Attach GitHub repository metadata for both projects of every pair.

    Parameters
    ----------
    df_projects : pandas.DataFrame
        Repository table containing 'full_name', 'description', 'created_at',
        'updated_at', 'size', 'has_wiki', 'stargazers_count',
        'watchers_count', 'forks_count', 'open_issues_count' and
        'subscribers_count'. It is not modified.
    df : pandas.DataFrame
        Pair table with 'project_a' and 'project_b' columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined twice with the metadata. The project name is
        lowercased on the metadata side before matching. When ``df`` has no
        columns of the same name, the project_a metadata keeps unsuffixed
        names (e.g. 'stars') and the project_b metadata gets a '_b' suffix
        (e.g. 'stars_b'). Pairs without a match get NaN in the joined columns.
    """
    kept_columns = [
        'full_name', 'description', 'created_at', 'updated_at', 'size', 'has_wiki',
        'stargazers_count', 'watchers_count', 'forks_count',
        'open_issues_count', 'subscribers_count',
    ]
    short_names = {
        'full_name': 'project',
        'stargazers_count': 'stars',
        'watchers_count': 'watchers',
        'forks_count': 'forks',
        'open_issues_count': 'open_issues',
    }
    repo_info = df_projects[kept_columns].rename(columns=short_names)
    # Lowercase only the metadata side of the join key.
    repo_info['project'] = repo_info['project'].str.lower()

    for side in ('a', 'b'):
        df = df.merge(
            repo_info,
            how="left",
            left_on=f"project_{side}",
            right_on="project",
            # The suffix is only applied to columns whose names collide.
            suffixes=('', f'_{side}'),
        ).drop(columns=['project'])
    return df

In [10]:
# Join the repository metadata onto both splits.
df_train = add_repo_matrics(df_projects, df_train)
df_test = add_repo_matrics(df_projects, df_test)

In [11]:
# get github activity data
df_act = pd.read_csv(f"{raw_path}/github-activity.csv")
# Strip the URL prefix as literal text (regex=False), so the result does not
# depend on the pandas version's default for `regex` (as a regex, '.' matches any character).
df_act['repo_url'] = df_act['repo_url'].str.replace('https://github.com/', '', regex=False)

In [12]:
# Projects used in train/test that have no row in the activity data.
missing_values = set(projects) - set(df_act['repo_url'])
missing_values

{'bradfitz/iter',
 'humanwhocodes/object-schema',
 'pnpm/cmd-shim',
 'sheetjs/js-crc32',
 'vweevers/module-error'}

In [13]:
def add_repo_activity(df_act, df):
    """Left-join the GitHub activity table onto ``df`` once per pair side.

    Parameters
    ----------
    df_act : pandas.DataFrame
        Activity table keyed by 'repo_url'; all its other columns are joined.
    df : pandas.DataFrame
        Pair table with 'project_a' and 'project_b' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame. The 'repo_url' key is dropped after each join. Column
        names that collide during the project_b join get a '_b' suffix.
        Projects missing from ``df_act`` get NaN in the joined columns.
    """
    def _join_side(frame, side):
        # One left join keyed on project_<side>; the helper key is dropped after.
        joined = frame.merge(
            df_act,
            how="left",
            left_on=f"project_{side}",
            right_on="repo_url",
            suffixes=('', f'_{side}'),
        )
        return joined.drop(columns=['repo_url'])

    return _join_side(_join_side(df, 'a'), 'b')

In [14]:
# Join the activity data onto both splits.
df_train = add_repo_activity(df_act, df_train)
df_test = add_repo_activity(df_act, df_test)

In [15]:
# Normalise all column names to lowercase on both splits.
df_train.columns = df_train.columns.str.lower()
df_test.columns = df_test.columns.str.lower()

In [16]:
# Show the rows that contain missing values after the metadata and activity joins.
isna_train = check_na(df_train)
isna_test = check_na(df_test)
isna_train, isna_test

(          id           project_a                       project_b  weight_a  \
 398     1113  emotion-js/emotion                   pnpm/cmd-shim  0.490741   
 413     1128  go-task/slim-sprig                   pnpm/cmd-shim  0.063830   
 427     1142       level/levelup                   pnpm/cmd-shim  0.241379   
 440     1155     chzyer/readline                   pnpm/cmd-shim  0.043478   
 452     1167           vuejs/vue                   pnpm/cmd-shim  0.979779   
 ...      ...                 ...                             ...       ...   
 20677  20603     mattn/go-isatty                zloirock/core-js  0.013424   
 20678  20604     mattn/go-isatty               pytest-dev/pytest  0.029344   
 20679  20605     mattn/go-isatty                    clap-rs/clap  0.227586   
 20680  20606     mattn/go-isatty  import-js/eslint-plugin-import  0.268293   
 20681  20607     mattn/go-isatty           webreflection/flatted  0.188356   
 
        weight_b  total_amount_usd          funder

In [17]:
# Replace every remaining NaN with 0, in place, on both splits.
# NOTE(review): this applies to ALL columns, so text/date columns such as
# 'description' and 'created_at' also receive the integer 0 wherever they
# were missing — confirm downstream consumers expect that.
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

In [18]:
def add_dependent_metrics(df_repo, df):
    """Attach dependent-count metrics for both projects of every pair.

    Parameters
    ----------
    df_repo : pandas.DataFrame
        Must contain 'repo_url' (full GitHub URL), 'num_dependents_in_oso'
        and 'oso_dependency_rank'. It is not modified.
    df : pandas.DataFrame
        Pair table with 'project_a' and 'project_b' columns holding project
        names without the 'https://github.com/' prefix.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'num_dependents' and 'dependency_rank' for project_a
        and the '_b'-suffixed equivalents for project_b, provided ``df`` has
        no columns with those names already. Unmatched projects get NaN.
    """
    metrics = df_repo[['repo_url', 'num_dependents_in_oso', 'oso_dependency_rank']].copy()
    # regex=False: strip the prefix as literal text. Older pandas versions
    # default to regex matching, where the '.' would match any character.
    metrics['repo_url'] = metrics['repo_url'].str.replace(
        'https://github.com/', '', regex=False
    )

    # Rename columns
    metrics = metrics.rename(columns={
        'num_dependents_in_oso': 'num_dependents',
        'oso_dependency_rank': 'dependency_rank'
    })

    # One left join per pair side; the helper key is dropped after each join.
    for side in ('a', 'b'):
        df = df.merge(
            metrics,
            left_on=f"project_{side}",
            right_on="repo_url",
            how="left",
            suffixes=('', f'_{side}')
        ).drop(columns=['repo_url'])

    return df

In [19]:
# Load the dependency inputs: per-package dependent counts (df_dependent)
# and per-repository metrics including the list of dependents (df_repo).
df_dependent = pd.read_csv(f"{raw_path}/dependent-metrics.csv")
df_repo = pd.read_csv(f"{raw_path}/repo_metrics_and_metadata.csv")

In [20]:
# Join the dependent-count metrics onto both splits.
df_train = add_dependent_metrics(df_repo, df_train)
df_test = add_dependent_metrics(df_repo, df_test)

In [21]:
# Re-check for missing values after joining the dependent metrics.
isna_train = check_na(df_train)
isna_test = check_na(df_test)
isna_train, isna_test

(Empty DataFrame
 Columns: [id, project_a, project_b, weight_a, weight_b, total_amount_usd, funder, quarter, description, created_at, updated_at, size, has_wiki, stars, watchers, forks, open_issues, subscribers_count, description_b, created_at_b, updated_at_b, size_b, has_wiki_b, stars_b, watchers_b, forks_b, open_issues_b, subscribers_count_b, commit_code, forked, issue_closed, issue_comment, issue_opened, issue_reopened, pull_request_closed, pull_request_merged, pull_request_opened, pull_request_reopened, pull_request_review_comment, release_published, starred, commit_code_b, forked_b, issue_closed_b, issue_comment_b, issue_opened_b, issue_reopened_b, pull_request_closed_b, pull_request_merged_b, pull_request_opened_b, pull_request_reopened_b, pull_request_review_comment_b, release_published_b, starred_b, num_dependents, dependency_rank, num_dependents_b, dependency_rank_b]
 Index: []
 
 [0 rows x 58 columns],
 Empty DataFrame
 Columns: [id, project_a, project_b, total_amount_usd, fu

In [22]:
def _parse_dependents_list(raw):
    """Turn the serialized 'list_of_dependents_in_oso' field into a list of names."""
    if isinstance(raw, (list, tuple)):
        return [str(name) for name in raw]
    if not isinstance(raw, str):
        # Missing value (e.g. NaN from read_csv): treat as no dependents.
        return []
    cleaned = raw.strip().strip("[]")
    for quote in ("'", '"'):
        cleaned = cleaned.replace(quote, "")
    # Accept both whitespace-separated and comma-separated list text, so a
    # trailing comma never stays glued to a package name.
    return cleaned.replace(",", " ").split()


def calculate_v_index(df_dependent, df_repo):
    """
    Calculate V-Index of a software package.

    V-Index is N where N is the number of first-order dependencies that have
    at least N second-order dependencies.

    Parameters
    ----------
    df_dependent : pandas.DataFrame
        One row per package, with 'package_artifact_name' and
        'num_dependents' (the second-order count for that package).
    df_repo : pandas.DataFrame
        One row per repository, with 'repo_url' and
        'list_of_dependents_in_oso' (serialized list of first-order
        dependent package names).

    Returns
    -------
    dict
        Maps each 'repo_url' to its integer V-Index. A repository whose
        dependents field is missing or empty gets 0.
    """
    data = {}

    for _, row in df_repo.iterrows():
        first_order_dependents = _parse_dependents_list(row['list_of_dependents_in_oso'])

        # Second-order counts of the first-order dependents, largest first.
        # Unknown (NaN) counts are dropped: NaN never compares as "< N", so
        # keeping them would wrongly count them as qualifying.
        is_first_order = df_dependent['package_artifact_name'].isin(first_order_dependents)
        second_order_counts = (
            df_dependent.loc[is_first_order, 'num_dependents']
            .dropna()
            .sort_values(ascending=False)
            .tolist()
        )

        # Largest N such that at least N first-order dependents each have
        # >= N second-order dependents. Counts are descending, so the first
        # rank that fails ends the search.
        v_index = 0
        for rank, count in enumerate(second_order_counts, start=1):
            if count < rank:
                break
            v_index = rank

        data[row['repo_url']] = v_index

    return data

In [23]:
# Compute the V-Index per repository and reshape the {repo_url: v_index} dict
# into a two-column frame that can be joined on the project name.
v_index = calculate_v_index(df_dependent, df_repo)
df_v_index = pd.DataFrame.from_dict(v_index, orient='index', columns=['v_index']).reset_index().rename(columns={'index': 'repo_url'})
# Strip the URL prefix as literal text (regex=False), so the result does not
# depend on the pandas version's default for `regex`.
df_v_index['repo_url'] = df_v_index['repo_url'].str.replace('https://github.com/', '', regex=False)

In [24]:
def add_v_index_features(df_v_index, df):
    """
    Left-join the V-Index table onto ``df`` for project_a and project_b.

    ``df_v_index`` is keyed by 'repo_url'; the key is dropped after each
    join. When ``df`` has no 'v_index' column beforehand, the result gains
    'v_index' (project_a) and 'v_index_b' (project_b). Projects absent from
    ``df_v_index`` get NaN. A new frame is returned.
    """
    side_suffixes = {"project_a": "_a", "project_b": "_b"}

    enriched = df
    for key_column, right_suffix in side_suffixes.items():
        enriched = enriched.merge(
            df_v_index,
            how="left",
            left_on=key_column,
            right_on="repo_url",
            suffixes=('', right_suffix),
        ).drop(columns=['repo_url'])

    return enriched

In [25]:
# Join the V-Index onto both splits.
df_train = add_v_index_features(df_v_index, df_train)
df_test = add_v_index_features(df_v_index, df_test)

In [26]:
# Re-check for missing values after joining the V-Index.
isna_train = check_na(df_train)
isna_test = check_na(df_test)
isna_train, isna_test

(Empty DataFrame
 Columns: [id, project_a, project_b, weight_a, weight_b, total_amount_usd, funder, quarter, description, created_at, updated_at, size, has_wiki, stars, watchers, forks, open_issues, subscribers_count, description_b, created_at_b, updated_at_b, size_b, has_wiki_b, stars_b, watchers_b, forks_b, open_issues_b, subscribers_count_b, commit_code, forked, issue_closed, issue_comment, issue_opened, issue_reopened, pull_request_closed, pull_request_merged, pull_request_opened, pull_request_reopened, pull_request_review_comment, release_published, starred, commit_code_b, forked_b, issue_closed_b, issue_comment_b, issue_opened_b, issue_reopened_b, pull_request_closed_b, pull_request_merged_b, pull_request_opened_b, pull_request_reopened_b, pull_request_review_comment_b, release_published_b, starred_b, num_dependents, dependency_rank, num_dependents_b, dependency_rank_b, v_index, v_index_b]
 Index: []
 
 [0 rows x 60 columns],
 Empty DataFrame
 Columns: [id, project_a, project_b, 

In [27]:
def add_feature_ratios(df):
    """Add pairwise share features and V-Index interaction features to ``df``.

    For each metric with a project_a column ``x`` and a project_b column
    ``x_b``, adds ``x / (x + x_b + eps)``, with ``eps = 1e-6`` keeping the
    denominator non-zero when both values are 0. Three product features that
    combine stars with the V-Index are appended last.

    ``df`` is modified in place (new columns are appended) and also returned.
    """
    eps = 1e-6

    # (new column, project_a source column); project_b source is '<source>_b'.
    # The order here is the order in which columns are appended.
    ratio_sources = (
        # repo metrics
        ('stars_ratio', 'stars'),
        ('forks_ratio', 'forks'),
        ('size_ratio', 'size'),
        ('watchers_ratio', 'watchers'),
        ('issues_ratio', 'open_issues'),
        ('subscribers_ratio', 'subscribers_count'),
        # activity (last 6 months)
        ('commits_ratio', 'commit_code'),
        ('forked_ratio', 'forked'),
        ('issue_closed_ratio', 'issue_closed'),
        ('issue_comment_ratio', 'issue_comment'),
        ('issue_opened_ratio', 'issue_opened'),
        ('issue_reopened_ratio', 'issue_reopened'),
        ('pull_request_closed_ratio', 'pull_request_closed'),
        ('pull_request_merged_ratio', 'pull_request_merged'),
        ('pull_request_opened_ratio', 'pull_request_opened'),
        ('pull_request_reopened_ratio', 'pull_request_reopened'),
        ('pull_request_review_comment_ratio', 'pull_request_review_comment'),
        ('release_published_ratio', 'release_published'),
        ('starred_ratio', 'starred'),
        # dependent metrics
        ('num_dependents_ratio', 'num_dependents'),
        # v_index
        ('v_index_ratio', 'v_index'),
    )
    for ratio_name, source in ratio_sources:
        own = df[source]
        other = df[f'{source}_b']
        df[ratio_name] = own / (own + other + eps)

    # Products of stars and V-Index, per side and on the ratios.
    df['stars_intersection_v_index'] = df['stars'] * (df['v_index'] + eps)
    df['stars_b_intersection_v_index_b'] = df['stars_b'] * (df['v_index_b'] + eps)
    df['stars_ratio_intersection_v_index_ratio'] = df['stars_ratio'] * (df['v_index_ratio'] + eps)

    return df

In [28]:
# Add the ratio and interaction features to both splits.
df_train = add_feature_ratios(df_train)
df_test = add_feature_ratios(df_test)

In [29]:
# Final check for missing values after adding the ratio features.
isna_train = check_na(df_train)
isna_test = check_na(df_test)
isna_train, isna_test

(Empty DataFrame
 Columns: [id, project_a, project_b, weight_a, weight_b, total_amount_usd, funder, quarter, description, created_at, updated_at, size, has_wiki, stars, watchers, forks, open_issues, subscribers_count, description_b, created_at_b, updated_at_b, size_b, has_wiki_b, stars_b, watchers_b, forks_b, open_issues_b, subscribers_count_b, commit_code, forked, issue_closed, issue_comment, issue_opened, issue_reopened, pull_request_closed, pull_request_merged, pull_request_opened, pull_request_reopened, pull_request_review_comment, release_published, starred, commit_code_b, forked_b, issue_closed_b, issue_comment_b, issue_opened_b, issue_reopened_b, pull_request_closed_b, pull_request_merged_b, pull_request_opened_b, pull_request_reopened_b, pull_request_review_comment_b, release_published_b, starred_b, num_dependents, dependency_rank, num_dependents_b, dependency_rank_b, v_index, v_index_b, stars_ratio, forks_ratio, size_ratio, watchers_ratio, issues_ratio, subscribers_ratio, comm

In [30]:
# Persist the engineered feature tables.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
processed_path.mkdir(parents=True, exist_ok=True)
df_train.to_csv(f"{processed_path}/train-pre-embeddings.csv", index=False)
df_test.to_csv(f"{processed_path}/test-pre-embeddings.csv", index=False)
