In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

TIME_COLUMN = "Execution time series"

def read_dataset(filename):
    """Load the repository dataset from CSV and prepare it for benchmarking.

    The first CSV column is used as the index. Rows are de-duplicated on
    ``Repository`` (first occurrence kept) and then restricted to
    repositories with at least 100 commits, a non-missing ``Language`` and a
    ``Size`` of at most 1000000. Finally the ``TIME_COLUMN`` (object dtype)
    and ``Bus Factor`` (int dtype) columns are set to 0 for every remaining
    row, overwriting any values read from the file.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A new, independent DataFrame containing the filtered rows.
    """
    df = pd.read_csv(filename, index_col=0)
    for column in ["Stars", "Size", "Mentionable Users", "Commits"]:
        df[column] = df[column].astype('int')

    df = df.drop_duplicates(subset=['Repository'])

    # `Language != None` inside DataFrame.query is an element-wise comparison
    # that is True for every row (missing values included), so it never
    # filtered anything. notna() actually removes rows without a language.
    keep = (df['Commits'] >= 100) & df['Language'].notna()
    # Repos less or equal 1Gb size (presumably Size is in KB - TODO confirm)
    keep &= df['Size'] <= 1000000
    # Copy so the column assignments below act on an independent frame
    # rather than on a slice of the frame that was read from disk.
    df = df[keep].copy()

    # Add columns
    df[TIME_COLUMN] = pd.Series(0, index=df.index, dtype='object')
    df["Bus Factor"] = pd.Series(0, index=df.index, dtype='int')
    return df

# Load the filtered repository dataset and display summary statistics for it.
df = read_dataset("data/dataset.csv")
df.describe()

Unnamed: 0,Stars,Size,Mentionable Users,Commits,Bus Factor,Memory
count,908.0,908.0,908.0,908.0,908.0,908.0
mean,29422.636564,134879.270925,517.128855,1788.287445,0.0,0.0
std,26674.378773,189029.243041,970.831351,4548.024352,0.0,0.0
min,8925.0,517.0,1.0,100.0,0.0,0.0
25%,16168.0,17433.25,122.75,274.75,0.0,0.0
50%,21460.5,57482.0,249.0,704.5,0.0,0.0
75%,32524.5,165092.25,523.25,1815.0,0.0,0.0
max,367200.0,998104.0,10000.0,100489.0,0.0,0.0


In [5]:
# Merge datasets if needed
def merge(df, filename):
    """Copy previously computed results from a CSV file into ``df``.

    For every row of the CSV whose ``Bus Factor`` is not 0, the rows of
    ``df`` with the same ``Repository`` receive that row's ``TIME_COLUMN``
    and ``Bus Factor`` values.

    Args:
        df: DataFrame to update; it is modified in place.
        filename: CSV file with earlier results (first column is the index).

    Returns:
        The same ``df`` object that was passed in.
    """
    previous = pd.read_csv(filename, index_col=0)
    for _, past in previous.iterrows():
        # A Bus Factor of 0 marks a repository that has no stored result.
        if past["Bus Factor"] == 0:
            continue
        targets = df.index[df['Repository'] == past["Repository"]]
        df.loc[targets, TIME_COLUMN] = past[TIME_COLUMN]
        df.loc[targets, 'Bus Factor'] = past["Bus Factor"]
    return df

# Pull in earlier results, discard the Memory column, then spot-check one repository.
df = merge(df, "data/result.csv").drop(columns='Memory')
df[df['Repository'] == "flutter/flutter"]

Unnamed: 0,Repository,Clone URL,Stars,Size,Mentionable Users,Commits,Language,Bus Factor,Execution time series
5,flutter/flutter,https://github.com/flutter/flutter.git,153561,259459,1277,8307,Dart,10,"[15543, 16065, 15605, 15026, 14761, 14981, 150..."


In [None]:
import pandas as pd
import statistics
import requests
import time
import json

API_BASE_PATH = "http://localhost:8080"
ARTIFACTS_BASE_PATH = "../compose-workdir/artifacts/"
RUNS_PER_REPOSITORY = 10   # number of measurements collected per repository
POLL_INTERVAL_SECONDS = 5  # pause between two polls of the task events

cnt = 0
failed = []

# Snapshot the rows and their count up front: failed repositories are removed
# from `df` inside the loop, which must change neither what is iterated nor
# the denominator of the progress counter.
rows_to_process = list(df.iterrows())
total = len(rows_to_process)

for cnt, (index, row) in enumerate(rows_to_process, start=1):
    repository = row["Repository"]
    task_is_failed = False
    elapsedMillis = []
    busFactor = None

    print("=" * 12)
    print(f"[{cnt}/{total}] " + repository)
    print()
    if repository in failed:
        print("Not supported")
        continue
    if row["Bus Factor"] != 0:
        print("Already calculated")
        continue

    # The payload and artifact path are the same for every run of a repository.
    owner, repo = repository.split("/")[:2]
    payload = {
        "cloneUrl": row["Clone URL"],
        "owner": owner,
        "repo": repo,
    }
    metrics_path = ARTIFACTS_BASE_PATH + repository + "/metrics.json"

    for _ in range(RUNS_PER_REPOSITORY):
        # NOTE(review): no timeout is set and the HTTP status is not checked;
        # an unresponsive service blocks this cell indefinitely - confirm
        # whether that is acceptable.
        r = requests.post(API_BASE_PATH + "/api/task/submit", json=payload)
        print(r.text + ": " + repository)

        # Poll the events until one reports FAILED or DONE.
        task_is_running = True
        while task_is_running:
            time.sleep(POLL_INTERVAL_SECONDS)
            status = requests.get(API_BASE_PATH + "/api/task/events")
            for msg in status.json():
                print(msg)
                if msg["status"] == "FAILED":
                    task_is_running = False
                    task_is_failed = True
                if msg["status"] == "DONE":
                    task_is_running = False

        if task_is_failed:
            break

        # Read the metrics file found under the artifacts directory for this repository.
        with open(metrics_path, "r") as file:
            metrics = json.load(file)
        busFactor = int(metrics["busFactor"])
        elapsedMillis.append(int(metrics["elapsedMillis"]))

    if task_is_failed:
        # Remember the failure and remove the repository from the results.
        failed.append(repository)
        df = df.drop(index)
        continue

    df.at[index, TIME_COLUMN] = elapsedMillis
    df.at[index, "Bus Factor"] = busFactor


In [6]:
import datetime

# The file name ends in "Z" (the UTC designator), so take the timestamp in UTC
# rather than local time.
now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Let pandas write the file itself: it opens the path with the right newline
# handling and encoding, which writing the to_csv() string through a text-mode
# handle does not guarantee on every platform.
df.to_csv("data/result_" + now + ".csv")


In [None]:
# Report how many repositories failed (and which ones), then display the results frame.
print(len(failed), failed)
df