In [9]:
from pprint import pprint
from functools import reduce
import brotli
import pickle


with open("./dump/dbeaver_dbeaver/commits.szhat".replace("\\","/"), "rb") as handler:
    # print(handler.read())
    data = handler.read()
    decompressed_data = brotli.decompress(data)
    obj = pickle.loads(decompressed_data)

# print(c[1].files)
# avg = reduce(lambda x, y: x+y, map(lambda c: c.files, commits))/ len(commits)
# c = commits[0]

# print(c.stats.additions)

In [10]:
print([o for o in obj if o.files is None])

[Commit(sha="fffdc7e483462617869d63f38d4f6473b084c876")]


## Gathering the data

In [7]:
import glob
import pickle
from collections import defaultdict
from pprint import pprint
import pandas as pd

repo_dicts = defaultdict(lambda: {})
metric_dicts = defaultdict(lambda: defaultdict(lambda: {}))
for p in glob.glob(".\intermediate\*\*.pickle"):
    with open(p, "rb") as file:
        metric = pickle.load(file)
        dic = metric.__dict__
        repo_name = dic.pop("repo_name")
        metric_type = p.split('\\')[-1].split(".")[0]

        metric_dicts[metric_type][repo_name] |= dic
        repo_dicts[repo_name] |= dic
        
data = pd.DataFrame.from_dict(repo_dicts)
data.to_csv("metrics.csv",index=True)

## Working with the data

In [10]:
name_predicate_pairs= [
    ("Have releases", lambda inner: inner.get("[Releases] Count")),
    ("Have no releases", lambda inner: not inner.get("[Releases] Count")),
    ("Have source code only releases", lambda inner: not inner.get("[Releases] Total downloads")),
    ("Have binary and source code releases", lambda inner: inner.get("[Releases] Total downloads")),
    ]


data_to_gather = {
    name: {repo: inner for repo, inner in repo_dicts.items() if func(inner)}
    for name, func in name_predicate_pairs
}

print(f"Metrics count: {len(data)}")
print(f"Repo    count: {len(data.keys())}\n")

for type, inner in data_to_gather.items():
    print(f"{type}: {len(inner)}")
    # display(pd.DataFrame(inner))

Metrics count: 107
Repo    count: 173

Have releases: 121
Have no releases: 52
Have source code only releases: 114
Have binary and source code releases: 59


## Calculating correlations

In [129]:
from itertools import combinations_with_replacement
from IPython.display import display, Markdown, Latex

LOWER_BOUND = 0.65

def prune_dict(dct, mask):
    result = {}
    for k, v in mask.items():
        if isinstance(v, dict):
            value = prune_dict(dct[k], v)
            if value: # check that dict is non-empty
                result[k] = value
        elif v:
            result[k] = dct[k]
    return result

# for data, name in zip([data, downloads_data, no_downloads_data], ["correlations", "downloads_correlations", "no_downloads_correlations"]):
values = {name:data.loc[name] for name in data.index}
correlations = pd.DataFrame(values).corr(method='pearson').values
frame = pd.DataFrame(pd.np.triu(correlations), index=data.index, columns=data.index)
frame.to_csv('correlations.csv', index=True)


for g1, g2 in combinations_with_replacement(metric_dicts.items(), 2):
    m1, inner_dict1 = g1
    m2, inner_dict2 = g2

    res = {name:inner_dict1[name] | inner_dict2[name] for name in inner_dict1}
    res_df = pd.DataFrame(res).transpose().corr(method='pearson')
    metric1_names = list(list(inner_dict1.values())[0].keys())
    metric2_names = list(list(inner_dict2.values())[0].keys())
    df = res_df[metric1_names].loc[metric2_names]
    
    # df = df[df[df.columns] > 0.7]
    mask = df.where(df != 1).where(df != 0).where(abs(df) > LOWER_BOUND)
    mask = mask.dropna(how="all").dropna(axis="columns", how="all")
    mask = mask.iloc[:, :-1].notnull().astype(float).groupby(level=0).sum().to_dict()
    tmp = prune_dict(df.to_dict(), mask)

    if tmp:
        display(Markdown(f"# {m1} X {m2}"))
        for name, val in tmp.items():   
            # print(tmp)
            display(pd.DataFrame({name: val}))
    # display(pd.DataFrame(df))

  frame = pd.DataFrame(pd.np.triu(correlations), index=data.index, columns=data.index)


# Commits X Commits

Unnamed: 0,[Commits] Count
[Commits] Per day,0.784588


Unnamed: 0,[Commits] Total lines added
[Commits] Average additions,0.70534
[Commits] Average deletions,0.802878
[Commits] Total lines deleted,0.958184


Unnamed: 0,[Commits] Total lines deleted
[Commits] Average deletions,0.673539
[Commits] Total lines added,0.958184


Unnamed: 0,[Commits] Average additions
[Commits] Average deletions,0.94839
[Commits] Average files changed,0.933814
[Commits] Total lines added,0.70534


Unnamed: 0,[Commits] Average deletions
[Commits] Average additions,0.94839
[Commits] Average files changed,0.80291
[Commits] Total lines added,0.802878
[Commits] Total lines deleted,0.673539


Unnamed: 0,[Commits] Average files changed
[Commits] Average additions,0.933814
[Commits] Average deletions,0.80291


# Commits X Contributors

Unnamed: 0,[Commits] Count
[Contributors Top-10] Average commits,0.727257
[Contributors Top-10] Average participation weeks,0.741778


Unnamed: 0,[Commits] Total lines added
[Contributors Top-10] Average additions,0.841847
[Contributors Top-10] Average deletions,0.838033


# Commits X Meta

Unnamed: 0,[Commits] Average additions
[Repo] Files,0.877736
[Repo] Size,0.688263


Unnamed: 0,[Commits] Average deletions
[Repo] Files,0.720174


# Contributors X Contributors

Unnamed: 0,[Contributors Top-10] Average commits
[Contributors Top-10] Average participation weeks,0.839245


Unnamed: 0,[Contributors Top-10] Average participation weeks
[Contributors Top-10] Average commits,0.839245


Unnamed: 0,[Contributors Top-10] Average additions
[Contributors Top-10] Average deletions,0.995033


Unnamed: 0,[Contributors Top-10] Average deletions
[Contributors Top-10] Average additions,0.995033


Unnamed: 0,[Contributors Top-10] Average followers
[Contributors Top-10] Average forks,0.967236
[Contributors Top-10] Average stars,0.963205
[Contributors Top-10] Average watchers,0.960362


Unnamed: 0,[Contributors Top-10] Average stars
[Contributors Top-10] Average followers,0.963205
[Contributors Top-10] Average forks,0.978754
[Contributors Top-10] Average watchers,0.973593


Unnamed: 0,[Contributors Top-10] Average watchers
[Contributors Top-10] Average followers,0.960362
[Contributors Top-10] Average forks,0.962885
[Contributors Top-10] Average stars,0.973593


# Contributors X Issues

Unnamed: 0,[Contributors] Count
[Issues] Total comments,0.719359


# Contributors X WorkflowRuns

Unnamed: 0,[Contributors Top-10] Average additions
[Workflow Runs] Average success duration (ms),0.701475


# Forks X Forks

Unnamed: 0,[Forks] Count
[Forks] Per day (True),0.815695


# Forks X Meta

Unnamed: 0,[Forks] Count
[Repo] Network members,0.999756
[Repo] Watchers,0.874756


# Issues X Issues

Unnamed: 0,[Issues] Total comments
[Issues] Count,0.773205
[Issues] Per day (True),0.665683


Unnamed: 0,[Issues] Count
[Issues] Open,0.709967
[Issues] Per day,0.812893
[Issues] Per day (True),0.854595
[Issues] Total comments,0.773205


Unnamed: 0,[Issues] Open
[Issues] Count,0.709967
[Issues] Per day,0.653239
[Issues] Per day (True),0.678638


Unnamed: 0,[Issues] Average participants (beside creator)
[Issues] Average comments,0.746953


Unnamed: 0,[Issues] Average comments
[Issues] Average participants (beside creator),0.746953


Unnamed: 0,[Issues] Per day
[Issues] Count,0.812893
[Issues] Open,0.653239
[Issues] Per day (True),0.975626


# Issues X Pulls

Unnamed: 0,[Issues] Total comments
[Pulls] Count,0.787926
[Pulls] Created per day,0.722461
[Pulls] Created per day (True),0.758612


Unnamed: 0,[Issues] Average assignees
[Pulls] Average assignees,0.716236


# Meta X Meta

Unnamed: 0,[Repo] Size
[Repo] Files,0.791074


Unnamed: 0,[Repo] Files
[Repo] Size,0.791074


Unnamed: 0,[Repo] Branches
[Repo] Deployments,0.833862


Unnamed: 0,[Repo] Watchers
[Repo] Network members,0.874472


Unnamed: 0,[Repo] Deployments
[Repo] Branches,0.833862


# Meta X Stars

Unnamed: 0,[Repo] Watchers
[Stars] Count,0.933091


# Pulls X Pulls

Unnamed: 0,[Pulls] Count
[Pulls] Created per day,0.740626
[Pulls] Created per day (True),0.897193
[Pulls] Total lines added,0.699002


Unnamed: 0,[Pulls] Closed to total
[Pulls] Mergeable to total,-0.725863


Unnamed: 0,[Pulls] Total lines added
[Pulls] Count,0.699002
[Pulls] Created per day,0.737517
[Pulls] Created per day (True),0.770747
[Pulls] Total lines deleted,0.738806


Unnamed: 0,[Pulls] Total lines deleted
[Pulls] Total lines added,0.738806


Unnamed: 0,[Pulls] Average lines deleted
[Pulls] Average files changed,0.675491


Unnamed: 0,[Pulls] Average lines added
[Pulls] Average commits,0.789796


Unnamed: 0,[Pulls] Average commits
[Pulls] Average lines added,0.789796


Unnamed: 0,[Pulls] Average files changed
[Pulls] Average lines deleted,0.675491


Unnamed: 0,[Pulls] Mergeable to total
[Pulls] Closed to total,-0.725863


Unnamed: 0,[Pulls] Created per day
[Pulls] Count,0.740626
[Pulls] Created per day (True),0.924544
[Pulls] Total lines added,0.737517


# Pulls X WorkflowRuns

Unnamed: 0,[Pulls] Created per day
[Workflow Runs] Average successes per day,0.664499


# Releases X Releases

Unnamed: 0,[Releases] Count
[Releases] Per day created (True),0.899357
[Releases] Per day published (True),0.848949


Unnamed: 0,[Releases] Total downloads
[Releases] Average asset downloads,0.774496
[Releases] Downloads per day (True),0.944255


Unnamed: 0,[Releases] Average asset downloads
[Releases] Downloads per day (True),0.766112
[Releases] Total downloads,0.774496


Unnamed: 0,[Releases] Per day created (True)
[Releases] Count,0.899357
[Releases] Per day published (True),0.97267


Unnamed: 0,[Releases] Per day published (True)
[Releases] Count,0.848949
[Releases] Per day created (True),0.97267


# Stars X Stars

Unnamed: 0,[Stars] Count
[Stars] Per day (True),0.673777


# WorkflowRuns X WorkflowRuns

Unnamed: 0,[Workflow Runs] Count
[Workflow Runs] Average successes per day,0.893371
[Workflow Runs] Average successes per day (True),0.831688


Unnamed: 0,[Workflow Runs] Average successes per day
[Workflow Runs] Average successes per day (True),0.850415
[Workflow Runs] Count,0.893371


Unnamed: 0,[Workflow Runs] Average successes per day (True)
[Workflow Runs] Average fails per day (True),0.659509
[Workflow Runs] Average successes per day,0.850415
[Workflow Runs] Count,0.831688
