In [76]:
import os
import pickle

import pandas as pd
from github import Github

### Initialization

In [77]:
# Create the GitHub client from a personal access token supplied via the
# GITHUB_TOKEN environment variable. A token must never be committed with the
# notebook; the token that used to be hardcoded here should be treated as
# leaked and revoked.
github_token = os.environ.get("GITHUB_TOKEN")
if not github_token:
    raise RuntimeError(
        "Set the GITHUB_TOKEN environment variable to a GitHub access token."
    )
g = Github(github_token)
# Select the repository whose labels, issues and tags are queried in this notebook.
repo = g.get_repo("elastic/elasticsearch")

### Issue labels

In [78]:
# Fetch the labels defined on the repository and keep only their names.
labels = repo.get_labels()
label_names = [label.name for label in labels]

In [79]:
label_names

[':Analytics/Aggregations',
 ':Analytics/Geo',
 ':Analytics/Graph',
 ':Analytics/Rollup',
 ':Core/Features/CAT APIs',
 ':Core/Features/Data streams',
 ':Core/Features/Features',
 ':Core/Features/ILM+SLM',
 ':Core/Features/Indices APIs',
 ':Core/Features/Ingest',
 ':Core/Features/Java High Level REST Client',
 ':Core/Features/Java Low Level REST Client',
 ':Core/Features/Monitoring',
 ':Core/Features/Stats',
 ':Core/Features/Watcher',
 ':Core/Infra/Build',
 ':Core/Infra/Circuit Breakers',
 ':Core/Infra/Core',
 ':Core/Infra/Logging',
 ':Core/Infra/Packaging',
 ':Core/Infra/Plugins',
 ':Core/Infra/REST API',
 ':Core/Infra/Resiliency',
 ':Core/Infra/Scripting',
 ':Core/Infra/Settings',
 ':Core/Infra/Transport API',
 ':Distributed/Allocation',
 ':Distributed/Autoscaling',
 ':Distributed/CCR',
 ':Distributed/CRUD',
 ':Distributed/Cluster Coordination',
 ':Distributed/Discovery-Plugins',
 ':Distributed/Distributed',
 ':Distributed/Engine',
 ':Distributed/Network',
 ':Distributed/Recovery',
 '

In [80]:
pdNames = pd.Series(label_names)

In [81]:
pdNames[~pdNames.str.startswith("v")]

0      :Analytics/Aggregations
1               :Analytics/Geo
2             :Analytics/Graph
3            :Analytics/Rollup
4      :Core/Features/CAT APIs
                ...           
115                 resiliency
116                    stalled
117               team-discuss
118              test-forwards
437                  won't fix
Length: 120, dtype: object

### Issues

""" download all the closed issues and dump them into a file
i = []
f = open('closed_issues.pkl', 'wb')
tmp = repo.get_issues(state="closed")
for issue in tmp:
    i.append(issue)
    pickle.dump(issue, f)

""" download all the closed issues and dump them into a file
i = []
f = open('open_issues.pkl', 'wb')
tmp = repo.get_issues(state="open")
for issue in tmp:
    i.append(issue)
    pickle.dump(issue, f)

In [174]:
# Read back every object that was pickled one after another into
# closed_issues.pkl. pickle.load raises EOFError once the stream is exhausted,
# which is what ends the loop.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
issues = []
with open("closed_issues.pkl", "rb") as pkl_file:
    try:
        while True:
            issues.append(pickle.load(pkl_file))
    except EOFError:
        pass

In [175]:
len(issues)

58172

import pprint

pprint.pprint(issues[0].__dict__)

In [176]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Fit the encoder on the repository's label names so that every name maps to a
# stable integer position (fit returns the encoder itself).
label_encoder = LabelEncoder().fit(label_names)

# Dense one-hot encoder. scikit-learn renamed the `sparse` keyword to
# `sparse_output` (added in 1.2, old name removed in 1.4), so try the new
# spelling first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

In [177]:
# Build one boolean row per issue: position k is True when the issue carries
# the label that label_encoder encodes as integer k.
n_labels = len(label_names)
issues_encoded = []
for issue in issues:
    row = np.zeros(n_labels, dtype=bool)
    for idx in label_encoder.transform([lbl.name for lbl in issue.labels]):
        row[idx] = True
    issues_encoded.append(row)

In [178]:
issue_labels = pd.DataFrame(issues_encoded, columns=label_encoder.classes_)

In [179]:
issue_labels.columns

Index([':Analytics/Aggregations', ':Analytics/Geo', ':Analytics/Graph',
       ':Analytics/Rollup', ':Core/Features/CAT APIs',
       ':Core/Features/Data streams', ':Core/Features/Features',
       ':Core/Features/ILM+SLM', ':Core/Features/Indices APIs',
       ':Core/Features/Ingest',
       ...
       'v7.7.0', 'v7.7.1', 'v7.7.2', 'v7.8.0', 'v7.8.1', 'v7.8.2', 'v7.9.0',
       'v7.9.1', 'v8.0.0', 'won't fix'],
      dtype='object', length=438)

Count the number of issues labelled ***bug*** for each version

In [180]:
# Every column whose name starts with 'v' is treated as a version label.
version_labels = [name for name in issue_labels.columns if name.startswith('v')]
# Restrict to rows flagged ">bug", total each version column, and show the
# 40 versions with the most bug issues.
(
    issue_labels
    .loc[issue_labels[">bug"], version_labels]
    .sum()
    .sort_values(ascending=False)
    .head(40)
)

v8.0.0           1175
v2.0.0-beta1     1029
v7.0.0-beta1      986
v6.0.0-alpha1     340
v7.2.0            324
v1.5.0            254
v6.7.0            242
v5.0.0-alpha1     232
v6.5.0            216
v6.4.0            195
v7.4.0            188
v6.6.0            184
v7.6.0            178
v7.7.0            174
v1.4.0.Beta1      167
v1.0.0.Beta1      163
v7.9.0            157
v7.5.0            156
v7.3.0            155
v6.3.0            139
v2.2.0            135
v1.2.0            127
v1.6.0            127
v1.1.0            114
v7.8.0            112
v6.0.0-beta1      104
v5.4.0            102
v1.3.0             92
v5.2.0             88
v5.1.1             86
v5.5.0             83
v6.2.0             83
v2.3.0             82
v7.0.0-rc2         81
v2.4.0             80
v5.0.0-alpha5      74
v5.0.0-beta1       73
v0.20.0.RC1        73
v6.1.0             71
v5.6.0             70
dtype: int64

### Tags

In [36]:
# Collect a (tag name, commit SHA) pair for every tag in the repository.
tags = repo.get_tags()
tmp = [(tag.name, tag.commit.sha) for tag in tags]

In [38]:
tagsDf = pd.DataFrame(tmp, columns=["Tag", "Sha"])

In [39]:
tagsDf.head()

Unnamed: 0,Tag,Sha
0,v7.8.1,b5ca9c58fb664ca8bf9e4057fc229b3396bf3a89
1,v7.8.0,757314695644ea9a1dc2fecd26d1a43856725e65
2,v7.7.1,ad56dce891c901a492bb1ee393f12dfff473a423
3,v7.7.0,81a1e9eda8e6183f5237786246f6dced26a10eaf
4,v7.6.2,ef48eb35cf30adf4db14086e8aabd07ef6fb113f


### ToDo

* Comment out part that downloads the issues into files
* transform the file into a dataframe with row=issue and columns for each existing label
* export it into csv for easier import
* Count how many issues each tag has
* Plot distribution of issue types per tag
* Plot distribution of issue types in general
* Does the number increase over time? (more in last releases? Sign of bias)
* Is there a trend in the types of issues that has become more popular?