In [2]:
import pandas as pd
import pickle
import json
import seaborn as sns
import pprint
pp = pprint.PrettyPrinter(depth=6)
from jira import JIRA
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15.0, 5.0)
pd.set_option('display.max_columns', 120)

In [3]:
# JIRA client used by the download cell below; no credentials are passed here.
jira = JIRA('https://jira.sonarsource.com/')

In [4]:
# Output folders, given relative to the notebook's working directory.
pkl_folder = "../pkl"  # NOTE(review): not referenced by any cell visible in this notebook section
csv_folder = "../csv"  # the extracted version/label/subtask tables are written here as CSV

In [94]:
# Names of the attributes read from `issue.fields` for every downloaded issue.
# They become the DataFrame columns (after "issue_key" and "issue_id"), in this order.
# NOTE(review): this cell has to be executed before the cell that builds
# `issue_tuples`; the NameError recorded further down comes from running it later.
# The `customfield_*` entries are presumably custom fields of this JIRA instance;
# only `customfield_11630` gets a readable name later on (renamed to "edition").
issue_fields = ['issuetype',
 'customfield_12130',
 'customfield_11041',
 'customfield_12132',
 'customfield_12131',
 'project',
 'customfield_12134',
 'customfield_12530',
 'fixVersions',
 'customfield_12133',
 'customfield_12136',
 'customfield_12532',
 'resolution',
 'customfield_10431',
 'customfield_12135',
 'customfield_12531',
 'customfield_12138',
 'customfield_10432',
 'customfield_12534',
 'customfield_12137',
 'customfield_10433',
 'customfield_12533',
 'customfield_11832',
 'customfield_11831',
 'customfield_11833',
 'resolutiondate',
 'workratio',
 'lastViewed',
 'watches',
 'created',
 'customfield_11032',
 'customfield_11033',
 'priority',
 'customfield_11630',
 'customfield_11233',
 'customfield_11036',
 'customfield_11830',
 'labels',
 'customfield_11631',
 'customfield_11038',
 'customfield_10930',
 'customfield_11347',
 'customfield_10931',
 'customfield_10932',
 'versions',
 'issuelinks',
 'assignee',
 'updated',
 'status',
 'components',
 'customfield_12031',
 'customfield_11140',
 'customfield_12030',
 'customfield_12033',
 'customfield_11141',
 'description',
 'customfield_12032',
 'customfield_12034',
 'customfield_12430',
 'customfield_11344',
 'customfield_11343',
 'customfield_10530',
 'customfield_11345',
 'customfield_10730',
 'customfield_11337',
 'customfield_11930',
 'customfield_11139',
 'customfield_11338',
 'summary',
 'creator',
 'subtasks',
 'customfield_11130',
 'customfield_11131',
 'reporter',
 'customfield_11132',
 'customfield_11133',
 'customfield_10243',
 'customfield_11335',
 'customfield_11334',
 'customfield_11730',
 'customfield_10434',
 'customfield_12536',
 'customfield_12535',
 'customfield_12139',
 'environment',
 'customfield_12538',
 'customfield_10437',
 'customfield_12537',
 'customfield_10438',
 'duedate',
 'votes',
 #'__module__',
 #'__dict__',
 #'__weakref__',
 #'__doc__'
       ]

| Field          | Description                                                     | Type of variable |
|----------------|-----------------------------------------------------------------|------------------|
| fixVersions    | The version(s) in which the issue was or will be fixed          | list of versions |
| issuetype      | Type of issue                                                   | Categorical      |
| project        | The project the issue belongs to                                | String           |
| resolution     | The resolution of the issue                                     | Categorical      |
| resolutiondate | The date the issue was resolved                                 | Date             |
| workratio      | calculated as workRatio = (timeSpent / originalEstimate) x 100  | Number           |
| lastViewed     | The date at which the issue was last viewed                     | Date             |
| watches        | Users watching the issue                                        | ?                |
| created        | The date of the creation of the issue                           | Date             |
| priority       | Priority assigned to the issue                                  | Categorical      |
| labels         | Labels assigned to the issue                                    | Strings          |
| versions       | The versions affected by the issue                              | List             |
| assignee       | The user the issue is assigned to                               | User             |
| status         | The status of the issue                                         | Categorical      |
| description    | The description of the issue                                    | String           |
| summary        | A summary of the issue reported                                 | String           |
| creator        | The user that created the issue                                 | User             |
| subtasks       | The subtasks of the issue                                       | List of issues   |
| reporter       | The user who reported the issue. May be the same as the creator | User             |
| duedate        | The date the issue is due to be resolved                        | Date             |

Issues are retrieved and processed here because pickle cannot serialize the JIRA issue objects.
Therefore they need to be flattened into a DataFrame so that they can be saved as CSV.

In [6]:
# Download all issues of the SONAR project, one page per request.
# NOTE(review): the JQL has no ORDER BY clause; confirm the server's default
# ordering is stable across pages, otherwise issues may be duplicated or missed.
size = 100          # page size requested from the server
initial = 0         # number of non-empty pages fetched so far
issue_tuples = []   # filled by the tuple-building cell further down
all_issues = []
while True:
    # Offset by the number of issues actually received, so that nothing is
    # skipped if the server returns fewer than `size` results per page.
    start = len(all_issues)
    issues = jira.search_issues('project=SONAR', startAt=start, maxResults=size)
    if len(issues) == 0:
        break
    # extend() grows the list in place; `all_issues + issues` copied the whole
    # list again for every page.
    all_issues.extend(issues)
    initial += 1
    if initial % 10 == 0:
        print(f"Issues downloaded: {len(all_issues)}")

Issues downloaded: 1000
Issues downloaded: 2000
Issues downloaded: 3000
Issues downloaded: 4000
Issues downloaded: 5000
Issues downloaded: 6000
Issues downloaded: 7000
Issues downloaded: 8000
Issues downloaded: 9000
Issues downloaded: 10000
Issues downloaded: 11000


In [7]:
#create tuples for dataframe creation
for issue in all_issues:
    values = []
    values.append(issue.key)
    values.append(issue.id)
    for field in issue_fields:
        values.append(getattr(issue.fields, field))
    issue_tuples.append(values)

NameError: name 'issue_fields' is not defined

In [None]:
# One row per issue: key and id first, then the raw JIRA fields in list order.
issue_columns = ["issue_key", "issue_id"] + issue_fields
issues_df = pd.DataFrame(issue_tuples, columns=issue_columns)
issues_df

In [None]:
def extract_features(df, field, fields_to_extract):
    """Add one column per attribute extracted from the objects in ``df[field]``.

    For every name in ``fields_to_extract`` a column ``"{field}_{name}"`` is
    added to ``df``. It holds ``getattr(value, name)`` for each value of
    ``df[field]``, or ``None`` where the value is falsy (e.g. ``None``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and to add the new columns to (modified in place).
    field : str
        Name of the column holding the source objects.
    fields_to_extract : iterable of str
        Attribute names to pull out of each object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for fte in fields_to_extract:
        # Read from the frame that was passed in, not from the global `issues_df`,
        # so the function works for any DataFrame.
        df[f"{field}_{fte}"] = df[field].apply(
            lambda value, attr=fte: getattr(value, attr) if value else None
        )
    return df

#### Extract features

In [None]:
# Flatten the JIRA objects stored in these columns into plain columns named
# "<field>_<attribute>" (e.g. "issuetype_name").
attributes_by_field = [
    ("issuetype", ["id", "name"]),
    ("priority", ["id", "name"]),
    ("assignee", ["key", "name"]),
    ("reporter", ["key", "name"]),
    ("creator", ["key", "name"]),
    ("status", ["id", "name"]),
    ("resolution", ["id", "name"]),
    ("votes", ["votes"]),
]
for source_field, attributes in attributes_by_field:
    extract_features(issues_df, source_field, attributes)
issues_df

#### Type of columns

In [None]:
# Cast the issue id to a number and parse the timestamp columns as UTC
# datetimes; values that cannot be parsed become NaT (errors="coerce").
issues_df["issue_id"] = pd.to_numeric(issues_df["issue_id"])
for date_column in ("created", "resolutiondate", "updated", "duedate"):
    issues_df[date_column] = pd.to_datetime(issues_df[date_column], errors="coerce", utc=True)

#### Rename columns

In [None]:
# Give the custom field analysed in the "Edition" section a readable name.
readable_names = {"customfield_11630": "edition"}
issues_df = issues_df.rename(columns=readable_names)

In [None]:
#issues_df.to_csv("csv/issues.csv")
#issues_df = pd.read_csv("csv/issues.csv", index_col=0)
issues_df

### Extract versions

In [None]:
def extract_versions(all_issues, version_field, prefix):
    """Build a long-format table with one row per (issue, version) pair.

    Parameters
    ----------
    all_issues : iterable
        Issue objects exposing ``id`` and ``fields.<version_field>``.
    version_field : str
        Name of the attribute on ``issue.fields`` that holds the versions;
        each version must expose ``name`` and ``id``.
    prefix : str
        Prefix for the two version columns of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``issue_id``, ``{prefix}_name`` and ``{prefix}_id``. Issues
        without versions contribute no rows.
    """
    rows = [
        (issue.id, version.name, version.id)
        for issue in all_issues
        # A field that is None is treated like an empty list instead of
        # raising TypeError when iterated.
        for version in (getattr(issue.fields, version_field) or ())
    ]
    return pd.DataFrame(rows, columns=["issue_id", f"{prefix}_name", f"{prefix}_id"])

In [None]:
# Save the issue/fix-version pairs, then continue with the copy read back from
# disk so the notebook works on exactly what was saved.
# NOTE(review): the round trip presumably also turns `issue_id` into an integer
# column (read_csv type inference) — confirm before removing it.
fixversions_csv = f"{csv_folder}/issues_fixversions.csv"
extract_versions(all_issues, "fixVersions", "fixVersion").to_csv(fixversions_csv)
issues_fixversions = pd.read_csv(fixversions_csv, index_col=0)
issues_fixversions

In [None]:
# Save the issue/affected-version pairs, then continue with the copy read back
# from disk so the notebook works on exactly what was saved.
# NOTE(review): the round trip presumably also turns `issue_id` into an integer
# column, which the later merge with `issues_df` on "issue_id" relies on — confirm.
versions_csv = f"{csv_folder}/issues_versions.csv"
extract_versions(all_issues, "versions", "version").to_csv(versions_csv)
issues_versions = pd.read_csv(versions_csv, index_col=0)
issues_versions

### Extract labels

In [None]:
def extract_labels(all_issues, field, column):
    """Build a long-format table with one row per (issue, value) pair.

    Parameters
    ----------
    all_issues : iterable
        Issue objects exposing ``id`` and ``fields.<field>``.
    field : str
        Name of the attribute on ``issue.fields`` that holds an iterable of
        plain values (e.g. label strings).
    column : str
        Name of the value column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``issue_id`` and ``column``. Issues without values contribute
        no rows.
    """
    rows = [
        (issue.id, value)
        for issue in all_issues
        # A field that is None is treated like an empty list instead of
        # raising TypeError when iterated.
        for value in (getattr(issue.fields, field) or ())
    ]
    return pd.DataFrame(rows, columns=["issue_id", column])

In [None]:
# Save the issue/label pairs, then continue with the copy read back from disk
# so the notebook works on exactly what was saved.
labels_csv = f"{csv_folder}/issues_labels.csv"
extract_labels(all_issues, "labels", "label").to_csv(labels_csv)
issues_labels = pd.read_csv(labels_csv, index_col=0)
issues_labels

### Extract subtasks

In [None]:
def extract_subtasks(all_issues, field, prefix):
    """Build a long-format table with one row per (issue, subtask) pair.

    Parameters
    ----------
    all_issues : iterable
        Issue objects exposing ``id`` and ``fields.<field>``.
    field : str
        Name of the attribute on ``issue.fields`` that holds the subtasks;
        each subtask must expose ``key`` and ``id``.
    prefix : str
        Prefix for the two subtask columns of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``issue_id``, ``{prefix}_key`` and ``{prefix}_id``. Issues
        without subtasks contribute no rows.
    """
    rows = [
        (issue.id, subtask.key, subtask.id)
        for issue in all_issues
        # A field that is None is treated like an empty list instead of
        # raising TypeError when iterated.
        for subtask in (getattr(issue.fields, field) or ())
    ]
    return pd.DataFrame(rows, columns=["issue_id", f"{prefix}_key", f"{prefix}_id"])

In [None]:
# Save the issue/subtask pairs, then continue with the copy read back from disk
# so the notebook works on exactly what was saved.
# The variable name `issues_substaks` is kept as-is in case other cells use it.
subtasks_csv = f"{csv_folder}/issues_subtasks.csv"
extract_subtasks(all_issues, "subtasks", "subtask").to_csv(subtasks_csv)
issues_substaks = pd.read_csv(subtasks_csv, index_col=0)
issues_substaks

## Field analysis

### Workratio

In [None]:
sns.distplot(issues_df.workratio)

In [None]:
# Rows per distinct workratio value; after count() the "issue_id" column holds
# the number of non-null issue ids in each group.
workratio_counts = issues_df.groupby("workratio").count().reset_index()
sns.barplot(x="workratio", y="issue_id", data=workratio_counts)

### IssueType

In [None]:
sns.countplot(x="issuetype_name", data=issues_df[["issue_id", "issuetype_name"]])

### Resolution

In [None]:
issues_df.resolution_name.unique()

In [None]:
ax = sns.countplot(x="resolution_name", data=issues_df[["issue_id", "resolution_name"]])
ax.tick_params(axis='x', labelrotation= 60)

### Watches

In [None]:
pp.pprint(issues_df.watches[0])

### Priority

In [None]:
issues_df.priority_name.unique()

In [None]:
ax = sns.countplot(x="priority_name", data=issues_df[["issue_id", "priority_name"]])

### Labels

In [None]:
len(issues_labels.label.unique())

In [None]:
sns.distplot(issues_labels.groupby("issue_id").count().label)

### Versions

In [None]:
len(issues_versions.version_name.unique())

In [None]:
# Number of issues per affected version, most frequently affected first.
issues_per_version = issues_versions.groupby("version_name").count().issue_id
top_versions = issues_per_version.reset_index().sort_values(by="issue_id", ascending=False)
top_versions.head(10)

In [None]:
ax = sns.barplot(x="version_name", y="issue_id", data=top_versions[["issue_id", "version_name"]].head(20))

In [None]:
version_merged = pd.merge(issues_df, issues_versions, on="issue_id")

In [None]:
top_versions.head(10).version_name.values

In [None]:
versions_sorted = issues_versions.sort_values(by="version_name").version_name.unique()

In [None]:
# Issue-type breakdown per affected version, drawn as stacked panels that each
# cover a consecutive slice of `versions_sorted`.
n_panels = 5
versions_per_panel = 30
issuetype_palette = {"Bug":"r","Improvement":"orange",
                     "New Feature": "green",
                     "Task": "yellow",
                     "Sub-task": "pink",
                     "Documentation": "grey"}
f, axes = plt.subplots(n_panels, 1, figsize=(15,20), sharey=True)
for i in range(n_panels):
    panel_versions = versions_sorted[versions_per_panel*i:versions_per_panel*(i+1)]
    data = version_merged[version_merged.version_name.isin(panel_versions)]
    # Only the first 3000 rows (in version-name order) of each panel are plotted.
    ax = sns.countplot(ax=axes[i], x="version_name",
                       hue="issuetype_name", data=data.sort_values(by="version_name").head(3000),
                       palette=issuetype_palette)
    ax.tick_params(axis='x', labelrotation= 60)
    # Keep the legend on the first panel only. A panel without data has no
    # legend, so guard against get_legend() returning None.
    legend = ax.get_legend()
    if i > 0 and legend is not None:
        legend.remove()

## No version issues
Ideally, every bug would be labeled with an affected version. Why is that not the case?

In [None]:
# Issues whose id never appears in the issue/version merge, i.e. issues with
# no affected version; then keep only the bugs among them.
ids_with_version = version_merged.issue_id.unique()
has_version = issues_df.issue_id.isin(ids_with_version)
no_version_issues = issues_df[~has_version]
is_bug = no_version_issues.issuetype_name == "Bug"
bug_no_version = no_version_issues[is_bug]
bug_no_version

In [None]:
# Resolved bugs without an affected version, reduced to the columns needed to
# read what each issue was about and how it was closed.
overview_columns = ["summary", "description", "status_name", "resolution_name", "resolutiondate","issue_id", "issue_key"]
is_resolved = bug_no_version.status_name == "Resolved"
bug_no_version.loc[is_resolved, overview_columns]

In [None]:
bug_no_version[bug_no_version.resolution_name == "Fixed"]

### Status

In [None]:
# Use the extracted status name, like the resolution and priority sections do;
# the raw `status` column holds the JIRA status objects rather than their names.
issues_df.status_name.unique()

In [None]:
# Count issues per status name (the raw `status` column holds JIRA objects),
# consistent with the other count plots that use the `*_name` columns.
ax = sns.countplot(x="status_name", data=issues_df[["issue_id", "status_name"]])

### Edition

In [None]:
ax = sns.countplot(x="edition", data=issues_df)

In [None]:
issues_df.edition.describe()

In [None]:
issues_df.edition.isna().sum()

In [None]:
# Fixed bugs that have no edition set. Filter on `resolution_name`: the raw
# `resolution` column holds JIRA objects, so comparing it to the string "Fixed"
# would not select the intended rows.
issues_df[(issues_df.edition.isna()) & (issues_df.resolution_name=="Fixed") & (issues_df.issuetype_name=="Bug")]