In [61]:
import re
import os

import numpy as np
import pandas as pd

from pydriller import Repository

![](../experiment/data/output/bug_evolve.png)

![](../experiment/data/output/sq_evolve.png)

In [2]:
from sq_effect_study.analyse_sq_history import get_start_weeks_per_proj

# The five case systems examined in this notebook.
systems = ["daffodil", "groovy", "hadoop-ozone", "karaf", "ratis"]

# Load the per-project start date/week table and keep only the case systems.
start_dates_df = get_start_weeks_per_proj("../experiment/data/input").loc[
    lambda frame: frame["project_gh"].isin(systems)
]
start_dates_df

Unnamed: 0,project_gh,date,week
2,daffodil,2020-02-27 17:29:14+00:00,202008
4,groovy,2020-03-22 00:59:45+00:00,202011
5,hadoop-ozone,2019-11-14 17:45:30+00:00,201945
10,karaf,2020-10-05 14:20:05+00:00,202040
16,ratis,2020-05-28 10:30:20+00:00,202021


In [3]:
ref_day = pd.to_datetime("14-Jun-2021", utc=True)
ref_day

Timestamp('2021-06-14 00:00:00+0000', tz='UTC')

For Karaf, we do not have a complete year of data but only ca. 2/3 of a year.

In [4]:
ref_day - start_dates_df.date

2    472 days 06:30:46
4    448 days 23:00:15
5    577 days 06:14:30
10   251 days 09:39:55
16   381 days 13:29:40
Name: date, dtype: timedelta64[ns]

In [11]:
# NOTE(review): a cell only displays its last expression, so the
# "one year earlier" series on the next line is computed and then discarded.
start_dates_df.date - pd.DateOffset(years=1)
# Each project's start date shifted one year forward (the value displayed below).
start_dates_df.date + pd.DateOffset(years=1)



2    2021-02-27 17:29:14+00:00
4    2021-03-22 00:59:45+00:00
5    2020-11-14 17:45:30+00:00
10   2021-10-05 14:20:05+00:00
16   2021-05-28 10:30:20+00:00
Name: date, dtype: datetime64[ns, UTC]

# For which systems do we have statistically significant decrease of defect reports?

In [5]:
from sq_effect_study.analyse_issue_tracker import get_bug_frequencies_as_df, compute_stats


# Directory with the study's input data; later cells reuse `inpath`.
inpath = "../experiment/data/input"
# Bug frequency table; later cells read its columns `project_gh`,
# `created_week` and `bugs_per_week`.
defect_df = get_bug_frequencies_as_df(inpath)

# NOTE(review): compute_stats is defined outside this notebook. Judging from the
# output below it prints one line per project and returns the projects with a
# significant decrease -- confirm the meaning of the printed columns in its source.
compute_stats(defect_df, start_dates_df)

Daffodil False none 2.0 2.0 2.264228379268529 0.13239213892922697
Groovy True increase 4.0 5.0 9.426642021850904 0.002138555235024594
Hadoop Ozone False decrease 15.0 12.0 2.8829243321285807 0.08952330630935662
Karaf False increase 2.0 2.5 0.23494204610350103 0.6278833438878546
Ratis True decrease 3.0 2.0 15.423150872226364 8.592909969954674e-05


['ratis']

Ratis is the only project in the dataset for which the number of reported defects is significantly lower in the year after SonarCloud (SC) application than in the year before.

Hadoop Ozone is close though with a p-value that is only ca. 0.04 above 0.05.

# Closer Analysis of Ratis 

We analyse Ratis more closely, since it is the only project in our study for which we can find a statistically significant decrease of reported bugs in two consecutive years (one before and one after introduction of SonarQube).

### Median and average number of reported defects

In [21]:
# Summary statistics of the Ratis `bugs_per_week` values in the year before and
# the year after the project's start date taken from `start_dates_df`.
# The project subset is stored (instead of being evaluated and thrown away) and
# reused for both periods.
ratis_defects = defect_df[defect_df.project_gh == "ratis"]

start_dt = pd.to_datetime(start_dates_df[start_dates_df.project_gh == "ratis"].date.iloc[0])
# np.datetime64 bounds are what `created_week` is compared against.
start_dt64 = np.datetime64(start_dt)
lower_dt = np.datetime64(start_dt - pd.DateOffset(years=1))
upper_dt = np.datetime64(start_dt + pd.DateOffset(years=1))

# Half-open window [lower_dt, start_dt): the year before the start date.
in_year_before = (ratis_defects.created_week >= lower_dt) & (
    ratis_defects.created_week < start_dt64
)
before = ratis_defects[in_year_before].bugs_per_week

print(f"In period {lower_dt} to {start_dt}:")
print(before.describe())

# Half-open window [start_dt, upper_dt): the year after the start date.
in_year_after = (ratis_defects.created_week >= start_dt64) & (
    ratis_defects.created_week < upper_dt
)
after = ratis_defects[in_year_after].bugs_per_week

print(f"In period {start_dt} to {upper_dt}:")
print(after.describe())

In period 2019-05-28T10:30:20.000000 to 2020-05-28 10:30:20+00:00:
count    47.000000
mean      4.042553
std       2.773770
min       1.000000
25%       2.000000
50%       3.000000
75%       5.000000
max      12.000000
Name: bugs_per_week, dtype: float64
In period 2020-05-28 10:30:20+00:00 to 2021-05-28T10:30:20.000000:
count    40.000000
mean      2.150000
std       1.424151
min       1.000000
25%       1.000000
50%       2.000000
75%       3.000000
max       7.000000
Name: bugs_per_week, dtype: float64


### Issues that refer to `[Ss]onar`

In [52]:
PATTERN = "[Ss]onar"


def identify_sonar_issues(sys_name, input_dir="../experiment/data/input"):
    """Return the resolved issues of one system whose description mentions Sonar.

    Reads ``<input_dir>/<sys_name>_jira.csv``, converts the ``created`` column to
    timezone-aware UTC timestamps and keeps the rows whose ``description``
    matches ``PATTERN`` and whose ``status`` equals ``"Resolved"``.

    Parameters
    ----------
    sys_name : str
        Name of the case system; used for the file name and the ``project`` column.
    input_dir : str, optional
        Directory containing the ``*_jira.csv`` files. Defaults to the path that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame (original row index preserved) with an additional last
        column ``project`` holding ``sys_name``.
    """
    issues_path = f"{input_dir}/{sys_name}_jira.csv"
    issues = pd.read_csv(issues_path)
    issues["created"] = pd.to_datetime(issues["created"], utc=True)

    # Missing descriptions count as "no match" (na=False).
    mentions_sonar = issues["description"].str.contains(PATTERN, na=False, regex=True)
    is_resolved = issues["status"] == "Resolved"

    # .assign builds a new frame, so no column is written into a slice of a
    # filtered frame (which is what triggers SettingWithCopyWarning).
    return issues[mentions_sonar & is_resolved].assign(project=sys_name)
    
    
systems = ["daffodil", "groovy", "hadoop-ozone", "karaf", "ratis"]

# Gather the resolved Sonar-related issues of every case system and report
# how many were found for each one.
sdfs = []
for name in systems:
    sdfs.append(identify_sonar_issues(name))
    print(
        f"Number of resolved issues that mention `[Ss]onar for {name}: {len(sdfs[-1])}"
    )

# All systems in one frame; show the Ratis rows.
sdf = pd.concat(sdfs)
sdf.loc[sdf["project"] == "ratis"]

Number of resolved issues that mention `[Ss]onar for daffodil: 0
Number of resolved issues that mention `[Ss]onar for groovy: 1
Number of resolved issues that mention `[Ss]onar for hadoop-ozone: 96
Number of resolved issues that mention `[Ss]onar for karaf: 1
Number of resolved issues that mention `[Ss]onar for ratis: 12


Unnamed: 0,issue_type,issue_component,creator_name,creator_display_name,reporter_name,reporter_display_name,priority,description,labels,created,resolution,updated,status,id,key,project
15,Bug,[],softgitron,Roni Juntunen,softgitron,Roni Juntunen,Minor,Sonar Qube has detected one possible NPE issue...,[],2021-04-24 12:09:56+00:00,2021-05-05 11:37:08+00:00,2021-05-05 11:37:08+00:00,Resolved,13374812,RATIS-1367,ratis
16,Bug,[],softgitron,Roni Juntunen,softgitron,Roni Juntunen,Minor,Sonar Qube has detected two possible NPE issue...,[],2021-04-24 11:28:18+00:00,2021-05-14 02:03:27+00:00,2021-05-14 02:04:09+00:00,Resolved,13374805,RATIS-1366,ratis
17,Bug,[],softgitron,Roni Juntunen,softgitron,Roni Juntunen,Minor,SonarQube has detected one possible NPE issue ...,[],2021-04-24 11:15:30+00:00,2021-04-26 09:34:44+00:00,2021-04-26 09:34:44+00:00,Resolved,13374802,RATIS-1365,ratis
18,Bug,[],softgitron,Roni Juntunen,softgitron,Roni Juntunen,Minor,SonarQube has detected two possible NPE issues...,[],2021-04-24 10:21:29+00:00,2021-05-05 11:32:38+00:00,2021-05-05 11:32:38+00:00,Resolved,13374795,RATIS-1364,ratis
71,Task,[],adoroszlai,Attila Doroszlai,adoroszlai,Attila Doroszlai,Major,Java 11 needs to be used for running the Sonar...,[],2021-02-05 10:01:59+00:00,2021-02-17 07:20:16+00:00,2021-02-17 07:47:45+00:00,Resolved,13357017,RATIS-1311,ratis
76,Improvement,[],adoroszlai,Attila Doroszlai,adoroszlai,Attila Doroszlai,Minor,GitHub Actions workflows for Ratis CI have two...,[],2021-02-04 11:58:28+00:00,2021-02-04 23:48:03+00:00,2021-02-08 08:47:40+00:00,Resolved,13356746,RATIS-1306,ratis
307,Sub-task,[],amaliujia,Rui Wang,amaliujia,Rui Wang,Major,https://sonarcloud.io/project/issues?id=apache...,[],2020-09-25 04:05:40+00:00,2020-09-25 05:58:37+00:00,2020-09-25 13:42:35+00:00,Resolved,13329329,RATIS-1075,ratis
328,Bug,[],maobaolong,Baolong Mao,maobaolong,Baolong Mao,Minor,https://sonarcloud.io/project/issues?id=apache...,[],2020-09-08 14:18:36+00:00,2020-10-29 15:59:24+00:00,2020-10-29 15:59:24+00:00,Resolved,13326499,RATIS-1054,ratis
428,Sub-task,[],dineshchitlangia,Dinesh Chitlangia,dineshchitlangia,Dinesh Chitlangia,Major,[https://sonarcloud.io/project/issues?id=apach...,[],2020-05-28 22:05:21+00:00,2020-12-03 01:15:34+00:00,2020-12-03 07:52:19+00:00,Resolved,13308177,RATIS-953,ratis
431,Sub-task,[],dineshchitlangia,Dinesh Chitlangia,dineshchitlangia,Dinesh Chitlangia,Major,"InterruptedException should not be ignored, ei...",['pull-request-available'],2020-05-28 17:03:14+00:00,2020-11-05 05:54:54+00:00,2020-11-05 05:56:08+00:00,Resolved,13308101,RATIS-950,ratis


In [54]:
sdf[sdf.project == "ratis"].shape

(12, 16)

In [56]:
# Keys of the resolved Ratis issues found by the description search, extended
# by four keys that are added by hand.
# NOTE(review): presumably the hand-added issues are Sonar-related but were not
# matched by the description search -- confirm.
iss_keys = list(sdf[sdf.project == "ratis"].key.values) + [
    "RATIS-1051",
    "RATIS-1052",
    "RATIS-1053",
    "RATIS-1055",
    # "RATIS-949",  # These two issues were still open on 14 Jun 2021,
    # "RATIS-952",  # therefore they are deliberately not part of this list.
]
len(iss_keys)

16

### Commits that address SonarQube issues

In [57]:
def identify_commits_for_issue(iss_key, sys_name):
    """Find the commits of a case system whose message references an issue key.

    The local clone is expected at ``$HOME/case_systems/<sys_name>``. Every
    commit of that repository is visited and its message is searched for the
    issue key, using the citation convention of the respective project.

    Parameters
    ----------
    iss_key : str
        Issue key such as ``"RATIS-1367"``. It is matched literally.
    sys_name : str
        One of ``"daffodil"``, ``"hadoop-ozone"``, ``"ratis"``, ``"groovy"``,
        ``"karaf"``.

    Returns
    -------
    pandas.DataFrame
        One row per matching commit with the columns ``project``, ``iss_key``,
        ``c_hash``, ``date`` (author date) and ``msg``.

    Raises
    ------
    ValueError
        If no commit-message convention is known for ``sys_name``.
    KeyError
        If the ``HOME`` environment variable is not set.
    """
    # Escape the key so that it can never be interpreted as regex syntax.
    key = re.escape(iss_key)

    # Raw f-strings keep backslashes such as `\.` intact for the regex engine.
    if sys_name == "daffodil":
        # Daffodil puts ticket references at the end of the message, if at all
        pattern = re.compile(rf"{key}$")
    elif sys_name in ("hadoop-ozone", "ratis"):
        pattern = re.compile(rf"{key}\. ")
    elif sys_name == "groovy":
        pattern = re.compile(rf"{key}[:,]")
    elif sys_name == "karaf":
        pattern = re.compile(rf"\[{key}\]")
    else:
        raise ValueError(
            f"No commit message convention known for system {sys_name!r}"
        )

    repo_path = os.path.join(os.environ["HOME"], "case_systems", sys_name)

    # NOTE: this walks the complete history on every call, so calling it once
    # per issue key is expensive for large repositories.
    rows = [
        (sys_name, iss_key, commit.hash, commit.author_date, commit.msg)
        for commit in Repository(path_to_repo=repo_path).traverse_commits()
        if pattern.search(commit.msg)
    ]
    return pd.DataFrame(rows, columns=["project", "iss_key", "c_hash", "date", "msg"])


# One frame of matching commits per issue key (each call walks the repository).
ratis_cdfs = [identify_commits_for_issue(key, "ratis") for key in iss_keys]

# Stack the per-issue frames and attach a GitHub link for every commit hash.
ratis_cdf = pd.concat(ratis_cdfs).assign(
    gh_url=lambda frame: [
        f"https://github.com/apache/ratis/commit/{sha}" for sha in frame["c_hash"]
    ]
)

In [59]:
ratis_cdf.shape

(15, 6)

In [60]:
ratis_cdf

Unnamed: 0,project,iss_key,c_hash,date,msg,gh_url
0,ratis,RATIS-1367,040bc52e19a5e36f5710ccd4fc1981e862e691e8,2021-05-05 14:35:48+03:00,RATIS-1367. Add null check for RaftConfigurati...,https://github.com/apache/ratis/commit/040bc52...
0,ratis,RATIS-1366,d65ca26a0291fc6067f860eff4ff3092d25c0aec,2021-05-14 05:03:10+03:00,RATIS-1366. Fix NPE issues in MetaStateMachine...,https://github.com/apache/ratis/commit/d65ca26...
0,ratis,RATIS-1365,ff8aa668f1a0569ba5e6b0f30dbd51a673913344,2021-04-26 12:34:12+03:00,RATIS-1365. Add message for potential NPE in C...,https://github.com/apache/ratis/commit/ff8aa66...
0,ratis,RATIS-1364,9577d564eeac36cf449843d34b7010b33d634818,2021-05-05 14:31:11+03:00,RATIS-1364. Fix Sonar Qube issues in IOUtils (...,https://github.com/apache/ratis/commit/9577d56...
0,ratis,RATIS-1311,87bd1fd1df9f02e83b973291d507ad002bd9d3f4,2021-02-17 08:19:35+01:00,RATIS-1311. Upgrade Java for Sonar check (#419),https://github.com/apache/ratis/commit/87bd1fd...
0,ratis,RATIS-1306,9d1b711f9d4606145e19a28270b0c50b04dc869b,2021-02-05 00:44:43+01:00,RATIS-1306. Eliminate duplicated GitHub Action...,https://github.com/apache/ratis/commit/9d1b711...
0,ratis,RATIS-1075,9b1d2c18f5677f3d443344e69f98e6c3ac953835,2020-09-24 22:58:10-07:00,RATIS-1075. Classes that implement AutoCloseab...,https://github.com/apache/ratis/commit/9b1d2c1...
0,ratis,RATIS-953,02caace296f4414de3eda9f4469dbd806ca594b1,2020-12-03 02:14:20+01:00,RATIS-953. XML Parsers should not be vulnerabl...,https://github.com/apache/ratis/commit/02caace...
0,ratis,RATIS-950,43a042a8bbe123bcb5e567af0aeced12eb299290,2020-11-05 00:54:03-05:00,RATIS-950. Handle Exceptions appropriately (#115),https://github.com/apache/ratis/commit/43a042a...
0,ratis,RATIS-948,a2f3895396a81ee6e31d6fd1a8a6c8a7bf121dd6,2020-06-02 13:57:09+02:00,RATIS-948. Update Sonar statistics only from t...,https://github.com/apache/ratis/commit/a2f3895...


### Development of code size since first application of SonarCloud

In [71]:
repo_path = os.path.join(os.environ["HOME"], "case_systems", "ratis")
ratis_start_dt = pd.to_datetime(
    start_dates_df.loc[start_dates_df["project_gh"] == "ratis", "date"].iloc[0]
)

# Net line change (insertions minus deletions) of every commit authored at or
# after the Ratis start date; the list is summed in the next cell.
# NOTE(review): there is no upper date bound, so the result depends on the
# state of the local clone.
rows = [
    commit.insertions - commit.deletions
    for commit in Repository(path_to_repo=repo_path).traverse_commits()
    if commit.author_date >= ratis_start_dt
]

In [72]:
sum(rows)

9905

In [67]:
ratis_start_dt

Timestamp('2020-05-28 10:30:20+0000', tz='UTC')

In [68]:
ref_day

Timestamp('2021-06-14 00:00:00+0000', tz='UTC')

That is, Ratis grew by 9905 lines (insertions minus deletions) between 2020-05-28 and 2021-06-14.
More lines mean more possibilities of violating SonarCloud rules. But the trends of code smells, bugs and vulnerabilities are decreasing: https://sonarcloud.io/project/activity?id=apache-ratis

Is this due to changes in the applied SonarQube quality profile (11 times changed for Java and one time changed for XML) or did you change your way of coding to respect SonarQube rules proactively?

In [77]:
start_dates_df[start_dates_df.project_gh == "ratis"]

Unnamed: 0,project_gh,date,week
16,ratis,2020-05-28 10:30:20+00:00,202021


Find all commits that mention a bug.

Since one category of SonarQube issues is called `Bugs`, we search for all commits that mention `[Bb]ug`, that were committed after the first application of SonarCloud, and for which the referenced issue was created after the first application of SonarCloud.

In [44]:
# Load the Ratis Jira export. The `created` column is converted in one explicit
# step to timezone-aware UTC timestamps; letting read_csv parse the dates did
# not work reliably and relied on the deprecated `infer_datetime_format` option.
iss_df = pd.read_csv(os.path.join(inpath, "ratis_jira.csv"))
iss_df["created"] = pd.to_datetime(iss_df["created"], utc=True)

In [70]:
# Collect the Ratis commits that
#   * mention "bug"/"Bug" in their message,
#   * were authored at or after the Ratis start date, and
#   * start with an issue key ("RATIS-<n>.") whose issue was created at or
#     after that date as well.
pattern = re.compile("[Bb]ug")
# Raw string so that `\d` and `\.` reach the regex engine unchanged.
iss_pattern = re.compile(r"(RATIS-\d+)\.")

repo_path = os.path.join(os.environ["HOME"], "case_systems", "ratis")
ratis_start_dt = pd.to_datetime(start_dates_df[start_dates_df.project_gh == "ratis"].date.iloc[0])

# Creation timestamp per issue key (first occurrence wins), built once instead
# of scanning the issue table for every commit.
created_by_key = iss_df.drop_duplicates("key").set_index("key")["created"]

rows = []
for commit in Repository(path_to_repo=repo_path).traverse_commits():
    if commit.author_date < ratis_start_dt or not pattern.search(commit.msg):
        continue

    # The issue key must be at the very beginning of the commit message.
    match = iss_pattern.match(commit.msg)
    if match is None:
        continue
    iss_key = match.group(1)

    # Commits citing a key that is missing from the Jira export are skipped
    # rather than aborting the whole traversal with an IndexError.
    iss_creation_dt = created_by_key.get(iss_key)
    if iss_creation_dt is None:
        continue

    if iss_creation_dt >= ratis_start_dt:
        rows.append((iss_key, commit.hash, commit.author_date, commit.msg))

pd.DataFrame(rows, columns=["key", "c_hash", "author_dt", "msg"])

Unnamed: 0,key,c_hash,author_dt,msg
0,RATIS-1116,af358415cc8ebc70665d73e5fc4812b7cb669c75,2020-10-29 21:12:04+08:00,RATIS-1116. Add DataStreamType. (#238)\n\n* RA...
1,RATIS-1158,32016e4a40ae44bb3c3718880327bfb085bff509,2020-11-16 10:05:27+08:00,RATIS-1158. Use the same proto for StateMachin...
2,RATIS-1200,86dd7fa68081e33428ed65dfdb613032b1378560,2020-12-04 14:24:32+08:00,RATIS-1200. Refactor LogAppender.SnapshotReque...
3,RATIS-1208,e5a052c18766293b94cc89e3a34a469193009537,2020-12-07 13:34:14+08:00,RATIS-1208. Separate LogAppender interface fro...
4,RATIS-1220,79223f89054109d2c499d81a149e60c15fc90453,2020-12-09 11:52:39+08:00,RATIS-1220. FileStore stream to send small pac...
5,RATIS-1236,17a2198b0016a66494da4344d218593ed2c4d7de,2020-12-14 07:31:53+08:00,RATIS-1236. Move out the leader only methods f...
6,RATIS-1251,f51196455679443be438a1ddb04bd6860f4ad6dc,2020-12-20 09:09:30+08:00,RATIS-1251. Move StateMachine and TransactionC...
7,RATIS-1252,cfa6c4f70e8dc296c921b7c8b33511d58270ab39,2020-12-21 07:33:48+08:00,RATIS-1252. Refactor RaftLogMetrics. (#365)\n\...
8,RATIS-1256,195c572024fc5fd88d00c3c0985c01215d2719aa,2020-12-22 10:41:24+08:00,RATIS-1256. Leader updateCommit should use the...
9,RATIS-1308,9fc6a1163b993177fd884bba4212326bb6abbf73,2021-02-22 16:13:10+01:00,RATIS-1308. Findbugs check is failing silently...


These are the ten commits... I believe none of them addresses a SonarQube Java Bug, or do they?

The last one certainly does not. It is a FindBugs configuration change and adaptation...

# Closer Analysis of Hadoop Ozone

### Median and average number of reported defects

In [73]:
# Summary statistics of the Hadoop Ozone `bugs_per_week` values in the year
# before and the year after the project's start date taken from `start_dates_df`.
# The project subset is stored (instead of being evaluated and thrown away) and
# reused for both periods.
ozone_defects = defect_df[defect_df.project_gh == "hadoop-ozone"]

start_dt = pd.to_datetime(start_dates_df[start_dates_df.project_gh == "hadoop-ozone"].date.iloc[0])
# np.datetime64 bounds are what `created_week` is compared against.
start_dt64 = np.datetime64(start_dt)
lower_dt = np.datetime64(start_dt - pd.DateOffset(years=1))
upper_dt = np.datetime64(start_dt + pd.DateOffset(years=1))

# Half-open window [lower_dt, start_dt): the year before the start date.
in_year_before = (ozone_defects.created_week >= lower_dt) & (
    ozone_defects.created_week < start_dt64
)
before = ozone_defects[in_year_before].bugs_per_week

print(f"In period {lower_dt} to {start_dt}:")
print(before.describe())

# Half-open window [start_dt, upper_dt): the year after the start date.
in_year_after = (ozone_defects.created_week >= start_dt64) & (
    ozone_defects.created_week < upper_dt
)
after = ozone_defects[in_year_after].bugs_per_week

print(f"In period {start_dt} to {upper_dt}:")
print(after.describe())

In period 2018-11-14T17:45:30.000000 to 2019-11-14 17:45:30+00:00:
count    51.000000
mean     15.058824
std       8.112735
min       3.000000
25%       9.000000
50%      15.000000
75%      20.000000
max      45.000000
Name: bugs_per_week, dtype: float64
In period 2019-11-14 17:45:30+00:00 to 2020-11-14T17:45:30.000000:
count    52.000000
mean     12.326923
std       5.752139
min       1.000000
25%       8.750000
50%      12.000000
75%      15.000000
max      26.000000
Name: bugs_per_week, dtype: float64


### Issues that refer to `[Ss]onar`

In [74]:
sdf[sdf.project == "hadoop-ozone"]

Unnamed: 0,issue_type,issue_component,creator_name,creator_display_name,reporter_name,reporter_display_name,priority,description,labels,created,resolution,updated,status,id,key,project
534,Improvement,[],adoroszlai,Attila Doroszlai,adoroszlai,Attila Doroszlai,Major,Currently _coverage_ CI check:\r\n\r\n# calcul...,['pull-request-available'],2021-02-07 15:13:35+00:00,2021-02-09 10:06:29+00:00,2021-02-09 10:14:51+00:00,Resolved,13357398,HDDS-4801,hadoop-ozone
636,Task,[],adoroszlai,Attila Doroszlai,adoroszlai,Attila Doroszlai,Major,bq. The version of Java installed in the scann...,['pull-request-available'],2021-01-14 14:25:52+00:00,2021-01-26 14:20:43+00:00,2021-01-26 14:28:50+00:00,Resolved,13352273,HDDS-4698,hadoop-ozone
747,Bug,[],adoroszlai,Attila Doroszlai,adoroszlai,Attila Doroszlai,Major,Ozone's GitHub Actions CI workflow references ...,['pull-request-available'],2020-12-13 17:34:52+00:00,2020-12-14 07:22:06+00:00,2020-12-14 10:23:41+00:00,Resolved,13345744,HDDS-4584,hadoop-ozone
805,Improvement,[],elek,Marton Elek,elek,Marton Elek,Trivial,There is an error log which can be seen freque...,['pull-request-available'],2020-11-30 14:38:32+00:00,2020-12-11 13:13:18+00:00,2020-12-11 13:13:46+00:00,Resolved,13343327,HDDS-4526,hadoop-ozone
1125,Improvement,[],elek,Marton Elek,elek,Marton Elek,Major,I would like to start a conversation about dis...,[],2020-09-03 14:23:11+00:00,2020-09-04 09:00:51+00:00,2020-09-04 09:00:56+00:00,Resolved,13325939,HDDS-4205,hadoop-ozone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2845,Improvement,[],sdeka,Supratim Deka,sdeka,Supratim Deka,Minor,Sonar issue:\r\nhttps://sonarcloud.io/project/...,"['pull-request-available', 'sonar']",2019-11-14 07:26:19+00:00,2019-11-14 16:54:35+00:00,2020-02-24 20:52:12+00:00,Resolved,13268216,HDDS-2480,hadoop-ozone
2846,Improvement,[],sdeka,Supratim Deka,sdeka,Supratim Deka,Minor,Sonar issue:\r\nhttps://sonarcloud.io/project/...,"['pull-request-available', 'sonar']",2019-11-14 07:20:21+00:00,2019-11-14 17:06:49+00:00,2020-02-24 20:51:36+00:00,Resolved,13268214,HDDS-2479,hadoop-ozone
2847,Improvement,[],sdeka,Supratim Deka,sdeka,Supratim Deka,Minor,Sonar issues :\r\nhttps://sonarcloud.io/projec...,"['pull-request-available', 'sonar']",2019-11-14 07:04:49+00:00,2019-11-14 16:51:32+00:00,2020-02-24 20:54:15+00:00,Resolved,13268209,HDDS-2478,hadoop-ozone
2852,Bug,[],avijayan,Aravindan Vijayan,avijayan,Aravindan Vijayan,Major,sonarcloud.io has flagged a number of code rel...,"['pull-request-available', 'sonar']",2019-11-13 21:57:18+00:00,2019-11-14 17:21:36+00:00,2020-02-24 20:52:43+00:00,Resolved,13268142,HDDS-2473,hadoop-ozone


### Commits that address SonarQube issues

In [75]:
# Keys of the resolved Hadoop Ozone issues that mention Sonar.
iss_keys = sdf.loc[sdf["project"] == "hadoop-ozone", "key"].tolist()

# One frame of matching commits per issue key (each call walks the repository).
ozone_cdfs = [identify_commits_for_issue(key, "hadoop-ozone") for key in iss_keys]

# Stack the per-issue frames and attach a GitHub link for every commit hash.
ozone_cdf = pd.concat(ozone_cdfs).assign(
    gh_url=lambda frame: [
        f"https://github.com/apache/ozone/commit/{sha}" for sha in frame["c_hash"]
    ]
)

In [76]:
ozone_cdf

Unnamed: 0,project,iss_key,c_hash,date,msg,gh_url
0,hadoop-ozone,HDDS-4801,19d115d997f737a1e609909ab3a843e673139789,2021-02-09 11:06:20+01:00,HDDS-4801. Skip coverage check for PRs and in ...,https://github.com/apache/ozone/commit/19d115d...
0,hadoop-ozone,HDDS-4698,49df943db15a5f06fab7c0c2030c44e31705357b,2021-01-26 15:20:32+01:00,HDDS-4698. Upgrade Java for Sonar check (#1800),https://github.com/apache/ozone/commit/49df943...
0,hadoop-ozone,HDDS-4584,e0c8556ac6b8fcb559e2620a625d6deb36b2d95f,2020-12-14 08:20:40+01:00,HDDS-4584. Coverage not updated since TLP (#1698),https://github.com/apache/ozone/commit/e0c8556...
0,hadoop-ozone,HDDS-4526,9d9db48369c075a588704d2e29bd0c5bbabb53e2,2020-12-11 14:13:05+01:00,HDDS-4526. Remove false-positive error logs fr...,https://github.com/apache/ozone/commit/9d9db48...
0,hadoop-ozone,HDDS-4205,acfef2d081a1782589287eaf27d0ed67230bb44c,2020-09-04 11:00:35+02:00,HDDS-4205. Disable coverage upload to codecov ...,https://github.com/apache/ozone/commit/acfef2d...
...,...,...,...,...,...,...
0,hadoop-ozone,HDDS-2480,49dbb188358f9ed2551f03556ceba03b96bd4a86,2019-11-14 22:11:59+05:18,HDDS-2480. Sonar : remove log spam for excepti...,https://github.com/apache/ozone/commit/49dbb18...
0,hadoop-ozone,HDDS-2479,287b32235c4feedf9c668f1cd078be931e65752d,2019-11-14 22:19:07+05:18,HDDS-2479. Sonar : replace instanceof with cat...,https://github.com/apache/ozone/commit/287b322...
0,hadoop-ozone,HDDS-2478,e350aef67eec15437032ab713ae3d882c2bff61c,2019-11-14 22:07:31+05:18,HDDS-2478. Sonar : remove temporary variable i...,https://github.com/apache/ozone/commit/e350aef...
0,hadoop-ozone,HDDS-2473,d0fd848eb9a85da9b4e93eec2ee92a8b1eb4d662,2019-11-14 09:20:54-08:00,HDDS-2473. Fix code reliability issues found b...,https://github.com/apache/ozone/commit/d0fd848...


In [78]:
start_dates_df[start_dates_df.project_gh == "hadoop-ozone"]

Unnamed: 0,project_gh,date,week
5,hadoop-ozone,2019-11-14 17:45:30+00:00,201945


In [81]:
repo_path = os.path.join(os.environ["HOME"], "case_systems", "hadoop-ozone")
ozone_start_dt = pd.to_datetime(
    start_dates_df.loc[start_dates_df["project_gh"] == "hadoop-ozone", "date"].iloc[0]
)

# Net line change (insertions minus deletions) of every commit authored at or
# after the Hadoop Ozone start date.
# NOTE(review): there is no upper date bound, so the result depends on the
# state of the local clone.
rows = [
    commit.insertions - commit.deletions
    for commit in Repository(path_to_repo=repo_path).traverse_commits()
    if commit.author_date >= ozone_start_dt
]

In [80]:
sum(rows)

458302

In [82]:
start_dates_df

Unnamed: 0,project_gh,date,week
2,daffodil,2020-02-27 17:29:14+00:00,202008
4,groovy,2020-03-22 00:59:45+00:00,202011
5,hadoop-ozone,2019-11-14 17:45:30+00:00,201945
10,karaf,2020-10-05 14:20:05+00:00,202040
16,ratis,2020-05-28 10:30:20+00:00,202021
