In [1]:
from pathlib import Path
import pandas as pd
import os
import re
import datetime

In [2]:
# Constants
ROOT_DIR = Path("..")
DATASET_FOLDER = ROOT_DIR / 'dataset' / 'java' / 'elasticsearch'
COMMITS_FILE = DATASET_FOLDER / 'commit_history.txt'
COMMIT_INFORMATION_FILE = DATASET_FOLDER / 'dataset.csv'
CK_METRICS_FOLDER = Path("D:") / 'CK'

# used in the commit information file
CAS_DELIMITER = 'CAS_DELIMITER'

# Metrics to calculate deltas for
DELTA_METRIC_COLUMNS = ["cbo", "cboModified", "fanin", "fanout", "wmc", 
    "dit", "noc", "rfc", "lcom", "lcom*", "tcc", "lcc", 
    "totalMethodsQty", "staticMethodsQty", "publicMethodsQty", "privateMethodsQty", 
    "protectedMethodsQty", "defaultMethodsQty", "visibleMethodsQty", 
    "abstractMethodsQty", "finalMethodsQty", "synchronizedMethodsQty", 
    "totalFieldsQty", "staticFieldsQty", "publicFieldsQty", "privateFieldsQty", 
    "protectedFieldsQty", "defaultFieldsQty", "finalFieldsQty", "synchronizedFieldsQty", 
    "nosi", "loc", "returnQty", "loopQty", "comparisonsQty", "tryCatchQty", 
    "parenthesizedExpsQty", "stringLiteralsQty", "numbersQty", 
    "assignmentsQty", "mathOperationsQty", "variablesQty", "maxNestedBlocksQty", 
    "anonymousClassesQty", "innerClassesQty", "lambdasQty"]

CURRENT_STATE_ONLY_COLUMNS = [
    "uniqueWordsQty", 
    "modifiers", 
    "logStatementsQty"
]

COMMIT_INFO_COLUMNS = [
    "fix",               # Boolean
    "classification",    # Categorical
    "linked",            # Boolean
    "contains_bug",      # Boolean
    "entrophy",          # Numeric
    "la",                # Lines Added
    "ld",                # Lines Deleted
    "fileschanged",      # Count of files changed
    "ndev",              # Number of developers
    "age",               # Average file age
    "exp",               # Developer experience
    "rexp",              # Recent experience
    "sexp",              # Subsystem experience
    "glm_probability"    # Numeric
]

# Updated extended columns with proper handling
COMMIT_INFO_EXTENDED_COLUMNS = {
    "time_of_day": lambda row: row["author_date_unix_timestamp"].hour,
    "day_of_week": lambda row: row["author_date_unix_timestamp"].dayofweek,
    "is_weekend": lambda row: row["author_date_unix_timestamp"].dayofweek > 4,
    "net_lines_changed": lambda row: float(row["la"]) - float(row["ld"]),
    "absolute_lines_changed": lambda row: abs(float(row["la"]) + float(row["ld"])),
    "lines_per_file": lambda row: (float(row["la"]) + float(row["ld"])) / float(len(row["fileschanged"].split(","))),
    "author_experience": lambda row: float(row["exp"]),
    "author_ownership": lambda row: float(row["ndev"]),
    "changed_file_count": lambda row: float(len(row["fileschanged"].split(","))),
    "entropy_bucket": lambda row: pd.cut(
        pd.to_numeric([row["entrophy"]]), bins=3, labels=["low", "medium", "high"]
    )[0],
}

In [3]:
def get_commit_parent_hash(commit_hash: str, commits_df: pd.DataFrame) -> str:

    # find the commit index in the dataframe and return previous hash
    commit_index = commits_df[commits_df['hash'] == commit_hash].index[0]
    # retunr previous hash
    parent_hash = commits_df.iloc[commit_index + 1]['hash']

    return parent_hash

In [4]:
# read commit history
commits = pd.read_csv(COMMITS_FILE, header=None, names=['hash'])
commits

Unnamed: 0,hash
0,e0f4b01fb8655c82e1b62dcb07973016ed411297
1,1141edee4797f772a542528a296df2d802b50122
2,31578be7502c267e11bed31af064b857bbd2d490
3,4e5b94ae033c10921a4a3ebb7274d89420d11d6b
4,97bc2919ffb5d9e90f809a18233c49a52cf58faa
...,...
82183,bd2b0a632bfc5aabb408e7f47cfaa52a7d1b2b50
82184,78c220589e4e317523268be8401647014fbe95b4
82185,7004a9e5bae12864d2c1e05e3233183cbf2006c2
82186,b3337c312765e51cec7bde5883bbc0a08f56fb65


In [5]:
# read commit information file into df
commit_information = pd.read_csv(COMMIT_INFORMATION_FILE)

commit_information["author_date_unix_timestamp"] = pd.to_datetime(
    commit_information["author_date_unix_timestamp"], unit="s"
)

commit_information

Unnamed: 0,commit_hash,author_name,author_date_unix_timestamp,author_email,author_date,commit_message,fix,classification,linked,contains_bug,...,fileschanged,lt,ndev,age,nuc,exp,rexp,sexp,glm_probability,repository_id
0,4b868b0e11f4a59d608523c57d1a62b870ac8e0e,Niels Bauman,2024-12-09 16:57:40,33722607+nielsbauman@users.noreply.github.com,Mon Dec 9 17:57:40 2024 +0100,Fix enrich cache size setting name (#117575)Th...,True,Corrective,True,False,...,"['docs/changelog/117575.yaml', 'x-pack/plugin/...",109.666667,592.0,36.132118,66.0,392.0,0.672817,49.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
1,5e859d9301ffe736548dfc2b6e72807a7f9006ff,Benjamin Trent,2024-12-09 16:06:27,ben.w.trent@gmail.com,Mon Dec 9 11:06:27 2024 -0500,Even better(er) binary quantization (#117994)T...,False,Feature Addition,False,False,...,"['docs/changelog/117994.yaml', 'rest-api-spec/...",147.437500,632.0,10.090248,463.0,8249.5,0.978695,951.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
2,0586cbfb34c7201a996578db60d12fea8594261c,David Turner,2024-12-09 15:46:22,david.turner@elastic.co,Mon Dec 9 15:46:22 2024 +0000,Remove unused `BlobStore#deleteBlobsIgnoringIf...,False,,False,False,...,['modules/repository-azure/src/main/java/org/e...,154.857143,631.0,58.083231,351.0,15618.0,1.012307,4363.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
3,22a392f1b69224aac3894dd6a05b892ccbb6a75d,Ryan Ernst,2024-12-09 15:33:11,ryan@iernst.net,Mon Dec 9 07:33:11 2024 -0800,Remove client.type setting (#118192)The client...,False,,False,False,...,"['docs/changelog/118192.yaml', 'server/src/mai...",64.666667,621.0,39.898530,221.0,43014.0,0.673292,6552.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
4,2ecf981f24ca5d480466a4fcc669aeb52d063657,David Kyle,2024-12-09 15:22:48,david.kyle@elastic.co,Mon Dec 9 15:22:48 2024 +0000,[ML] Refactor the Chunker classes to return of...,False,,False,False,...,['x-pack/plugin/inference/src/main/java/org/el...,127.571429,622.0,55.154269,18.0,5870.0,1.005604,3260.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45602,ade36f026b0a676cc879992636856cd5f69430c1,kimchy,2010-02-11 20:34:11,kimchy@gmail.com,Thu Feb 11 22:34:11 2010 +0200,TransportClient to automatically retry another...,True,Corrective,True,False,...,['modules/elasticsearch/src/main/java/org/elas...,173.800000,1.0,3.294502,5.0,1381.0,1.138615,1330.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
45603,847db717c66509a817e1b965226ee1e44c08918d,kimchy,2010-02-11 17:29:25,kimchy@gmail.com,Thu Feb 11 19:29:25 2010 +0200,Transport: Support local (JVM level) transport...,False,,False,False,...,['modules/elasticsearch/src/main/java/org/elas...,56.000000,1.0,1.189296,7.0,1372.5,1.280070,1325.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
45604,b61964a2b8a4f6465928efcb5b1b434dbbb6b1a5,kimchy,2010-02-10 22:12:32,kimchy@gmail.com,Thu Feb 11 00:12:32 2010 +0200,Discovery: Support local (JVM level) discovery...,False,,False,False,...,['.idea/runConfigurations/Elastic_Search_Tests...,26.375000,1.0,0.886050,3.0,1362.5,1.042527,1313.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94
45605,bd2b0a632bfc5aabb408e7f47cfaa52a7d1b2b50,kimchy,2010-02-09 20:18:46,kimchy@gmail.com,Tue Feb 9 22:18:46 2010 +0200,"Support terms filter, closes #1",False,,False,False,...,['modules/elasticsearch/src/main/java/org/elas...,118.500000,1.0,0.641898,3.0,1355.5,1.411107,1306.0,0.0,67774f36-b659-11ef-b1cd-94e70b34ea94


In [6]:
def get_CK_metrics(commit_hash: str, ck_metrics_folder: Path, changed_file: str) -> pd.DataFrame:

    # get the CK metrics for the commit
    ck_file = ck_metrics_folder / f'{commit_hash}.csv'
    ck_metrics = pd.read_csv(ck_file)

    # regex pattern for the absolute path prefix
    regex_pattern = r'/home/buzluca/JIT-SDP/repositories/elasticsearch(_worker_)?[0-9]*/'

    # Remove the matching prefix using regex
    ck_metrics['file'] = ck_metrics['file'].apply(
        lambda x: re.sub(regex_pattern, '', x)
    )

    ck_metrics['file'] = ck_metrics['file'].apply(lambda x: x.replace('\\', '/'))

    print(ck_metrics['file'])

    # filter the CK metrics for the changed files
    ck_metrics = ck_metrics[ck_metrics['file'].isin(changed_file)]
    

    return ck_metrics

In [7]:
#df = get_CK_metrics("7c048973928e7074daa34172f9eb93701cb2eba9", CK_METRICS_FOLDER, ['elasticsearch/src/main/java/org/elasticsearch/xpack/watcher/condition/ScriptCondition.java', 'elasticsearch/src/main/java/org/elasticsearch/xpack/watcher/transform/script/ExecutableScriptTransform.java', 'elasticsearch/src/main/java/org/elasticsearch/xpack/watcher/transform/script/ScriptTransformFactory.java', 'elasticsearch/src/test/java/org/elasticsearch/xpack/watcher/transform/script/ScriptTransformTests.java', 'qa/smoke-test-watcher-with-painless/src/test/resources/rest-api-spec/test/watcher_painless/50_update_scripts.yaml'])
#df

In [8]:
# for each commit in commit_information, get the commit hash and changed files
# then get the CK metrics for the commit and it's parent
# calculate the change in CK metrics in a seperate function.
# create a sepeare entry for every changed file in the commit
# create a new dataframe with the change in CK metrics and the commit hash and the changed file and commit information.
# save the new dataframe to a csv file

def calculate_change_in_metrics(file: str, ck_metrics: pd.DataFrame, parent_ck_metrics: pd.DataFrame) -> pd.DataFrame:
    # Initialize an empty DataFrame for the result
    result = pd.DataFrame(columns=DELTA_METRIC_COLUMNS + CURRENT_STATE_ONLY_COLUMNS + COMMIT_INFO_COLUMNS + list(COMMIT_INFO_EXTENDED_COLUMNS.keys()))

    # Get the file row from CK metrics
    file_ck_metrics = ck_metrics[ck_metrics['file'] == file]
    file_parent_ck_metrics = parent_ck_metrics[parent_ck_metrics['file'] == file]

    # Calculate deltas for DELTA_METRIC_COLUMNS
    for column in DELTA_METRIC_COLUMNS:
        result[column] = file_ck_metrics[column].values - file_parent_ck_metrics[column].values

    # Copy CURRENT_STATE_ONLY_COLUMNS directly
    for column in CURRENT_STATE_ONLY_COLUMNS:
        result[column] = file_ck_metrics[column].values

    # Copy COMMIT_INFO_COLUMNS directly from `commit_information`
    commit_info_row = commit_information.loc[commit_information['commit_hash'] == file_ck_metrics['commit'].values[0]]
    if not commit_info_row.empty:
        for column in COMMIT_INFO_COLUMNS:
            result[column] = commit_info_row.iloc[0][column]

    # Apply COMMIT_INFO_EXTENDED_COLUMNS functions row-wise
    for column, func in COMMIT_INFO_EXTENDED_COLUMNS.items():
        result[column] = commit_information.apply(func, axis=1)

    return result



merged_df = pd.DataFrame(columns= ["hash"] + DELTA_METRIC_COLUMNS + CURRENT_STATE_ONLY_COLUMNS + COMMIT_INFO_COLUMNS + list(COMMIT_INFO_EXTENDED_COLUMNS.keys()))
count = 0
for idx, row in commit_information.iterrows():

    #print(row['fileschanged'])
    commit_hash = row['commit_hash']
    #test
    #commit_hash = "b63b50b945d4c44c70d56a9ce2ba9a25dd897147"
    #test
    # yeah, its hacky. I know.
    changed_files = row['fileschanged'].strip("[]'").replace("'", "").split(", ")
    
    for changed_file in changed_files:
        # Get metrics for current commit and file
        try:
            ck_metrics = get_CK_metrics(commit_hash, CK_METRICS_FOLDER, [changed_file])
        except:
            #print("No commits")
            continue
        # Get parent commit metrics
        parent_hash = get_commit_parent_hash(commit_hash, commits)

        try:
            parent_ck_metrics = get_CK_metrics(parent_hash, CK_METRICS_FOLDER, [changed_file])
        except:
            #print("No parent commits")
            continue

        #print(ck_metrics)
        #print(parent_ck_metrics)
        
        if not ck_metrics.empty and not parent_ck_metrics.empty:
            # Calculate changes
            change_in_metrics = calculate_change_in_metrics(changed_file, ck_metrics, parent_ck_metrics)
            # Add commit hash and file path
            change_in_metrics['hash'] = commit_hash
            change_in_metrics['file'] = changed_file
            
            merged_df = pd.concat([merged_df, change_in_metrics])
            count += 1
            if(count > 150):
                print(merged_df)
                merged_df.to_csv("merged_df.csv")
                exit()







0        server/src/main/java/org/elasticsearch/action/...
1        modules/ingest-common/src/test/java/org/elasti...
2        server/src/test/java/org/elasticsearch/search/...
3        x-pack/plugin/security/src/test/java/org/elast...
4        x-pack/plugin/esql/src/main/generated/org/elas...
                               ...                        
36630    x-pack/plugin/esql/src/main/generated/org/elas...
36631    x-pack/plugin/esql/src/main/generated/org/elas...
36632    x-pack/plugin/core/src/test/java/org/elasticse...
36633    x-pack/plugin/searchable-snapshots/qa/minio/sr...
36634    server/src/main/java/org/elasticsearch/search/...
Name: file, Length: 36635, dtype: object
0        server/src/main/java/org/elasticsearch/action/...
1        modules/ingest-common/src/test/java/org/elasti...
2        server/src/test/java/org/elasticsearch/search/...
3        x-pack/plugin/security/src/test/java/org/elast...
4        x-pack/plugin/esql/src/main/generated/org/elas...
               

  merged_df = pd.concat([merged_df, change_in_metrics])


0        server/src/main/java/org/elasticsearch/action/...
1        modules/ingest-common/src/test/java/org/elasti...
2        server/src/test/java/org/elasticsearch/search/...
3        x-pack/plugin/security/src/test/java/org/elast...
4        x-pack/plugin/esql/src/main/generated/org/elas...
                               ...                        
36630    x-pack/plugin/esql/src/main/generated/org/elas...
36631    x-pack/plugin/esql/src/main/generated/org/elas...
36632    x-pack/plugin/core/src/test/java/org/elasticse...
36633    x-pack/plugin/searchable-snapshots/qa/minio/sr...
36634    server/src/main/java/org/elasticsearch/search/...
Name: file, Length: 36635, dtype: object
0        server/src/main/java/org/elasticsearch/action/...
1        modules/ingest-common/src/test/java/org/elasti...
2        server/src/test/java/org/elasticsearch/search/...
3        x-pack/plugin/security/src/test/java/org/elast...
4        x-pack/plugin/esql/src/main/generated/org/elas...
               

ValueError: Length of values (1) does not match length of index (2)