In [63]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [64]:
from pathlib import Path
import pandas as pd
import os
import re
import datetime

In [65]:
# Constants
ROOT_DIR = Path("..")
DATASET_FOLDER = ROOT_DIR / 'dataset' / 'java' / 'elasticsearch'
COMMITS_FILE = DATASET_FOLDER / 'commit_history.txt'
COMMIT_INFORMATION_FILE = DATASET_FOLDER / 'dataset.csv'
CK_METRICS_FOLDER = Path("D:") / 'CK'

# used in the commit information file
CAS_DELIMITER = 'CAS_DELIMITER'

# Metrics to calculate deltas for
DELTA_METRIC_COLUMNS = ["cbo", "cboModified", "fanin", "fanout", "wmc", 
    "dit", "noc", "rfc", "lcom", "lcom*", "tcc", "lcc", 
    "totalMethodsQty", "staticMethodsQty", "publicMethodsQty", "privateMethodsQty", 
    "protectedMethodsQty", "defaultMethodsQty", "visibleMethodsQty", 
    "abstractMethodsQty", "finalMethodsQty", "synchronizedMethodsQty", 
    "totalFieldsQty", "staticFieldsQty", "publicFieldsQty", "privateFieldsQty", 
    "protectedFieldsQty", "defaultFieldsQty", "finalFieldsQty", "synchronizedFieldsQty", 
    "nosi", "loc", "returnQty", "loopQty", "comparisonsQty", "tryCatchQty", 
    "parenthesizedExpsQty", "stringLiteralsQty", "numbersQty", 
    "assignmentsQty", "mathOperationsQty", "variablesQty", "maxNestedBlocksQty", 
    "anonymousClassesQty", "innerClassesQty", "lambdasQty"]

CURRENT_STATE_ONLY_COLUMNS = [
    "uniqueWordsQty", 
    "modifiers", 
    "logStatementsQty"
]

COMMIT_INFO_COLUMNS = [
    "fix",               # Boolean
    "classification",    # Categorical
    "linked",            # Boolean
    "contains_bug",      # Boolean
    "entrophy",          # Numeric
    "la",                # Lines Added
    "ld",                # Lines Deleted
    "fileschanged",      # Count of files changed
    "ndev",              # Number of developers
    "age",               # Average file age
    "exp",               # Developer experience
    "rexp",              # Recent experience
    "sexp",              # Subsystem experience
    "glm_probability"    # Numeric
]

# Updated COMMIT_INFO_EXTENDED_COLUMNS with vectorized lambdas
COMMIT_INFO_EXTENDED_COLUMNS = {
    "time_of_day": lambda df: df["author_date_unix_timestamp"].dt.hour,
    "day_of_week": lambda df: df["author_date_unix_timestamp"].dt.dayofweek,
    "is_weekend": lambda df: df["author_date_unix_timestamp"].dt.dayofweek > 4,
    "net_lines_changed": lambda df: df["la"] - df["ld"],
    "absolute_lines_changed": lambda df: abs(df["la"] + df["ld"]),
    "lines_per_file": lambda df: (df["la"] + df["ld"]) / (df["fileschanged"].str.count(",") + 1),
    "author_experience": lambda df: df["exp"],
    "author_ownership": lambda df: df["ndev"],
    "changed_file_count": lambda df: df["fileschanged"].str.count(",") + 1,
    "entropy_bucket": lambda df: pd.cut(
        df["entrophy"], bins=3, labels=["low", "medium", "high"]
    )
}

In [66]:
def get_commit_parent_hash(commit_hash: str, commits_df: pd.DataFrame) -> str:

    # find the commit index in the dataframe and return previous hash
    commit_index = commits_df[commits_df['hash'] == commit_hash].index[0]
    # retunr previous hash
    parent_hash = commits_df.iloc[commit_index + 1]['hash']

    return parent_hash

In [67]:
# Precompute extended columns dynamically
def precompute_extended_columns(commit_info_df, extended_columns):
    for column, func in extended_columns.items():
        if callable(func):
            # Apply the function across the DataFrame without axis=1
            commit_info_df[column] = func(commit_info_df)
        else:
            raise ValueError(f"Function for column '{column}' must be callable")
    return commit_info_df

In [68]:
# read commit history
commits = pd.read_csv(COMMITS_FILE, header=None, names=['hash'])
commits

Unnamed: 0,hash
0,e0f4b01fb8655c82e1b62dcb07973016ed411297
1,1141edee4797f772a542528a296df2d802b50122
2,31578be7502c267e11bed31af064b857bbd2d490
3,4e5b94ae033c10921a4a3ebb7274d89420d11d6b
4,97bc2919ffb5d9e90f809a18233c49a52cf58faa
...,...
82183,bd2b0a632bfc5aabb408e7f47cfaa52a7d1b2b50
82184,78c220589e4e317523268be8401647014fbe95b4
82185,7004a9e5bae12864d2c1e05e3233183cbf2006c2
82186,b3337c312765e51cec7bde5883bbc0a08f56fb65


In [69]:
# read commit information file into df
commit_information = pd.read_csv(COMMIT_INFORMATION_FILE)

commit_information["author_date_unix_timestamp"] = pd.to_datetime(
    commit_information["author_date_unix_timestamp"], unit="s"
)

# Precompute the columns dynamically
commit_information = precompute_extended_columns(commit_information, COMMIT_INFO_EXTENDED_COLUMNS)

commit_information

Unnamed: 0,commit_hash,author_name,author_date_unix_timestamp,author_email,author_date,commit_message,fix,classification,linked,contains_bug,...,time_of_day,day_of_week,is_weekend,net_lines_changed,absolute_lines_changed,lines_per_file,author_experience,author_ownership,changed_file_count,entropy_bucket
0,4b868b0e11f4a59d608523c57d1a62b870ac8e0e,Niels Bauman,2024-12-09 16:57:40,33722607+nielsbauman@users.noreply.github.com,Mon Dec 9 17:57:40 2024 +0100,Fix enrich cache size setting name (#117575)Th...,True,Corrective,True,False,...,16,0,False,120.0,126.0,42.000000,392.0,592.0,3,low
1,5e859d9301ffe736548dfc2b6e72807a7f9006ff,Benjamin Trent,2024-12-09 16:06:27,ben.w.trent@gmail.com,Mon Dec 9 11:06:27 2024 -0500,Even better(er) binary quantization (#117994)T...,False,Feature Addition,False,False,...,16,0,False,3364.0,3638.0,113.687500,8249.5,632.0,32,low
2,0586cbfb34c7201a996578db60d12fea8594261c,David Turner,2024-12-09 15:46:22,david.turner@elastic.co,Mon Dec 9 15:46:22 2024 +0000,Remove unused `BlobStore#deleteBlobsIgnoringIf...,False,,False,False,...,15,0,False,-118.0,152.0,7.238095,15618.0,631.0,21,low
3,22a392f1b69224aac3894dd6a05b892ccbb6a75d,Ryan Ernst,2024-12-09 15:33:11,ryan@iernst.net,Mon Dec 9 07:33:11 2024 -0800,Remove client.type setting (#118192)The client...,False,,False,False,...,15,0,False,-1.0,23.0,7.666667,43014.0,621.0,3,low
4,2ecf981f24ca5d480466a4fcc669aeb52d063657,David Kyle,2024-12-09 15:22:48,david.kyle@elastic.co,Mon Dec 9 15:22:48 2024 +0000,[ML] Refactor the Chunker classes to return of...,False,,False,False,...,15,0,False,43.0,195.0,27.857143,5870.0,622.0,7,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45602,ade36f026b0a676cc879992636856cd5f69430c1,kimchy,2010-02-11 20:34:11,kimchy@gmail.com,Thu Feb 11 22:34:11 2010 +0200,TransportClient to automatically retry another...,True,Corrective,True,False,...,20,3,False,269.0,553.0,110.600000,1381.0,1.0,5,low
45603,847db717c66509a817e1b965226ee1e44c08918d,kimchy,2010-02-11 17:29:25,kimchy@gmail.com,Thu Feb 11 19:29:25 2010 +0200,Transport: Support local (JVM level) transport...,False,,False,False,...,17,3,False,661.0,685.0,57.083333,1372.5,1.0,12,low
45604,b61964a2b8a4f6465928efcb5b1b434dbbb6b1a5,kimchy,2010-02-10 22:12:32,kimchy@gmail.com,Thu Feb 11 00:12:32 2010 +0200,Discovery: Support local (JVM level) discovery...,False,,False,False,...,22,2,False,381.0,391.0,48.875000,1362.5,1.0,8,low
45605,bd2b0a632bfc5aabb408e7f47cfaa52a7d1b2b50,kimchy,2010-02-09 20:18:46,kimchy@gmail.com,Tue Feb 9 22:18:46 2010 +0200,"Support terms filter, closes #1",False,,False,False,...,20,1,False,245.0,245.0,40.833333,1355.5,1.0,6,low


In [70]:
def get_CK_metrics(commit_hash: str, ck_metrics_folder: Path, changed_files: str) -> pd.DataFrame:

    # get the CK metrics for the commit
    ck_file = ck_metrics_folder / f'{commit_hash}.csv'
    ck_metrics = pd.read_csv(ck_file)

    # regex pattern for the absolute path prefix
    regex_pattern = r'/home/buzluca/JIT-SDP/repositories/elasticsearch(_worker_)?[0-9]*/'

    # Remove the matching prefix using regex
    ck_metrics['file'] = ck_metrics['file'].apply(
        lambda x: re.sub(regex_pattern, '', x)
    )

    ck_metrics['file'] = ck_metrics['file'].apply(lambda x: x.replace('\\', '/'))

    #print(ck_metrics['file'])

    # filter the CK metrics for the changed files
    ck_metrics = ck_metrics[ck_metrics['file'].isin(changed_files)]
    

    return ck_metrics

In [71]:
#df = get_CK_metrics("7c048973928e7074daa34172f9eb93701cb2eba9", CK_METRICS_FOLDER, ['elasticsearch/src/main/java/org/elasticsearch/xpack/watcher/condition/ScriptCondition.java', 'elasticsearch/src/main/java/org/elasticsearch/xpack/watcher/transform/script/ExecutableScriptTransform.java', 'elasticsearch/src/main/java/org/elasticsearch/xpack/watcher/transform/script/ScriptTransformFactory.java', 'elasticsearch/src/test/java/org/elasticsearch/xpack/watcher/transform/script/ScriptTransformTests.java', 'qa/smoke-test-watcher-with-painless/src/test/resources/rest-api-spec/test/watcher_painless/50_update_scripts.yaml'])
#df

In [72]:
def calculate_change_in_metrics(file: str, ck_metrics: pd.DataFrame, parent_ck_metrics: pd.DataFrame) -> pd.DataFrame:
    if ck_metrics.empty or parent_ck_metrics.empty:
        return pd.DataFrame()  # Return an empty DataFrame if data is missing
    
    # Initialize an empty DataFrame for the result
    result = pd.DataFrame(columns=["d_" + x for x in DELTA_METRIC_COLUMNS] + DELTA_METRIC_COLUMNS + CURRENT_STATE_ONLY_COLUMNS + COMMIT_INFO_COLUMNS + list(COMMIT_INFO_EXTENDED_COLUMNS.keys()))

    # Get the file row from CK metrics
    file_ck_metrics = ck_metrics[ck_metrics['file'] == file]
    file_parent_ck_metrics = parent_ck_metrics[parent_ck_metrics['file'] == file]

    if len(file_ck_metrics) == 0 or len(file_parent_ck_metrics) == 0:
        # Return an empty DataFrame if the file is not found in either commit
        # Usualy json, xml, etc. files are not found in CK metrics
        return pd.DataFrame()
        

    # Calculate deltas for DELTA_METRIC_COLUMNS
    for column in DELTA_METRIC_COLUMNS:
        result[f"d_{column}"] = file_ck_metrics[column].values - file_parent_ck_metrics[column].values
        result[column] = file_ck_metrics[column].values

    # Copy CURRENT_STATE_ONLY_COLUMNS directly
    for column in CURRENT_STATE_ONLY_COLUMNS:
        result[column] = file_ck_metrics[column].values

    # Copy COMMIT_INFO_COLUMNS directly from commit_information
    for column in COMMIT_INFO_COLUMNS:
        result[column] = commit_information[column]

    # Apply COMMIT_INFO_EXTENDED_COLUMNS functions row-wise
    for column, func in COMMIT_INFO_EXTENDED_COLUMNS.items():
        result[column] = commit_information[column]

    return result
    

In [73]:

def main():
    # for each commit in commit_information, get the commit hash and changed files
    # then get the CK metrics for the commit and it's parent
    # calculate the change in CK metrics in a seperate function.
    # create a sepeare entry for every changed file in the commit
    # create a new dataframe with the change in CK metrics and the commit hash and the changed file and commit information.
    # save the new dataframe to a csv file

    merged_df = pd.DataFrame(columns= ["hash"] + ["d_" + x for x in DELTA_METRIC_COLUMNS] + DELTA_METRIC_COLUMNS + CURRENT_STATE_ONLY_COLUMNS + COMMIT_INFO_COLUMNS + list(COMMIT_INFO_EXTENDED_COLUMNS.keys()))
    count = 0
    results = []
    for idx, row in commit_information.iterrows():

        commit_hash = row['commit_hash']

        # yeah, its hacky. I know.
        changed_files = row['fileschanged'].strip("[]'").replace("'", "").split(", ")
        
        for changed_file in changed_files:
            # Get metrics for current commit and file
            try:
                ck_metrics = get_CK_metrics(commit_hash, CK_METRICS_FOLDER, changed_files)
            except:
                #print("No commits")
                continue
            # Get parent commit metrics
            parent_hash = get_commit_parent_hash(commit_hash, commits)

            try:
                parent_ck_metrics = get_CK_metrics(parent_hash, CK_METRICS_FOLDER, changed_files)
            except:
                #print("No parent commits")
                continue

            
            if not ck_metrics.empty and not parent_ck_metrics.empty:
                # Calculate changes
                change_in_metrics = calculate_change_in_metrics(changed_file, ck_metrics, parent_ck_metrics)

                change_in_metrics = change_in_metrics.assign(
                    hash=commit_hash,
                    file=changed_file
                )
                
                results.append(change_in_metrics)
                count += 1
                print(count)
                if(count > 10):
                    print(len(results))

                    merged_df = pd.concat(results, ignore_index=True)
                    merged_df.to_csv("merged_df.csv", index=False)
                    print("Results saved to 'merged_df.csv'")
                    print("Close the program buddy...")
                    return

    # Concatenate all collected results into a single DataFrame at the end
    if results:
        merged_df = pd.concat(results, ignore_index=True)
        merged_df.to_csv("merged_df.csv", index=False)
        print("Results saved to 'merged_df.csv'")
    else:
        print("No results to save.")







In [74]:
main()

1
2
3
4
5
6
7
8
9
10
11
11
Results saved to 'merged_df.csv'
Close the program buddy...


In [57]:
%lprun -f main main()

1
2
3
4
5
6
7
8
9
10
11
Results saved to 'merged_df.csv'
Close the program buddy...


Timer unit: 1e-07 s

Total time: 18.6265 s
File: C:\Users\nipyh\AppData\Local\Temp\ipykernel_24540\4048654853.py
Function: main at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def main():
     2                                               # for each commit in commit_information, get the commit hash and changed files
     3                                               # then get the CK metrics for the commit and it's parent
     4                                               # calculate the change in CK metrics in a seperate function.
     5                                               # create a sepeare entry for every changed file in the commit
     6                                               # create a new dataframe with the change in CK metrics and the commit hash and the changed file and commit information.
     7                                               # save the new dataframe to a csv file
 

In [56]:
%lprun -f calculate_change_in_metrics main()

1
2
3
4
5
6
7
8
9
10
11
Results saved to 'merged_df.csv'
Close the program buddy...


Timer unit: 1e-07 s

Total time: 0.46972 s
File: C:\Users\nipyh\AppData\Local\Temp\ipykernel_24540\4286675479.py
Function: calculate_change_in_metrics at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def calculate_change_in_metrics(file: str, ck_metrics: pd.DataFrame, parent_ck_metrics: pd.DataFrame) -> pd.DataFrame:
     2                                               # Initialize an empty DataFrame for the result
     3        11     984494.0  89499.5     21.0      result = pd.DataFrame(columns=DELTA_METRIC_COLUMNS + CURRENT_STATE_ONLY_COLUMNS + COMMIT_INFO_COLUMNS + list(COMMIT_INFO_EXTENDED_COLUMNS.keys()))
     4                                           
     5                                               # Get the file row from CK metrics
     6        11      97751.0   8886.5      2.1      file_ck_metrics = ck_metrics[ck_metrics['file'] == file]
     7        11      84654.0   7695.8      1.8      file_p