In [1]:
#%load_ext line_profiler

In [2]:
# Disable pandas future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
from pathlib import Path
import pandas as pd
import os
import re
import datetime

import pickle

In [4]:
# Constants
# All dataset paths are resolved relative to the parent of the notebook's
# working directory.
ROOT_DIR = Path("..")
DATASET_FOLDER = ROOT_DIR / 'dataset' / 'java' / 'broadleaf'
COMMITS_FILE = DATASET_FOLDER / 'commit_history.txt'
COMMIT_INFORMATION_FILE = DATASET_FOLDER / 'dataset.csv'

# Folder that holds one CK output CSV per commit (file name = commit hash).
# The previous `Path("D:") / 'CK'` produced the drive-relative path `D:CK` on
# Windows, which is resolved against the current directory of drive D: rather
# than the drive root. "D:/CK" anchors it at the root; the location can also be
# overridden through the CK_METRICS_FOLDER environment variable so the notebook
# is not tied to one machine's drive layout.
CK_METRICS_FOLDER = Path(os.environ.get("CK_METRICS_FOLDER", "D:/CK"))

# used in the commit information file
CAS_DELIMITER = 'CAS_DELIMITER'


##############
# CK Metrics #
##############

# CK metric columns for which a per-class delta (current commit minus parent
# commit) is computed. The current value of each column is kept as well.
DELTA_METRIC_COLUMNS = [
    # class-level metrics
    "cbo", "cboModified", "fanin", "fanout", "wmc", "dit", "noc", "rfc",
    "lcom", "lcom*", "tcc", "lcc",
    # method counts
    "totalMethodsQty", "staticMethodsQty", "publicMethodsQty",
    "privateMethodsQty", "protectedMethodsQty", "defaultMethodsQty",
    "visibleMethodsQty", "abstractMethodsQty", "finalMethodsQty",
    "synchronizedMethodsQty",
    # field counts
    "totalFieldsQty", "staticFieldsQty", "publicFieldsQty",
    "privateFieldsQty", "protectedFieldsQty", "defaultFieldsQty",
    "finalFieldsQty", "synchronizedFieldsQty",
    # statement / expression counts
    "nosi", "loc", "returnQty", "loopQty", "comparisonsQty", "tryCatchQty",
    "parenthesizedExpsQty", "stringLiteralsQty", "numbersQty",
    "assignmentsQty", "mathOperationsQty", "variablesQty",
    "maxNestedBlocksQty", "anonymousClassesQty", "innerClassesQty",
    "lambdasQty",
    # remaining columns
    "uniqueWordsQty", "modifiers", "logStatementsQty",
]

# Columns copied from the current commit only (no delta is computed for them).
CURRENT_STATE_ONLY_COLUMNS = ["class", "type"]

#######################
# Commit Guru Metrics #
#######################

# Columns taken verbatim from the commit information file for every commit.
COMMIT_INFO_COLUMNS = [
    "fix",                         # Boolean
    "classification",              # Categorical
    # "linked",                    # Boolean (commit guru related) - disabled
    "contains_bug",                # Boolean, the truth value
    "entrophy",                    # Numeric
    "la",                          # Lines added
    "ld",                          # Lines deleted
    # "fileschanged",              # Count of files changed - disabled
    "ndev",                        # Number of developers
    "age",                         # Average file age
    "exp",                         # Developer experience
    "rexp",                        # Recent experience
    "sexp",                        # Subsystem experience
    "glm_probability",             # Numeric; all 0, not useful
    "author_date_unix_timestamp",  # Date of commit
]

# Derived commit-level features. Every value is a vectorized function that
# receives the whole commit-information DataFrame and returns one new column.
# "author_date_unix_timestamp" must already be a datetime column because the
# time features use the `.dt` accessor.
def _count_changed_files(df):
    """Return the number of commas in the "fileschanged" string, plus one."""
    return df["fileschanged"].str.count(",") + 1


def _lines_touched(df):
    """Return lines added plus lines deleted."""
    return df["la"] + df["ld"]


COMMIT_INFO_EXTENDED_COLUMNS = {
    "time_of_day": lambda df: df["author_date_unix_timestamp"].dt.hour,
    "day_of_week": lambda df: df["author_date_unix_timestamp"].dt.dayofweek,
    # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend
    "is_weekend": lambda df: df["author_date_unix_timestamp"].dt.dayofweek >= 5,
    "net_lines_changed": lambda df: df["la"] - df["ld"],
    "absolute_lines_changed": lambda df: _lines_touched(df).abs(),
    "lines_per_file": lambda df: _lines_touched(df) / _count_changed_files(df),
    "author_experience": lambda df: df["exp"],
    "author_ownership": lambda df: df["ndev"],
    "changed_file_count": _count_changed_files,
    # three equal-width buckets over the observed range of the column
    "entropy_bucket": lambda df: pd.cut(
        df["entrophy"], bins=3, labels=["low", "medium", "high"]
    ),
    # same values as "changed_file_count"; both names are emitted
    "num_files_changed": _count_changed_files,
}


In [5]:
def get_commit_parent_hash(commit_hash: str, commits_df: pd.DataFrame) -> "str | None":
    """Return the hash stored in the row right after `commit_hash`.

    The row following a commit in `commits_df` is treated as its parent.
    NOTE(review): this assumes the history file is ordered newest-to-oldest and
    linear; for merge commits the next row is not necessarily a real parent.

    Args:
        commit_hash: hash to look up in the "hash" column.
        commits_df: DataFrame with a "hash" column, one commit per row.

    Returns:
        The hash of the following row, or None when `commit_hash` is not
        present or is the last row.
    """
    hashes = commits_df['hash']

    # Work with row positions, not index labels, so the lookup is also correct
    # for a DataFrame whose index is not the default 0..n-1 range.
    match_positions = (hashes == commit_hash).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        # Unknown commit: report "no parent" instead of raising IndexError.
        return None

    parent_position = int(match_positions[0]) + 1
    if parent_position >= len(hashes):
        # The commit is the last row, so there is nothing after it.
        return None
    return hashes.iloc[parent_position]

In [6]:
# Add every derived column described by a {column name: function} mapping
def precompute_extended_columns(commit_info_df, extended_columns):
    """Compute and attach derived columns to `commit_info_df` in place.

    Args:
        commit_info_df: DataFrame that is passed to each function and receives
            the new columns (it is modified and also returned).
        extended_columns: mapping of new column name to a callable that takes
            the whole DataFrame and returns the column values.

    Returns:
        The same DataFrame object, with the new columns added.

    Raises:
        ValueError: when a mapping value is not callable. Columns that come
            earlier in the mapping have already been added at that point.
    """
    for column_name in extended_columns:
        compute = extended_columns[column_name]
        if not callable(compute):
            raise ValueError(f"Function for column '{column_name}' must be callable")
        # The function is called once with the full frame (vectorized), not per row
        commit_info_df[column_name] = compute(commit_info_df)
    return commit_info_df

In [7]:
# Read the commit history file. It has no header row, so the single column is
# named "hash" explicitly. get_commit_parent_hash() relies on the row order of
# this frame to find the parent of a commit.
commits = pd.read_csv(COMMITS_FILE, header=None, names=['hash'])
commits

Unnamed: 0,hash
0,7b8f23ee797efd78092dffc89048dda0a4c09a98
1,916521e5543af19d06383acff1d1d2f635c5a301
2,fd1e0d8571c707aef7baf9fea03cd10e17a7fe0c
3,071f980fa00c1a64367a9e8d002398a88f18bb75
4,e56ef02e15d6645743b150f2e148c62ba85911ed
...,...
18653,044a99f0d8cfd3d11e85f45cd5bfc210cd509980
18654,59f3abc25a731e1d901416b0c06c88aa6fae1477
18655,c89a41d0b89c1cf5467b8eb5948132005b8301ee
18656,2bf902b5f0c7e6ad29a515406d92a0307b66e1aa


In [8]:
# Load the per-commit information table and convert the epoch-seconds commit
# timestamp into a pandas datetime column (the derived time features below
# need the `.dt` accessor).
commit_information = pd.read_csv(COMMIT_INFORMATION_FILE).assign(
    author_date_unix_timestamp=lambda frame: pd.to_datetime(
        frame["author_date_unix_timestamp"], unit="s"
    )
)

# Attach every derived column declared in COMMIT_INFO_EXTENDED_COLUMNS
commit_information = precompute_extended_columns(
    commit_information, COMMIT_INFO_EXTENDED_COLUMNS
)

commit_information

Unnamed: 0,commit_hash,author_name,author_date_unix_timestamp,author_email,author_date,commit_message,fix,classification,linked,contains_bug,...,day_of_week,is_weekend,net_lines_changed,absolute_lines_changed,lines_per_file,author_experience,author_ownership,changed_file_count,entropy_bucket,num_files_changed
0,e296c698b83e320ff15c4ec1ca9258e188ff867c,KatrukOV,2024-10-21 13:22:25,KatrukOV@gmail.com,Mon Oct 21 16:22:25 2024 +0300,"QA-5314: created MessageService, BLCMessageUti...",False,,False,False,...,0,False,87.0,91.0,22.750000,2818.5,53.0,4,low,4
1,9aa562f432ccfec587895b633c908eaa85f85f36,Roman Mosiienko,2024-10-08 12:39:03,roman.mosiienko@gmail.com,Tue Oct 8 15:39:03 2024 +0300,BroadleafCommerce/QA#5293 (#3026)Fixed typo in...,True,Corrective,True,False,...,1,False,0.0,2.0,2.000000,26.0,52.0,1,low,1
2,83068da492974cd27ab0fcd85700af3e369d5490,KatrukOV,2024-10-07 15:53:10,KatrukOV@gmail.com,Mon Oct 7 18:53:10 2024 +0300,QA-5309: updated hibernate version; fixed test...,False,,False,False,...,0,False,0.0,4.0,2.000000,2815.5,52.0,2,low,2
3,ed8f87a110ddcae69668aef0a4d8d4f4d292574d,KatrukOV,2024-10-02 08:36:52,KatrukOV@gmail.com,Wed Oct 2 11:36:52 2024 +0300,QA-5308: fixed validateIfProductIsProdRecord()...,False,,False,False,...,2,False,4.0,8.0,8.000000,2814.0,52.0,1,low,1
4,9ca686b7a68e010d4d22d7ffb59d078d4c138109,Roman Mosiienko,2024-08-13 14:17:20,roman.mosiienko@gmail.com,Tue Aug 13 17:17:20 2024 +0300,BroadleafCommerce/QA#5294 (#3019)Added new con...,False,Feature Addition,False,False,...,1,False,4.0,6.0,3.000000,23.5,52.0,2,low,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10141,939598ee321b80c8272358a9967ab2bb05fb0bbb,Jonathan Ball,2009-01-08 20:42:13,jball@credera.com,Thu Jan 8 20:42:13 2009 +0000,add login to query for password change require...,False,Feature Addition,False,False,...,3,False,366.0,382.0,31.833333,237.5,1.0,12,low,12
10142,539acc6e622672d2fb011702ad47bdaaf35a7c79,Jeff Fischer,2009-01-07 19:10:55,jfischer@broadleafcommerce.org,Wed Jan 7 19:10:55 2009 +0000,,False,,False,False,...,2,False,372.0,372.0,14.880000,24.0,1.0,25,low,25
10143,d6a459082f1085aeedd5895b46ca7a1fd3a54b00,Jonathan Ball,2008-12-30 21:33:13,jball@credera.com,Tue Dec 30 21:33:13 2008 +0000,add methods to lookup user,False,Feature Addition,False,False,...,1,False,13.0,13.0,3.250000,106.5,2.0,4,low,4
10144,d8a646f6819a212257741cc7071998093a34b674,Jonathan Ball,2008-12-30 19:16:59,jball@credera.com,Tue Dec 30 19:16:59 2008 +0000,add UserService and implementation,False,Feature Addition,False,False,...,1,False,44.0,52.0,10.400000,19.0,2.0,5,low,5


In [9]:
import json

# Preload changed_files for each commit

# Maps commit hash -> list of file paths changed by that commit
_changed_files_cache = {}

def preload_changed_files(commits_df: pd.DataFrame) -> int:
    """Fill the module-level changed-files cache and return its size.

    If 'changed_files_cache.pkl' exists in the working directory, the cache is
    replaced by the pickled object and `commits_df` is not read. Otherwise the
    "fileschanged" string of every row is parsed into a list of paths.

    Args:
        commits_df: DataFrame with "commit_hash" and "fileschanged" columns.
            "fileschanged" is expected to look like "['a.java', 'b.java']".

    Returns:
        Number of entries in the cache after loading.
    """
    global _changed_files_cache

    # NOTE(security): pickle.load can execute arbitrary code. Only keep a
    # pickle here that was produced by this notebook on a trusted machine.
    if os.path.exists('changed_files_cache.pkl'):
        with open('changed_files_cache.pkl', 'rb') as f:
            _changed_files_cache = pickle.load(f)
            print(len(_changed_files_cache))

        return len(_changed_files_cache)

    total = len(commits_df)
    parsed = {}
    # Walk both columns together: one pass instead of a full-frame scan for
    # every commit hash.
    row_values = zip(commits_df['commit_hash'], commits_df['fileschanged'])
    for i, (commit_hash, files_str) in enumerate(row_values):
        if i % 100 == 0:  # Update progress
            print(f"\rLoading changed files: {i}/{total}", end='', flush=True)

        if commit_hash in parsed:
            # For a repeated hash the first row wins, as in the original lookup
            continue

        if not isinstance(files_str, str):
            # Missing value (NaN) in the CSV: treat as "no files changed"
            parsed[commit_hash] = []
            continue

        entries = files_str.strip("[]'").replace("'", "").split(", ")
        # "[]" would otherwise yield [""]; drop empty entries
        parsed[commit_hash] = [entry for entry in entries if entry]

    _changed_files_cache.update(parsed)

    return len(_changed_files_cache)


def get_changed_files(commit_hash: str, commits_df: pd.DataFrame) -> list[str]:
    """Return the cached list of changed files for `commit_hash`.

    Only the module-level changed-files cache is consulted; `commits_df` is
    accepted but not read. A hash that is not in the cache yields an empty list.
    """
    return _changed_files_cache.get(commit_hash, [])

# Maps commit hash -> DataFrame with the CK metrics of every class in that commit
_ck_metrics_cache = {}

def preload_ck_metrics(ck_metrics_folder: Path) -> int:
    """Fill the module-level CK metrics cache and return its size.

    If 'ck_metrics_cache.pkl' exists in the working directory, the cache is
    replaced by the pickled object and the folder is not read. Otherwise every
    '*.csv' file in `ck_metrics_folder` is loaded; the file name without
    extension is used as the commit hash.

    The whole CSV is kept (not only the changed files) because deltas need the
    metrics of a file in both a commit and its neighbouring commit.

    Args:
        ck_metrics_folder: folder containing one CK output CSV per commit.
            Each CSV must have a "file" column.

    Returns:
        Number of entries in the cache after loading.
    """
    global _ck_metrics_cache

    # NOTE(security): pickle.load can execute arbitrary code. Only keep a
    # pickle here that was produced by this notebook on a trusted machine.
    if os.path.exists('ck_metrics_cache.pkl'):
        with open('ck_metrics_cache.pkl', 'rb') as f:
            _ck_metrics_cache = pickle.load(f)
            return len(_ck_metrics_cache)

    # Prefix of the checkout directory on the machine that produced the CSVs;
    # removing it leaves repository-relative paths.
    regex_pattern = r'/home/buzluca/JIT-SDP/repositories/broadleaf(_worker_)?[0-9]*/'

    for i, csv_file in enumerate(ck_metrics_folder.glob('*.csv')):
        if i % 100 == 0:
            print(f"\rLoading CK files: {i}", end='', flush=True)

        df = pd.read_csv(csv_file)

        # Clean paths when loading the data. The backslash replacement is an
        # explicit literal replace: the default of `regex` differs between
        # pandas versions and a lone backslash is not a valid pattern.
        df['file'] = (
            df['file']
            .str.replace(regex_pattern, '', regex=True)
            .str.replace('\\', '/', regex=False)
        )

        _ck_metrics_cache[csv_file.stem] = df

    # Nothing is written to disk here, so report what was loaded instead of
    # claiming that a JSON file was saved.
    print(f"\rLoaded CK metrics for {len(_ck_metrics_cache)} commits")

    return len(_ck_metrics_cache)

# Cached lookup of the CK metrics of one file in one commit
def get_CK_metrics(commit_hash: str, ck_metrics_folder: Path, changed_file: str) -> pd.DataFrame:
    """Return the CK metric rows of `changed_file` in commit `commit_hash`.

    The module-level CK cache is used; when it is empty it is first filled via
    preload_ck_metrics(ck_metrics_folder).

    Returns:
        - an empty DataFrame when the commit is not in the cache;
        - the rows whose "file" equals `changed_file` when there are any;
        - otherwise a single row of zeros with the same columns as the
          commit's metrics (every column is 0, including "file" and "class").
    """
    if not _ck_metrics_cache:
        preload_ck_metrics(ck_metrics_folder)

    if commit_hash not in _ck_metrics_cache:
        return pd.DataFrame()  # commit has no CK output

    commit_metrics = _ck_metrics_cache[commit_hash]
    rows_for_file = commit_metrics[commit_metrics['file'] == changed_file]
    if not rows_for_file.empty:
        return rows_for_file

    # TODO(review): the original author flagged this fallback as something to
    # think about. A file that is absent from the commit's metrics is reported
    # as one all-zero row rather than as "no data".
    return pd.DataFrame(0, index=[0], columns=commit_metrics.columns)


In [10]:
# Export the changed-files cache as JSON. On a fresh kernel this cell runs
# before the cache has been populated, so writing unconditionally would
# overwrite an existing export with "{}". Only write when there is content.
if _changed_files_cache:
    with open('changed_files_cache.json', 'w') as f:
        json.dump(_changed_files_cache, f)
else:
    print("Changed-files cache is empty; run preload_changed_files() before exporting.")

In [11]:
import pandas as pd

def calculate_change_in_metrics(
    ck_metrics: pd.DataFrame,
    parent_ck_metrics: pd.DataFrame,
    commit_information: pd.DataFrame
) -> pd.DataFrame:
    """
    Build one output row per class in `ck_metrics`, combining metric deltas,
    current metric values and commit-level information.

    Classes are matched between the two frames by their "class" value:
      - a class present in both frames gets delta = current - parent;
      - a class present only in `ck_metrics` gets delta = current value.

    Output columns, in order: "d_<metric>" for every entry of
    DELTA_METRIC_COLUMNS, the current values of DELTA_METRIC_COLUMNS,
    CURRENT_STATE_ONLY_COLUMNS, then COMMIT_INFO_COLUMNS followed by the keys
    of COMMIT_INFO_EXTENDED_COLUMNS. The index is a fresh 0..n-1 range.

    Args:
        ck_metrics: CK rows of one file in the current commit.
        parent_ck_metrics: CK rows of the same file in the parent commit.
        commit_information: commit-level rows; the first row is replicated for
            every output row. When it is empty the commit columns are filled
            with missing values.

    Returns:
        The combined DataFrame, or an empty DataFrame when either metrics
        frame is empty or lacks a "class" column.
    """

    # Without rows or without "class" in both frames nothing can be aligned
    if ck_metrics.empty or parent_ck_metrics.empty:
        return pd.DataFrame()
    if "class" not in ck_metrics.columns or "class" not in parent_ck_metrics.columns:
        return pd.DataFrame()

    # 1) Split current rows into "modified" (also in parent) and "new".
    #    drop=False keeps "class" as a column while also using it as the index.
    exists_in_parent = ck_metrics["class"].isin(set(parent_ck_metrics["class"]))
    modified_ck = ck_metrics[exists_in_parent].set_index("class", drop=False)
    new_ck = ck_metrics[~exists_in_parent].set_index("class", drop=False)
    modified_parent = parent_ck_metrics[
        parent_ck_metrics["class"].isin(set(modified_ck["class"]))
    ].set_index("class", drop=False)

    # 2) Deltas. Only non-empty parts are concatenated so that no empty
    #    object-dtype placeholder frame takes part in dtype resolution.
    delta_parts = []
    if not modified_ck.empty:
        delta_parts.append(
            modified_ck[DELTA_METRIC_COLUMNS].subtract(modified_parent[DELTA_METRIC_COLUMNS])
        )
    if not new_ck.empty:
        # No parent values exist, so the delta equals the current value
        delta_parts.append(new_ck[DELTA_METRIC_COLUMNS])
    delta_combined = pd.concat(delta_parts, axis=0)

    # 3) Current rows in the same order as the deltas. The subtraction may
    #    reorder labels when current and parent list classes differently.
    current_rows = pd.concat(
        [part for part in (modified_ck, new_ck) if not part.empty], axis=0
    )
    if not current_rows.index.equals(delta_combined.index):
        current_rows = current_rows.reindex(delta_combined.index)

    # 4) Commit info: replicate the first row once per output row. An empty
    #    input yields all-missing columns; previously that case raised a
    #    length-mismatch error when the index was assigned.
    row_count = len(delta_combined)
    commit_columns = COMMIT_INFO_COLUMNS + list(COMMIT_INFO_EXTENDED_COLUMNS.keys())
    if commit_information.empty:
        commit_rows = pd.DataFrame(index=range(row_count), columns=commit_columns)
    else:
        commit_rows = (
            commit_information[commit_columns]
            .iloc[[0] * row_count]
            .reset_index(drop=True)
        )

    # 5) Assemble column-wise by row position. Every piece gets a fresh
    #    0..n-1 index first, so repeated class labels cannot multiply rows the
    #    way a label-based join would.
    pieces = [
        delta_combined.add_prefix("d_"),
        current_rows[DELTA_METRIC_COLUMNS],
        current_rows[CURRENT_STATE_ONLY_COLUMNS],
    ]
    final_df = pd.concat(
        [piece.reset_index(drop=True) for piece in pieces] + [commit_rows],
        axis=1,
    )

    return final_df


In [12]:
import logging
from datetime import datetime

def setup_logger():
    """Return the 'commit_processor' logger, configuring it on first use.

    On the first call a timestamped log file is created under ROOT_DIR/logs
    and a file handler plus a console handler are attached, both using the
    format '%(asctime)s - %(levelname)s - %(message)s'.

    logging.getLogger returns the same logger object for the same name, so
    attaching handlers on every call made each message appear once per call
    made so far. Later calls therefore return the already configured logger
    and keep writing to the log file opened by the first call.

    Returns:
        The configured logging.Logger.
    """
    logger = logging.getLogger('commit_processor')
    logger.setLevel(logging.INFO)

    # Already configured by an earlier call: do not attach duplicate handlers
    if logger.handlers:
        return logger

    # Create logs directory if it doesn't exist
    log_dir = ROOT_DIR / 'logs'
    log_dir.mkdir(exist_ok=True)

    # Create file handler with timestamp in filename
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = log_dir / f'commit_processing_{timestamp}.log'
    file_handler = logging.FileHandler(log_file)

    # Create console handler
    console_handler = logging.StreamHandler()

    # Both handlers share one message format
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    for handler in (file_handler, console_handler):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger



In [13]:
# Fill the changed-files cache from commit_information (or from the pickle
# file, if one exists). The returned cache size is not used here.
print("Preloading changed files...")
preload_changed_files(commit_information)
print("Changed files preloaded.")

Preloading changed files...
10146
Changed files preloaded.


In [14]:
# Sanity check: number of commits held in the changed-files cache
print(len(_changed_files_cache))

10146


In [15]:
# Fill the CK metrics cache from CK_METRICS_FOLDER (or from the pickle file,
# if one exists). The returned cache size is not used here.
print("Preloading CK metrics...")
preload_ck_metrics(CK_METRICS_FOLDER)
print("CK metrics preloaded.")


Preloading CK metrics...
CK metrics preloaded.


In [16]:
# Sanity check: number of commits held in the CK metrics cache
print(len(_ck_metrics_cache))

13264


In [17]:
# Peek at the first five cache keys (the keys are commit hashes)
list(_ck_metrics_cache.keys())[:5]

['0006af0a508d85e5272619e42d84de6cd274ba65',
 '000c163f71928338e659bd059d8018f0b1b1df2a',
 '00162a14cd5f51b3ca5107972402dc8174773089',
 '001668c70ddafbd5710f8b28f8efa3c61a665b8c',
 '0016ea29ef69b8721dfd45c6e0b1647ec6f20794']

In [18]:
"""# Save caches as pickle as well
with open('changed_files_cache.pkl', 'wb') as f:
    pickle.dump(_changed_files_cache, f)

with open('ck_metrics_cache.pkl', 'wb') as f:
    pickle.dump(_ck_metrics_cache, f)"""

"# Save caches as pickle as well\nwith open('changed_files_cache.pkl', 'wb') as f:\n    pickle.dump(_changed_files_cache, f)\n\nwith open('ck_metrics_cache.pkl', 'wb') as f:\n    pickle.dump(_ck_metrics_cache, f)"

In [19]:
def is_valid_file(file: str) -> bool:
    """Decide whether a changed file should be analysed.

    A file is kept when its path ends with '.java', does not end with
    'package-info.java', and contains neither "test" nor "example" anywhere in
    the path (case-insensitive).

    NOTE(review): the substring check also rejects paths that merely contain
    those letters, e.g. "LatestPrice.java" - confirm that this is intended.
    """
    if not file.endswith('.java'):
        return False
    if file.endswith('package-info.java'):
        return False
    lowered = file.lower()
    return "test" not in lowered and "example" not in lowered

In [23]:
from concurrent.futures import ThreadPoolExecutor
import threading

def main(max_commits: "int | None" = 11):
    """Build the per-class dataset of CK metric changes and save it as CSV.

    For every processed commit in `commit_information`:
      1. take its changed files from the changed-files cache and keep those
         accepted by is_valid_file();
      2. look up the parent commit in `commits`;
      3. for each file, fetch the CK metrics of the commit and of its parent
         and compute the change with calculate_change_in_metrics();
      4. tag the resulting rows with the commit hash and file path.
    All rows are concatenated and written to 'merged_df.csv'.

    Args:
        max_commits: number of leading rows of `commit_information` to
            process. The default of 11 keeps the cap this cell was last run
            with (rows 0..10); pass None to process every commit.
    """
    # Initialize logger
    logger = setup_logger()

    # Locks guard the shared result list and the logger so that
    # process_commit can also be run from a thread pool.
    results_lock = threading.Lock()
    logger_lock = threading.Lock()

    results = []
    total_commits = len(commit_information)

    def process_commit(row_data):
        idx, row = row_data
        commit_hash = row['commit_hash']

        with logger_lock:
            logger.info(f"Processing commit {commit_hash} | {idx + 1} of {total_commits} commits | Number of merged commits: {len(results)}")

        # Changed files of this commit, restricted to analysable sources
        changed_files = [
            file
            for file in get_changed_files(commit_hash, commit_information)
            if is_valid_file(file)
        ]
        if not changed_files:
            return

        # The parent and the commit-info row depend only on the commit, so
        # they are looked up once here instead of once per changed file.
        try:
            parent_hash = get_commit_parent_hash(commit_hash, commits)
        except IndexError:
            # Commit is missing from the history: skip it, do not abort the run
            with logger_lock:
                logger.warning(f"Commit {commit_hash} not found in commit history; skipping")
            return
        if parent_hash is None:
            # No parent means there is nothing to compute a delta against
            return

        commit_information_row = commit_information[commit_information['commit_hash'] == commit_hash]

        for changed_file_idx, changed_file in enumerate(changed_files, start=1):
            try:
                ck_metrics = get_CK_metrics(commit_hash, CK_METRICS_FOLDER, changed_file)
                parent_ck_metrics = get_CK_metrics(parent_hash, CK_METRICS_FOLDER, changed_file)
            except Exception as exc:
                # Best effort: a file whose metrics cannot be read is skipped,
                # but the reason is logged. KeyboardInterrupt is not caught.
                with logger_lock:
                    logger.warning(f"Could not load CK metrics for {changed_file} in commit {commit_hash}: {exc}")
                continue

            if ck_metrics.empty or parent_ck_metrics.empty:
                continue

            change_in_metrics = calculate_change_in_metrics(ck_metrics, parent_ck_metrics, commit_information_row)
            if change_in_metrics.empty:
                # no source code file is modified or added
                continue

            change_in_metrics['hash'] = commit_hash
            change_in_metrics['file'] = changed_file

            with results_lock:
                results.append(change_in_metrics)
                with logger_lock:
                    if changed_file_idx % 100 == 0:
                        logger.info(f"Merging commit {commit_hash} | {changed_file_idx} file of {len(changed_files)} changed files | Number of merged commits: {len(results)}")

    # Commits are processed sequentially. process_commit only touches shared
    # state under the locks above, so it could be mapped over a
    # ThreadPoolExecutor instead.
    for position, (_, row) in enumerate(commit_information.iterrows()):
        if max_commits is not None and position >= max_commits:
            break
        process_commit((position, row))

    # Concatenate all collected results into a single DataFrame at the end
    if results:
        merged_df = pd.concat(results, ignore_index=True)
        merged_df.to_csv("merged_df.csv", index=False)
        print("Results saved to 'merged_df.csv'")
    else:
        print("No results to save.")




In [24]:
main()

2025-02-03 13:42:50,729 - INFO - Processing commit e296c698b83e320ff15c4ec1ca9258e188ff867c | 1 of 10146 commits | Number of merged commits: 0
2025-02-03 13:42:50,729 - INFO - Processing commit e296c698b83e320ff15c4ec1ca9258e188ff867c | 1 of 10146 commits | Number of merged commits: 0
2025-02-03 13:42:50,868 - INFO - Processing commit 9aa562f432ccfec587895b633c908eaa85f85f36 | 2 of 10146 commits | Number of merged commits: 4
2025-02-03 13:42:50,868 - INFO - Processing commit 9aa562f432ccfec587895b633c908eaa85f85f36 | 2 of 10146 commits | Number of merged commits: 4
2025-02-03 13:42:50,901 - INFO - Processing commit 83068da492974cd27ab0fcd85700af3e369d5490 | 3 of 10146 commits | Number of merged commits: 5
2025-02-03 13:42:50,901 - INFO - Processing commit 83068da492974cd27ab0fcd85700af3e369d5490 | 3 of 10146 commits | Number of merged commits: 5
2025-02-03 13:42:50,902 - INFO - Processing commit ed8f87a110ddcae69668aef0a4d8d4f4d292574d | 4 of 10146 commits | Number of merged commits: 5

Results saved to 'merged_df.csv'


In [23]:
# Read back the CSV written by main() to inspect the generated dataset
df = pd.read_csv("merged_df.csv")
df

Unnamed: 0,d_cbo,d_cboModified,d_fanin,d_fanout,d_wmc,d_dit,d_noc,d_rfc,d_lcom,d_lcom*,...,net_lines_changed,absolute_lines_changed,lines_per_file,author_experience,author_ownership,changed_file_count,entropy_bucket,num_files_changed,hash,file
0,0,1,1,0,2,1,0,0,1,0.000000,...,87.0,91.0,22.7500,2818.5,53.0,4,low,4,e296c698b83e320ff15c4ec1ca9258e188ff867c,common/src/main/java/org/broadleafcommerce/com...
1,6,7,1,6,2,1,0,4,1,0.500000,...,87.0,91.0,22.7500,2818.5,53.0,4,low,4,e296c698b83e320ff15c4ec1ca9258e188ff867c,common/src/main/java/org/broadleafcommerce/com...
2,0,0,0,0,0,0,0,0,0,0.000000,...,87.0,91.0,22.7500,2818.5,53.0,4,low,4,e296c698b83e320ff15c4ec1ca9258e188ff867c,common/src/main/java/org/broadleafcommerce/com...
3,0,0,0,0,0,0,0,0,0,0.000000,...,87.0,91.0,22.7500,2818.5,53.0,4,low,4,e296c698b83e320ff15c4ec1ca9258e188ff867c,core/broadleaf-framework/src/main/java/org/bro...
4,0,0,0,0,0,0,0,0,0,0.000000,...,0.0,2.0,2.0000,26.0,52.0,1,low,1,9aa562f432ccfec587895b633c908eaa85f85f36,core/broadleaf-profile/src/main/java/org/broad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3304,0,0,0,0,0,0,0,0,0,0.017857,...,9178.0,76858.0,27.5378,1416.0,52.0,2791,high,2791,5a913ddbb89da00ecb0bccc9a7de100629d645ad,core/broadleaf-profile/src/main/java/org/broad...
3305,0,0,0,0,0,0,0,0,2,0.008889,...,9178.0,76858.0,27.5378,1416.0,52.0,2791,high,2791,5a913ddbb89da00ecb0bccc9a7de100629d645ad,core/broadleaf-profile/src/main/java/org/broad...
3306,0,0,0,0,0,0,0,0,0,0.000000,...,9178.0,76858.0,27.5378,1416.0,52.0,2791,high,2791,5a913ddbb89da00ecb0bccc9a7de100629d645ad,core/broadleaf-profile/src/main/java/org/broad...
3307,0,0,0,0,0,0,0,0,0,0.000000,...,9178.0,76858.0,27.5378,1416.0,52.0,2791,high,2791,5a913ddbb89da00ecb0bccc9a7de100629d645ad,core/broadleaf-profile/src/main/java/org/broad...


In [None]:
#%lprun -f main main()

In [None]:
#%lprun -f calculate_change_in_metrics main()