In [1]:
import pandas as pd
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import cupy as cp

In [2]:
def train_test_split_by_date(x, y, test_size=0.2):
    """
    Split the input data into training and test sets by date.
    The test set should contain the newest test_size proportion of the data.
    """
    df = pd.concat([x, y], axis=1)
    df = df.sort_values(by='author_date_unix_timestamp')
    split_index = int((1 - test_size) * len(df))
    x_train = df.iloc[:split_index, :-1]
    y_train = df.iloc[:split_index, -1]
    x_test = df.iloc[split_index:, :-1]
    y_test = df.iloc[split_index:, -1]
    return x_train, x_test, y_train, y_test


In [3]:
def hyperparameter_optimization(x_train, y_train):
    """
    Optimize the hyperparameters of the XGBoost model.
    """

    # Convert the training data to GPU arrays using CuPy
    x_train_gpu = cp.asarray(x_train)
    y_train_gpu = cp.asarray(y_train)

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Configure the model for GPU training and prediction
        model = XGBClassifier(
            **params,
            tree_method="hist",  # new method since XGBoost 2.0.0
            device="cuda"        # use GPU for both training and prediction
        )
        
        model.fit(x_train_gpu, y_train_gpu)
        y_pred = model.predict(x_train_gpu)
        
        # Convert predictions and labels back to CPU arrays for scoring
        y_pred_cpu = cp.asnumpy(y_pred)
        y_train_cpu = cp.asnumpy(y_train_gpu)
        return roc_auc_score(y_train_cpu, y_pred_cpu)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    return study.best_params


In [4]:
# all columns in the dataset
INFO_COLUMNS = [
                "type", #"linked", "fileschanged",
                "ndev", "age", "exp", "rexp", "sexp",
                "glm_probability", "classification",
                "time_of_day", "day_of_week", "is_weekend",
                "author_experience", "author_ownership",
                "hash","file", "author_date_unix_timestamp"]

Y_COLUMN = ['contains_bug']

X_COLUMNS = ["d_cbo","d_cboModified","d_fanin",
             "d_fanout","d_wmc","d_dit","d_noc",
             "d_rfc","d_lcom","d_lcom*","d_tcc",
             "d_lcc","d_totalMethodsQty","d_staticMethodsQty",
             "d_publicMethodsQty","d_privateMethodsQty","d_protectedMethodsQty",
             "d_defaultMethodsQty","d_visibleMethodsQty","d_abstractMethodsQty",
             "d_finalMethodsQty","d_synchronizedMethodsQty","d_totalFieldsQty",
             "d_staticFieldsQty","d_publicFieldsQty","d_privateFieldsQty",
             "d_protectedFieldsQty","d_defaultFieldsQty","d_finalFieldsQty",
             "d_synchronizedFieldsQty","d_nosi","d_loc","d_returnQty","d_loopQty",
             "d_comparisonsQty","d_tryCatchQty","d_parenthesizedExpsQty","d_stringLiteralsQty",
             "d_numbersQty","d_assignmentsQty","d_mathOperationsQty","d_variablesQty",
             "d_maxNestedBlocksQty","d_anonymousClassesQty","d_innerClassesQty",
             "d_lambdasQty","d_uniqueWordsQty","d_modifiers","d_logStatementsQty",
             "cbo","cboModified","fanin","fanout","wmc","dit","noc","rfc","lcom","lcom*",
             "tcc","lcc","totalMethodsQty","staticMethodsQty","publicMethodsQty",
             "privateMethodsQty","protectedMethodsQty","defaultMethodsQty",
             "visibleMethodsQty","abstractMethodsQty","finalMethodsQty",
             "synchronizedMethodsQty","totalFieldsQty","staticFieldsQty",
             "publicFieldsQty","privateFieldsQty","protectedFieldsQty",
             "defaultFieldsQty","finalFieldsQty","synchronizedFieldsQty",
             "nosi","loc","returnQty","loopQty","comparisonsQty",
             "tryCatchQty","parenthesizedExpsQty","stringLiteralsQty",
             "numbersQty","assignmentsQty","mathOperationsQty",
             "variablesQty","maxNestedBlocksQty","anonymousClassesQty",
             "innerClassesQty","lambdasQty","uniqueWordsQty",
             "modifiers","logStatementsQty",
             #"fix",
             "entrophy",
             "la","ld",
             #"net_lines_changed","absolute_lines_changed",
             "lines_per_file",
             "changed_file_count",
             #"entropy_bucket",
             ]

In [5]:
# read the merged df

df = pd.read_csv('merged_datasets/new/tomcat_merged_df.csv')

print(df.shape[0])

126165


In [6]:
# print column which has missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


d_lcom*            6607
d_tcc             22281
d_lcc             22281
lcom*              6589
tcc               22114
lcc               22114
classification    59600
dtype: int64


In [7]:
#print distinc values in tcc column
print(df['tcc'].unique())
# and their counts
print(df['tcc'].value_counts())

# print nan count
print(df['tcc'].isnull().sum())

[-1.  0. nan]
tcc
 0.0    93641
-1.0    10410
Name: count, dtype: int64
22114


# DECISON
should we delete columns with possible N/A values or delete rows with N/A values ? 

if number of visible methods in a class is less than 2, then CK prefers to set it -1.
For some reason, some rows have NaN values, so set it to -1 as well as it is not applicable.

In [8]:
def fix_df(df):
    df['d_lcom*'] = df['d_lcom*'].fillna(-1)
    df['d_tcc'] = df['d_tcc'].fillna(-1)
    df['d_lcc'] = df['d_lcc'].fillna(-1)
    df['lcom*'] = df['lcom*'].fillna(-1)
    df['tcc'] = df['tcc'].fillna(-1)
    df['lcc'] = df['lcc'].fillna(-1)
    return df

df = fix_df(df)

In [9]:
DATASETS = ["merged_datasets/new/broadleaf_merged_df.csv",
            "merged_datasets/new/camel_merged_df.csv",
            "merged_datasets/new/dubbo_merged_df.csv",
            "merged_datasets/new/elasticsearch_merged_df.csv",
            "merged_datasets/new/guava_merged_df.csv",
            "merged_datasets/new/jdk_merged_df.csv",
            "merged_datasets/new/jgroups_merged_df.csv",
            "merged_datasets/new/kafka_merged_df.csv",
            "merged_datasets/new/spark_merged_df.csv",
            "merged_datasets/new/spring-boot_merged_df.csv",
            "merged_datasets/new/spring-framework_merged_df.csv",
            "merged_datasets/new/tomcat_merged_df.csv",]

In [10]:
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils import get_column_letter
from openpyxl.formatting.rule import ColorScaleRule

def write_formatted_report(ws, df, start_row, start_col):
    """
    Writes the DataFrame `df` into worksheet `ws` starting at (start_row, start_col).
    
    - The header row is formatted with bold text and a light gray fill.
    - For any row whose first cell (the "class" column) equals "accuracy"
      (case-insensitive), the function writes:
         • The label "accuracy" in the first column.
         • The accuracy value (assumed to be in the second value of the row)
           into the cell corresponding to the "f1-score" header.
         • All other numeric cells (except the support column) are left blank.
         • The "support" column (if present) is left unchanged.
    - After writing the table, a conditional color–scale is applied to all 
      numeric cells (i.e. all cells except the first "class" column and the 
      "support" column). Also, any empty cell in that range is cleared of fill.
      
    Float values are formatted with two decimals.
    
    Returns the total number of rows written (header + data).
    """
    header_font = Font(bold=True)
    header_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")
    
    headers = list(df.columns)
    num_cols = len(headers)
    total_rows = df.shape[0] + 1  # header + data rows

    # Write the header row.
    for j, col_name in enumerate(headers, start=start_col):
        cell = ws.cell(row=start_row, column=j, value=col_name)
        cell.font = header_font
        cell.fill = header_fill

    # Write data rows.
    for i, row_data in enumerate(df.values, start=start_row + 1):
        # Check if this is the "accuracy" row.
        if str(row_data[0]).strip().lower() == "accuracy":
            # Write the "accuracy" label in the first column.
            ws.cell(row=i, column=start_col, value=row_data[0])
            # Find the column index for "f1-score" (case-insensitive) in headers.
            f1_idx = None
            for idx, h in enumerate(headers):
                if h.strip().lower() == "f1-score":
                    f1_idx = idx
                    break
            # Write the accuracy value into the "f1-score" column.
            if f1_idx is not None and len(row_data) > 1:
                accuracy_value = row_data[1]  # assume the accuracy value is in the second column
                cell = ws.cell(row=i, column=start_col + f1_idx, value=accuracy_value)
                if isinstance(accuracy_value, float):
                    cell.number_format = "0.00"
            # For the remaining columns:
            for j, h in enumerate(headers[1:], start=start_col + 1):
                # Skip the f1-score column (already written) and the support column.
                if (f1_idx is not None and j == start_col + f1_idx) or h.strip().lower() == "support":
                    continue
                ws.cell(row=i, column=j, value="")
        else:
            # Normal row: write each cell value.
            for j, value in enumerate(row_data, start=start_col):
                cell = ws.cell(row=i, column=j, value=value)
                if isinstance(value, float):
                    cell.number_format = "0.00"
    
    # Determine the range for numeric cells to apply conditional formatting.
    # We want to apply formatting to all numeric columns except:
    #   - the first column ("class")
    #   - the "support" column (if present)
    #
    # Find the index of "support" in headers (if it exists).
    support_idx = None
    for idx, h in enumerate(headers):
        if h.strip().lower() == "support":
            support_idx = idx
            break
    numeric_start_col = start_col + 1  # skip "class"
    if support_idx is not None:
        # Apply formatting to columns before the support column.
        numeric_end_col = start_col + support_idx - 1  # excludes the support column
    else:
        numeric_end_col = start_col + num_cols - 1

    numeric_start_row = start_row + 1
    numeric_end_row = start_row + df.shape[0]
    # Build range string (e.g. "B2:D10").
    cell_range = f"{get_column_letter(numeric_start_col)}{numeric_start_row}:{get_column_letter(numeric_end_col)}{numeric_end_row}"
    
    # Define a color scale: low values in red, mid in yellow, high in green.
    color_scale_rule = ColorScaleRule(
        start_type="num", start_value=0, start_color="F8696B",
        mid_type="num", mid_value=0.5, mid_color="FFEB84",
        end_type="num", end_value=1, end_color="63BE7B"
    )
    ws.conditional_formatting.add(cell_range, color_scale_rule)
    
    # Remove fill from any empty cells in the numeric range.
    for row in ws[cell_range]:
        for cell in row:
            if cell.value in [None, ""]:
                cell.fill = PatternFill(fill_type=None)
    
    return total_rows

###############################################################################
# Main export function.
###############################################################################
def export_results_to_excel(results, output_file):
    """
    Groups the classification reports (from the global dict `results`) by dataset
    and writes them all into a single worksheet.
    
    For each dataset, the random-split and date-split reports are written side by side,
    with a title row above each table (e.g. "Dataset1 random split" and "Dataset1 date split").
    Each dataset block is written on a new set of rows (separated by a blank gap).
    
    Finally, the workbook is saved to `output_file`.
    """
    # Group the reports by dataset name.
    # Expected keys: "<dataset_name>_random_split" and "<dataset_name>_date_split"
    grouped_results = {}
    for key, report in results.items():
        try:
            ds_name, split_type, _ = key.rsplit("_", 2)
        except ValueError:
            continue
        if ds_name not in grouped_results:
            grouped_results[ds_name] = {}
        grouped_results[ds_name][split_type] = report

    # Create a new workbook with one worksheet.
    wb = Workbook()
    ws = wb.active
    ws.title = "Classification Reports"
    
    current_row = 1  # starting row on the sheet
    gap_rows = 2     # blank rows between dataset blocks

    for ds_name, splits in grouped_results.items():
        # Convert the reports into DataFrames.
        if "random" in splits:
            df_random = pd.DataFrame(splits["random"]).T.reset_index().rename(columns={"index": "class"})
        else:
            df_random = None

        if "date" in splits:
            df_date = pd.DataFrame(splits["date"]).T.reset_index().rename(columns={"index": "class"})
        else:
            df_date = None

        # Determine starting column for the date table.
        if df_random is not None:
            left_table_cols = df_random.shape[1]
        else:
            left_table_cols = 0
        date_table_start_col = left_table_cols + 3 if df_random is not None else 1

        # Write title cells above each table.
        ws.cell(row=current_row, column=1, value=f"{ds_name} random split")
        ws.cell(row=current_row, column=date_table_start_col, value=f"{ds_name} date split")

        # Write the tables starting on the next row.
        start_table_row = current_row + 1
        rows_random = 0
        rows_date = 0
        if df_random is not None:
            rows_random = write_formatted_report(ws, df_random, start_table_row, 1)
        if df_date is not None:
            rows_date = write_formatted_report(ws, df_date, start_table_row, date_table_start_col)

        # Determine block height (title row + maximum table height).
        block_height = 1 + max(rows_random, rows_date)
        current_row += block_height + gap_rows

    wb.save(output_file)
    print(f"Workbook saved as {output_file}")


In [11]:
results = {}

def main():

    for dataset in DATASETS:
        
        dataset_name = dataset.split("new")[1][1:]
        print(f"Processing {dataset_name}")
        df = pd.read_csv(dataset)
        df = fix_df(df)

        # split data randomly
        x_train, x_test, y_train, y_test = train_test_split(df[X_COLUMNS + INFO_COLUMNS], df[Y_COLUMN], test_size=0.2, random_state=42)

        best_params = hyperparameter_optimization(x_train[X_COLUMNS], y_train)

        # Train final model with best parameters
        best_model = XGBClassifier(**best_params, tree_method="hist", device="cuda")

        best_model.fit(x_train[X_COLUMNS], y_train)

        # Evaluate
        y_pred = best_model.predict(x_test[X_COLUMNS])
        results[f"{dataset_name}_random_split"] = classification_report(y_test, y_pred, output_dict=True)


        # split data by date
        x_train, x_test, y_train, y_test = train_test_split_by_date(df[X_COLUMNS + INFO_COLUMNS], df[Y_COLUMN], test_size=0.2)

        best_params = hyperparameter_optimization(x_train[X_COLUMNS], y_train)

        # Train final model with best parameters
        best_model = XGBClassifier(**best_params, tree_method="hist", device="cuda")
        best_model.fit(x_train[X_COLUMNS], y_train)

        # Evaluate
        y_pred = best_model.predict(x_test[X_COLUMNS])
        results[f"{dataset_name}_date_split"] = classification_report(y_test, y_pred, output_dict=True)

    # After processing all datasets, export all reports to a single Excel file.
    export_results_to_excel(results, output_file="classification_reports.xlsx")

In [12]:
main()

Processing broadleaf_merged_df.csv


[I 2025-02-10 22:16:58,164] A new study created in memory with name: no-name-a27cdbc6-3fa1-4090-8fcc-b91df5eec80d
[I 2025-02-10 22:16:59,253] Trial 0 finished with value: 0.881792807034878 and parameters: {'n_estimators': 64, 'max_depth': 5, 'learning_rate': 0.022689849216518985, 'subsample': 0.7535277233296808, 'colsample_bytree': 0.9732065538616412, 'gamma': 2.7173430592847106e-05, 'reg_alpha': 0.00549401159868306, 'reg_lambda': 1.8523161097808435e-05}. Best is trial 0 with value: 0.881792807034878.
[I 2025-02-10 22:17:02,354] Trial 1 finished with value: 0.978785332599651 and parameters: {'n_estimators': 197, 'max_depth': 9, 'learning_rate': 0.1378525799241747, 'subsample': 0.7218895806571544, 'colsample_bytree': 0.5936747845745334, 'gamma': 3.480557902591777e-06, 'reg_alpha': 6.756377566529824e-08, 'reg_lambda': 0.2413214755039369}. Best is trial 1 with value: 0.978785332599651.
[I 2025-02-10 22:17:03,105] Trial 2 finished with value: 0.8980539346889755 and parameters: {'n_estimato

Processing camel_merged_df.csv


[I 2025-02-10 22:20:51,385] A new study created in memory with name: no-name-e13e2d05-b32c-4add-bff4-8c7de2b8f33d
[I 2025-02-10 22:20:52,817] Trial 0 finished with value: 0.8133242711101383 and parameters: {'n_estimators': 70, 'max_depth': 3, 'learning_rate': 0.15645939218402483, 'subsample': 0.6828656707652546, 'colsample_bytree': 0.6625372253323778, 'gamma': 2.015839564063922e-05, 'reg_alpha': 9.604304422094585e-05, 'reg_lambda': 0.1759391026475963}. Best is trial 0 with value: 0.8133242711101383.
[I 2025-02-10 22:20:54,820] Trial 1 finished with value: 0.925448525671955 and parameters: {'n_estimators': 64, 'max_depth': 8, 'learning_rate': 0.23504596503832645, 'subsample': 0.5363322050406532, 'colsample_bytree': 0.972291226215237, 'gamma': 2.4272768983235525e-08, 'reg_alpha': 0.0014453355014509245, 'reg_lambda': 7.011065726935077e-06}. Best is trial 1 with value: 0.925448525671955.
[I 2025-02-10 22:20:56,783] Trial 2 finished with value: 0.8431790610615667 and parameters: {'n_estimat

Processing dubbo_merged_df.csv


[I 2025-02-10 22:28:43,194] A new study created in memory with name: no-name-c076d0e1-1e13-4540-b530-00cd346cb846
[I 2025-02-10 22:28:43,824] Trial 0 finished with value: 0.8003237691551803 and parameters: {'n_estimators': 146, 'max_depth': 3, 'learning_rate': 0.025684354720272872, 'subsample': 0.7340928631120129, 'colsample_bytree': 0.8619289161868744, 'gamma': 0.05369482533257821, 'reg_alpha': 2.1969757310909494e-05, 'reg_lambda': 9.860791165814119e-06}. Best is trial 0 with value: 0.8003237691551803.
[I 2025-02-10 22:28:44,552] Trial 1 finished with value: 0.8661280851452638 and parameters: {'n_estimators': 73, 'max_depth': 7, 'learning_rate': 0.01026438286460239, 'subsample': 0.9310137371872931, 'colsample_bytree': 0.6700364874953615, 'gamma': 4.907087745432218e-05, 'reg_alpha': 0.012506467514610141, 'reg_lambda': 0.0008731326252797485}. Best is trial 1 with value: 0.8661280851452638.
[I 2025-02-10 22:28:45,895] Trial 2 finished with value: 0.900944546795736 and parameters: {'n_est

Processing elasticsearch_merged_df.csv


  df = pd.read_csv(dataset)
[I 2025-02-10 22:31:58,430] A new study created in memory with name: no-name-a090d1bd-1a86-4a6e-8922-4f46a23ed3ec
[I 2025-02-10 22:32:02,360] Trial 0 finished with value: 0.8553970046743898 and parameters: {'n_estimators': 149, 'max_depth': 7, 'learning_rate': 0.09515450652965993, 'subsample': 0.9257533128095701, 'colsample_bytree': 0.9706214882382291, 'gamma': 0.006367888869231772, 'reg_alpha': 0.0011230811335488726, 'reg_lambda': 0.6737676716593546}. Best is trial 0 with value: 0.8553970046743898.
[I 2025-02-10 22:32:08,923] Trial 1 finished with value: 0.9210600633377438 and parameters: {'n_estimators': 166, 'max_depth': 10, 'learning_rate': 0.10073818576417629, 'subsample': 0.5356405350227267, 'colsample_bytree': 0.6096604020590586, 'gamma': 7.725921593143714e-06, 'reg_alpha': 7.644204539805477e-07, 'reg_lambda': 0.007378484008678425}. Best is trial 1 with value: 0.9210600633377438.
[I 2025-02-10 22:32:15,061] Trial 2 finished with value: 0.9347335359855

Processing guava_merged_df.csv


[I 2025-02-10 22:41:59,824] A new study created in memory with name: no-name-0a2c43ab-f6ee-4152-a392-431a122cf027
[I 2025-02-10 22:42:02,542] Trial 0 finished with value: 0.9423215376470372 and parameters: {'n_estimators': 165, 'max_depth': 9, 'learning_rate': 0.052710252024705015, 'subsample': 0.7414276091147625, 'colsample_bytree': 0.8431923274800512, 'gamma': 0.029247189891760214, 'reg_alpha': 2.4979828681938547e-06, 'reg_lambda': 2.108907782259686e-08}. Best is trial 0 with value: 0.9423215376470372.
[I 2025-02-10 22:42:05,901] Trial 1 finished with value: 0.8630445587701796 and parameters: {'n_estimators': 193, 'max_depth': 9, 'learning_rate': 0.011446444383411334, 'subsample': 0.9503960563100267, 'colsample_bytree': 0.8540078843949712, 'gamma': 0.0004322709320112435, 'reg_alpha': 0.10579624537413311, 'reg_lambda': 0.010727884915786644}. Best is trial 0 with value: 0.9423215376470372.
[I 2025-02-10 22:42:07,665] Trial 2 finished with value: 0.8651988160530886 and parameters: {'n_e

Processing jdk_merged_df.csv


[I 2025-02-10 22:46:21,734] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 136, 'max_depth': 6, 'learning_rate': 0.13191341812872479, 'subsample': 0.8644117446231988, 'colsample_bytree': 0.7825987546966557, 'gamma': 0.019708841892134957, 'reg_alpha': 9.427742088014376e-05, 'reg_lambda': 1.6136525513418748e-06}. Best is trial 0 with value: 1.0.
[I 2025-02-10 22:46:22,186] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 123, 'max_depth': 6, 'learning_rate': 0.027017709335410364, 'subsample': 0.9007252727991971, 'colsample_bytree': 0.5010649824742933, 'gamma': 9.183958435150954e-05, 'reg_alpha': 4.953794820748465e-05, 'reg_lambda': 1.7824506470127734e-08}. Best is trial 0 with value: 1.0.
[I 2025-02-10 22:46:22,500] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 127, 'max_depth': 9, 'learning_rate': 0.04144947935774678, 'subsample': 0.6788362482659234, 'colsample_bytree': 0.9763457677913461, 'gamma': 0.000934460645876767, 'reg_alpha'

Processing jgroups_merged_df.csv


[I 2025-02-10 22:47:06,793] A new study created in memory with name: no-name-79efed71-019c-4c28-a912-7bafd116ba29
[I 2025-02-10 22:47:07,577] Trial 0 finished with value: 0.6218672541999449 and parameters: {'n_estimators': 147, 'max_depth': 3, 'learning_rate': 0.010129866022300066, 'subsample': 0.8951162391683545, 'colsample_bytree': 0.7083671479195082, 'gamma': 1.3149168355960443e-07, 'reg_alpha': 8.425249394406075e-07, 'reg_lambda': 4.386828919670189e-06}. Best is trial 0 with value: 0.6218672541999449.
[I 2025-02-10 22:47:09,436] Trial 1 finished with value: 0.8405950923101974 and parameters: {'n_estimators': 183, 'max_depth': 7, 'learning_rate': 0.13491737337071902, 'subsample': 0.5048857783812117, 'colsample_bytree': 0.7149546850413947, 'gamma': 0.03725989197248724, 'reg_alpha': 0.001412040883404597, 'reg_lambda': 0.00018320410353217225}. Best is trial 1 with value: 0.8405950923101974.
[I 2025-02-10 22:47:11,821] Trial 2 finished with value: 0.8316173476076114 and parameters: {'n_

Processing kafka_merged_df.csv


[I 2025-02-10 22:51:02,553] A new study created in memory with name: no-name-6cb9830c-60df-4145-8923-f9f98db8c70d
[I 2025-02-10 22:51:03,030] Trial 0 finished with value: 0.6885941493513201 and parameters: {'n_estimators': 59, 'max_depth': 5, 'learning_rate': 0.07191512916412714, 'subsample': 0.5727944331519766, 'colsample_bytree': 0.5989319056532882, 'gamma': 0.0008434726300839524, 'reg_alpha': 0.014165033134843174, 'reg_lambda': 1.273360176138736e-08}. Best is trial 0 with value: 0.6885941493513201.
[I 2025-02-10 22:51:04,230] Trial 1 finished with value: 0.7307425892061757 and parameters: {'n_estimators': 199, 'max_depth': 4, 'learning_rate': 0.08256386926692381, 'subsample': 0.8491870059180797, 'colsample_bytree': 0.545042247649068, 'gamma': 0.3878905513936345, 'reg_alpha': 7.678141583525408e-05, 'reg_lambda': 0.00011222404972365872}. Best is trial 1 with value: 0.7307425892061757.
[I 2025-02-10 22:51:06,005] Trial 2 finished with value: 0.9866459669950308 and parameters: {'n_estim

Processing spark_merged_df.csv


[I 2025-02-10 22:55:08,262] Trial 0 finished with value: 0.9973145656373013 and parameters: {'n_estimators': 125, 'max_depth': 7, 'learning_rate': 0.26657558804376, 'subsample': 0.6837500290599337, 'colsample_bytree': 0.5168953590378054, 'gamma': 0.0035194516029273945, 'reg_alpha': 1.8426752984261393e-07, 'reg_lambda': 1.4984027146752797e-06}. Best is trial 0 with value: 0.9973145656373013.
[I 2025-02-10 22:55:08,938] Trial 1 finished with value: 0.8595527154301179 and parameters: {'n_estimators': 59, 'max_depth': 9, 'learning_rate': 0.03120301202429089, 'subsample': 0.5611577898290891, 'colsample_bytree': 0.9364313340420876, 'gamma': 0.005895992524237241, 'reg_alpha': 3.7382328566897654e-08, 'reg_lambda': 5.839727926268241e-07}. Best is trial 0 with value: 0.9973145656373013.
[I 2025-02-10 22:55:09,379] Trial 2 finished with value: 0.782266797872017 and parameters: {'n_estimators': 55, 'max_depth': 6, 'learning_rate': 0.05339285285335536, 'subsample': 0.5694684748751522, 'colsample_by

Processing spring-boot_merged_df.csv


[I 2025-02-10 22:57:30,423] A new study created in memory with name: no-name-91ca2cb1-6abe-44ba-93a9-604af54fa818
[I 2025-02-10 22:57:31,418] Trial 0 finished with value: 0.8752671023631183 and parameters: {'n_estimators': 109, 'max_depth': 6, 'learning_rate': 0.1601411636375329, 'subsample': 0.9375637723827206, 'colsample_bytree': 0.7258362685727909, 'gamma': 1.3874427011574124e-08, 'reg_alpha': 1.0175974338127458e-06, 'reg_lambda': 0.0001688485562735201}. Best is trial 0 with value: 0.8752671023631183.
[I 2025-02-10 22:57:32,436] Trial 1 finished with value: 0.8684940100406957 and parameters: {'n_estimators': 133, 'max_depth': 5, 'learning_rate': 0.19150722409665819, 'subsample': 0.966799059932169, 'colsample_bytree': 0.7101881465383542, 'gamma': 0.018659732739956757, 'reg_alpha': 4.6821920076995184e-07, 'reg_lambda': 2.1413733023227156e-06}. Best is trial 0 with value: 0.8752671023631183.
[I 2025-02-10 22:57:33,593] Trial 2 finished with value: 0.8450816105398279 and parameters: {'n

Processing spring-framework_merged_df.csv


[I 2025-02-10 23:01:34,287] A new study created in memory with name: no-name-e4d8a0d4-0415-4271-951f-11bd7c8082bd
[I 2025-02-10 23:01:36,417] Trial 0 finished with value: 0.9036610264293276 and parameters: {'n_estimators': 112, 'max_depth': 9, 'learning_rate': 0.030173611050692394, 'subsample': 0.6276254578434008, 'colsample_bytree': 0.5127988223469047, 'gamma': 0.03071037160193882, 'reg_alpha': 7.891124792135172e-06, 'reg_lambda': 1.4525991165656807e-07}. Best is trial 0 with value: 0.9036610264293276.
[I 2025-02-10 23:01:39,229] Trial 1 finished with value: 0.9651191592133188 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.16787718247757205, 'subsample': 0.9386638279250251, 'colsample_bytree': 0.6626007597701227, 'gamma': 0.00045906277933178123, 'reg_alpha': 1.0007433442249443e-08, 'reg_lambda': 0.002679454770414605}. Best is trial 1 with value: 0.9651191592133188.
[I 2025-02-10 23:01:39,867] Trial 2 finished with value: 0.8313720456750122 and parameters: {'n

Processing tomcat_merged_df.csv


[I 2025-02-10 23:06:09,689] A new study created in memory with name: no-name-dbe30539-4287-4304-93ed-f0b9c826dd75
[I 2025-02-10 23:06:10,470] Trial 0 finished with value: 0.7395166729734977 and parameters: {'n_estimators': 109, 'max_depth': 4, 'learning_rate': 0.26064697913242024, 'subsample': 0.9026510604555885, 'colsample_bytree': 0.9798816066931204, 'gamma': 0.006008775990540599, 'reg_alpha': 0.010042001423585266, 'reg_lambda': 2.7880505688961863e-08}. Best is trial 0 with value: 0.7395166729734977.
[I 2025-02-10 23:06:13,134] Trial 1 finished with value: 0.9377463191151074 and parameters: {'n_estimators': 115, 'max_depth': 10, 'learning_rate': 0.24170768371571902, 'subsample': 0.7357993447580651, 'colsample_bytree': 0.7419318602795082, 'gamma': 2.4420719945205565e-06, 'reg_alpha': 0.00042257280787303384, 'reg_lambda': 0.12915850560409473}. Best is trial 1 with value: 0.9377463191151074.
[I 2025-02-10 23:06:14,521] Trial 2 finished with value: 0.7275184646516298 and parameters: {'n_

Workbook saved as classification_reports.xlsx
