In [1]:
import pandas as pd
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import cupy as cp
import numpy as np
import json

In [2]:
def train_test_split_by_date(x, y, test_size=0.2, date_column='author_date_unix_timestamp'):
    """
    Split features and labels chronologically into training and test sets.

    The rows are ordered by ``date_column`` (ascending); the oldest
    ``1 - test_size`` proportion becomes the training set and the newest
    ``test_size`` proportion becomes the test set.

    Parameters:
      - x: Features (pandas DataFrame); must contain ``date_column``.
      - y: Labels (pandas Series or DataFrame); aligned with ``x`` on the index.
      - test_size: Proportion of the rows assigned to the test set (default: 0.2).
      - date_column: Column of ``x`` used as the sort key
                     (default: 'author_date_unix_timestamp').

    Returns:
      - (x_train, x_test, y_train, y_test). The labels are returned as a Series
        when ``y`` is a Series or a single-column DataFrame, and as a DataFrame
        when ``y`` has several columns.
    """
    n_features = x.shape[1]
    df = pd.concat([x, y], axis=1)
    # Mergesort is stable: rows sharing a timestamp keep their original relative
    # order, so the position of the split boundary is deterministic.
    df = df.sort_values(by=date_column, kind='mergesort')
    split_index = int((1 - test_size) * len(df))

    # Split features/labels by position instead of assuming a single label column.
    features = df.iloc[:, :n_features]
    if df.shape[1] - n_features == 1:
        labels = df.iloc[:, -1]
    else:
        labels = df.iloc[:, n_features:]

    x_train = features.iloc[:split_index]
    x_test = features.iloc[split_index:]
    y_train = labels.iloc[:split_index]
    y_test = labels.iloc[split_index:]
    return x_train, x_test, y_train, y_test


In [3]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
import cupy as cp
import optuna
from sklearn.metrics import average_precision_score

def hyperparameter_optimization(x_train, y_train, n_splits=5, n_trials=50):
    """
    Optimize the hyperparameters of the XGBoost model using cross-validation.

    Each Optuna trial trains one GPU XGBoost model per stratified fold and is
    scored by the mean average precision (area under the precision-recall
    curve) over the validation folds.

    Parameters:
      - x_train: Training features (pandas DataFrame)
      - y_train: Training labels (pandas Series or single-column DataFrame)
      - n_splits: Number of cross-validation splits (default: 5)
      - n_trials: Number of hyperparameter search trials (default: 50)

    Returns:
      - The best hyperparameters found by Optuna (only the searched parameters;
        fixed settings such as random_state are not part of the result).
    """
    # Convert the entire training set to NumPy arrays once.
    x_train_np = x_train.to_numpy()
    y_train_np = y_train.to_numpy().ravel()

    # Convert the NumPy arrays to GPU arrays (cupy.ndarray).
    x_train_gpu = cp.asarray(x_train_np)
    y_train_gpu = cp.asarray(y_train_np)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # The splitter has a fixed seed, so every trial would get the same folds
    # anyway: compute the index arrays once instead of once per trial.
    folds = list(skf.split(x_train_np, y_train_np))

    def objective(trial):
        # Define hyperparameter search space.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 20),
            'random_state': 42,
            'n_jobs': -1
        }

        cv_scores = []
        for train_idx, valid_idx in folds:
            # Slice the pre-converted GPU arrays.
            X_tr_gpu = x_train_gpu[train_idx]
            y_tr_gpu = y_train_gpu[train_idx]
            X_val_gpu = x_train_gpu[valid_idx]
            # The metric is computed by scikit-learn on the CPU, so the
            # validation labels are taken from the NumPy array.
            y_val = y_train_np[valid_idx]

            # Configure and train the model.
            model = XGBClassifier(
                **params,
                tree_method="hist",
                device="cuda"        # use GPU for training and prediction
            )
            model.fit(X_tr_gpu, y_tr_gpu)

            # Predict the positive-class probability on the validation fold.
            y_pred_prob = model.predict_proba(X_val_gpu)[:, 1]
            y_pred_cpu = cp.asnumpy(y_pred_prob)

            # Compute the average precision for the current fold.
            fold_score = average_precision_score(y_val, y_pred_cpu)
            cv_scores.append(fold_score)

        # Return the mean average precision across folds.
        return np.mean(cv_scores)

    try:
        # Seed the sampler so that repeated runs explore the same trials.
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(objective, n_trials=n_trials)
        return study.best_params
    finally:
        # Drop our references first: the pool can only release blocks that are
        # no longer in use. Runs on errors too, so a failed search does not
        # keep GPU memory pinned for the rest of the session.
        x_train_gpu = None
        y_train_gpu = None
        cp.get_default_memory_pool().free_all_blocks()


In [4]:
# Non-feature (metadata) columns. main() keeps them next to the features
# (X_COLUMNS + INFO_COLUMNS) when splitting, but feature selection and the
# models only ever see X_COLUMNS. 'author_date_unix_timestamp' is the sort key
# used by train_test_split_by_date().
INFO_COLUMNS = [
                "type", #"linked", "fileschanged",
                "ndev", "age", "exp", "rexp", "sexp",
                "glm_probability", "classification",
                "time_of_day", "day_of_week", "is_weekend",
                "author_experience", "author_ownership",
                "hash","file", "author_date_unix_timestamp"]

# Label column. Kept as a one-element list, so df[Y_COLUMN] is a
# single-column DataFrame rather than a Series.
Y_COLUMN = ['contains_bug']

# Candidate model features; main() runs select_features() on exactly these
# columns. Commented-out names are deliberately excluded.
# NOTE(review): the "d_" prefix presumably marks the delta (change) of the
# metric with the same base name -- confirm against the dataset documentation.
X_COLUMNS = ["d_cbo","d_cboModified","d_fanin",
             "d_fanout","d_wmc","d_dit","d_noc",
             "d_rfc","d_lcom","d_lcom*","d_tcc",
             "d_lcc","d_totalMethodsQty","d_staticMethodsQty",
             "d_publicMethodsQty","d_privateMethodsQty","d_protectedMethodsQty",
             "d_defaultMethodsQty","d_visibleMethodsQty","d_abstractMethodsQty",
             "d_finalMethodsQty","d_synchronizedMethodsQty","d_totalFieldsQty",
             "d_staticFieldsQty","d_publicFieldsQty","d_privateFieldsQty",
             "d_protectedFieldsQty","d_defaultFieldsQty","d_finalFieldsQty",
             "d_synchronizedFieldsQty","d_nosi","d_loc","d_returnQty","d_loopQty",
             "d_comparisonsQty","d_tryCatchQty","d_parenthesizedExpsQty","d_stringLiteralsQty",
             "d_numbersQty","d_assignmentsQty","d_mathOperationsQty","d_variablesQty",
             "d_maxNestedBlocksQty","d_anonymousClassesQty","d_innerClassesQty",
             "d_lambdasQty",
             #"d_uniqueWordsQty",
             "d_modifiers",
             #"d_logStatementsQty",
             "cbo","cboModified","fanin","fanout","wmc","dit","noc","rfc","lcom","lcom*",
             "tcc","lcc","totalMethodsQty","staticMethodsQty","publicMethodsQty",
             "privateMethodsQty","protectedMethodsQty","defaultMethodsQty",
             "visibleMethodsQty","abstractMethodsQty","finalMethodsQty",
             "synchronizedMethodsQty","totalFieldsQty","staticFieldsQty",
             "publicFieldsQty","privateFieldsQty","protectedFieldsQty",
             "defaultFieldsQty","finalFieldsQty","synchronizedFieldsQty",
             "nosi","loc","returnQty","loopQty","comparisonsQty",
             "tryCatchQty","parenthesizedExpsQty","stringLiteralsQty",
             "numbersQty","assignmentsQty","mathOperationsQty",
             "variablesQty","maxNestedBlocksQty","anonymousClassesQty",
             "innerClassesQty","lambdasQty",
             #"uniqueWordsQty", # Number of unique words in the source code
             "modifiers",
             #"logStatementsQty", # Number of log statements in the source code
             #"fix",
             # NOTE(review): "entrophy" presumably mirrors the spelling of the CSV
             # header -- verify against the data before "correcting" it.
             "entrophy",
             "la","ld",
             #"net_lines_changed","absolute_lines_changed",
             "lines_per_file",
             "changed_file_count",
             #"entropy_bucket",
             ]

In [5]:
# Load the merged Tomcat dataset and print how many rows it has.
df = pd.read_csv('merged_datasets/new/tomcat_merged_df.csv')
print(len(df))

126165


In [6]:
# Count the missing values per column and show only the columns that have any.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])


d_lcom*            6607
d_tcc             22281
d_lcc             22281
lcom*              6589
tcc               22114
lcc               22114
classification    59600
dtype: int64


In [7]:
# Inspect the 'tcc' column: distinct values first ...
print(df['tcc'].unique())
# ... then how often each non-missing value occurs ...
print(df['tcc'].value_counts(dropna=True))
# ... and finally the number of missing (NaN) entries.
print(df['tcc'].isna().sum())

[-1.  0. nan]
tcc
 0.0    93641
-1.0    10410
Name: count, dtype: int64
22114


# DECISION
Should we delete the columns that may contain N/A values, or delete the rows that contain N/A values?

If the number of visible methods in a class is less than 2, CK sets the metric to -1.
For some reason, some rows contain NaN instead; since the metric is not applicable in those rows either, we set these values to -1 as well.

In [None]:
from sklearn.preprocessing import StandardScaler

def fix_df(df):
    """
    Return a cleaned and standardized copy of a merged dataset.

    Steps:
      1. Missing values in the cohesion metrics ('lcom*', 'tcc', 'lcc' and
         their 'd_' counterparts) are replaced by -1, the value used in this
         notebook for "not applicable".
      2. The int64/float64 columns listed in X_COLUMNS are standardized with a
         freshly fitted StandardScaler.

    Parameters:
      - df: Merged dataset (pandas DataFrame) containing the columns above and
            every column in X_COLUMNS. It is not modified.

    Returns:
      - A new DataFrame with the fixes applied.

    NOTE(review): the scaler is fitted on all rows passed in. main() calls this
    before splitting into train/test, so the test rows contribute to the
    scaling statistics.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    df = df.copy()

    # Handle NA values.
    na_columns = ['d_lcom*', 'd_tcc', 'd_lcc', 'lcom*', 'tcc', 'lcc']
    df[na_columns] = df[na_columns].fillna(-1)

    # Scale only the numeric columns from X_COLUMNS.
    numeric_cols = df[X_COLUMNS].select_dtypes(include=['int64', 'float64']).columns
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

# Apply the NA handling and scaling to the frame loaded earlier; `df` is
# rebound to the result of fix_df().
df = fix_df(df)

In [9]:
# CSV files that main() processes, in this order. Commented-out entries are
# skipped; uncomment a line to include that project in the run.
DATASETS = ["merged_datasets/new/broadleaf_merged_df.csv",
            #"merged_datasets/new/camel_merged_df.csv",
            #"merged_datasets/new/dubbo_merged_df.csv",
            #"merged_datasets/new/elasticsearch_merged_df.csv",
            #"merged_datasets/new/guava_merged_df.csv",
            #"merged_datasets/new/jdk_merged_df.csv",
            #"merged_datasets/new/jgroups_merged_df.csv",
            #"merged_datasets/new/kafka_merged_df.csv",
            "merged_datasets/new/spark_merged_df.csv",
            #"merged_datasets/new/spring-boot_merged_df.csv",
            #"merged_datasets/new/spring-framework_merged_df.csv",
            #"merged_datasets/new/tomcat_merged_df.csv",
            ]

In [10]:
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils import get_column_letter
from openpyxl.formatting.rule import ColorScaleRule

def write_formatted_report(ws, df, start_row, start_col):
    """
    Writes the DataFrame `df` into worksheet `ws` starting at (start_row, start_col).

    - The header row is formatted with bold text and a light gray fill.
    - For any row whose first cell (the "class" column) equals "accuracy"
      (case-insensitive), the function writes:
         • The label "accuracy" in the first column.
         • The accuracy value (taken from the second value of the row)
           into the cell under the "f1-score" header, if such a header exists.
         • An empty string into every other cell of the row, except the
           "support" column, which is not written at all.
    - After writing the table, a red/yellow/green color scale (0 / 0.5 / 1) is
      applied to the data cells from the second column up to the column before
      "support" (or up to the last column when there is no "support" header).
      Any empty cell in that range is cleared of fill. This step is skipped
      when that range would be empty (no data rows, or no column between the
      first column and "support").

    Float values are formatted with two decimals.

    Parameters:
      - ws: openpyxl worksheet to write into.
      - df: Table to write; the first column holds the row labels.
      - start_row, start_col: 1-based position of the table's top-left (header) cell.

    Returns the total number of rows written (header + data).
    """
    header_font = Font(bold=True)
    header_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

    headers = list(df.columns)
    num_cols = len(headers)
    total_rows = df.shape[0] + 1  # header + data rows

    # Normalize the headers once; str() keeps non-string column labels from
    # crashing the lookups below.
    normalized_headers = [str(h).strip().lower() for h in headers]
    f1_idx = normalized_headers.index("f1-score") if "f1-score" in normalized_headers else None
    support_idx = normalized_headers.index("support") if "support" in normalized_headers else None

    # Write the header row.
    for j, col_name in enumerate(headers, start=start_col):
        cell = ws.cell(row=start_row, column=j, value=col_name)
        cell.font = header_font
        cell.fill = header_fill

    # Write data rows.
    for i, row_data in enumerate(df.values, start=start_row + 1):
        is_accuracy_row = len(row_data) > 0 and str(row_data[0]).strip().lower() == "accuracy"
        if is_accuracy_row:
            # Write the "accuracy" label in the first column.
            ws.cell(row=i, column=start_col, value=row_data[0])
            # Write the accuracy value into the "f1-score" column.
            if f1_idx is not None and len(row_data) > 1:
                accuracy_value = row_data[1]  # the accuracy value is read from the second column
                cell = ws.cell(row=i, column=start_col + f1_idx, value=accuracy_value)
                if isinstance(accuracy_value, float):
                    cell.number_format = "0.00"
            # Blank out the remaining columns, leaving the f1-score cell
            # (already written) and any "support" cell untouched.
            for idx in range(1, num_cols):
                if idx == f1_idx or normalized_headers[idx] == "support":
                    continue
                ws.cell(row=i, column=start_col + idx, value="")
        else:
            # Normal row: write each cell value.
            for j, value in enumerate(row_data, start=start_col):
                cell = ws.cell(row=i, column=j, value=value)
                if isinstance(value, float):
                    cell.number_format = "0.00"

    # Range of score cells for the color scale: everything after the first
    # ("class") column and before the "support" column (if present).
    numeric_start_col = start_col + 1
    if support_idx is not None:
        numeric_end_col = start_col + support_idx - 1
    else:
        numeric_end_col = start_col + num_cols - 1
    numeric_start_row = start_row + 1
    numeric_end_row = start_row + df.shape[0]

    # An inverted range (e.g. "B2:A5") is not a valid reference; there is
    # simply nothing to format in that case.
    has_score_cells = numeric_end_col >= numeric_start_col and numeric_end_row >= numeric_start_row
    if has_score_cells:
        # Build range string (e.g. "B2:D10").
        cell_range = (
            f"{get_column_letter(numeric_start_col)}{numeric_start_row}:"
            f"{get_column_letter(numeric_end_col)}{numeric_end_row}"
        )

        # Define a color scale: low values in red, mid in yellow, high in green.
        color_scale_rule = ColorScaleRule(
            start_type="num", start_value=0, start_color="F8696B",
            mid_type="num", mid_value=0.5, mid_color="FFEB84",
            end_type="num", end_value=1, end_color="63BE7B"
        )
        ws.conditional_formatting.add(cell_range, color_scale_rule)

        # Remove fill from any empty cells in the numeric range.
        for row in ws[cell_range]:
            for cell in row:
                if cell.value in [None, ""]:
                    cell.fill = PatternFill(fill_type=None)

    return total_rows

###############################################################################
# Main export function.
###############################################################################
def _report_to_frame(report):
    """Convert a classification-report dict into a table with one row per class/aggregate."""
    return pd.DataFrame(report).T.reset_index().rename(columns={"index": "class"})


def export_results_to_excel(results, output_file):
    """
    Groups the classification reports in `results` by dataset and writes them
    all into a single worksheet.

    For each dataset, the random-split and date-split reports are written side by side,
    with a title row above each table (e.g. "Dataset1 random split" and "Dataset1 date split").
    A title is only written for a table that is actually present; when the
    random-split report is missing, the date-split table starts in column 1.
    Each dataset block is written on a new set of rows (separated by a blank gap).

    Parameters:
      - results: dict mapping "<dataset_name>_random_split" /
                 "<dataset_name>_date_split" to a classification report dict
                 (as returned by classification_report(..., output_dict=True)).
                 Keys that do not contain at least two underscores are reported
                 and skipped.
      - output_file: Path of the .xlsx file to write.

    Finally, the workbook is saved to `output_file`.
    """
    # Group the reports by dataset name.
    grouped_results = {}
    for key, report in results.items():
        try:
            ds_name, split_type, _ = key.rsplit("_", 2)
        except ValueError:
            # Not of the form "<dataset>_<split>_split"; say so instead of
            # silently dropping the report.
            print(f"Skipping result with unexpected key: {key}")
            continue
        grouped_results.setdefault(ds_name, {})[split_type] = report

    # Create a new workbook with one worksheet.
    wb = Workbook()
    ws = wb.active
    ws.title = "Classification Reports"

    current_row = 1  # starting row on the sheet
    gap_rows = 2     # blank rows between dataset blocks

    for ds_name, splits in grouped_results.items():
        df_random = _report_to_frame(splits["random"]) if "random" in splits else None
        df_date = _report_to_frame(splits["date"]) if "date" in splits else None
        if df_random is None and df_date is None:
            # No recognized split type for this dataset: nothing to write.
            continue

        # The date table goes to the right of the random table, leaving two
        # empty columns in between; without a random table it starts in column 1.
        date_table_start_col = df_random.shape[1] + 3 if df_random is not None else 1

        # Titles go on the current row, the tables start on the next one.
        start_table_row = current_row + 1
        rows_random = 0
        rows_date = 0
        if df_random is not None:
            ws.cell(row=current_row, column=1, value=f"{ds_name} random split")
            rows_random = write_formatted_report(ws, df_random, start_table_row, 1)
        if df_date is not None:
            ws.cell(row=current_row, column=date_table_start_col, value=f"{ds_name} date split")
            rows_date = write_formatted_report(ws, df_date, start_table_row, date_table_start_col)

        # Determine block height (title row + maximum table height).
        block_height = 1 + max(rows_random, rows_date)
        current_row += block_height + gap_rows

    wb.save(output_file)
    print(f"Workbook saved as {output_file}")


In [None]:
from sklearn.feature_selection import SelectFromModel
from mrmr import mrmr_classif

def select_features(x, y, method="model", estimator=None, threshold="median", 
                    correlation_threshold=0.9, num_features=50):
    """
    Select features using different methods.

    Parameters:
      - x: Training features (pandas DataFrame)
      - y: Training labels (pandas Series, or a single-column DataFrame, which
           is converted to a Series before use)
      - method: Which feature selection method to use (case-insensitive).
                Options are:
                  "model" : Model-based feature selection using SelectFromModel.
                  "cbfs"  : Correlation-based feature selection (CBFS).
                  "mrmr"  : Minimum Redundancy Maximum Relevance (MRMR) using mrmr_selection.
      - estimator: (For "model" method) An estimator to compute feature importances.
                   If None, an XGBClassifier is used. The estimator is fitted on (x, y).
      - threshold: (For "model" method) The threshold for feature importance in SelectFromModel.
                   Default is "median".
      - correlation_threshold: (For "cbfs" method) Threshold to drop highly correlated features.
      - num_features: (For "mrmr" method) Number of features to select.

    Returns:
      - A list of selected feature names.

    Raises:
      - ValueError: if `method` is not one of the supported options.
    """
    # Callers in this notebook pass the labels as df[[label]], i.e. a
    # single-column DataFrame; the estimators and mrmr_classif work on 1-D labels.
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        y = y.iloc[:, 0]

    # str() so that a non-string method ends in the ValueError below rather
    # than an AttributeError here.
    method = str(method).lower()

    if method == "model":
        if estimator is None:
            estimator = XGBClassifier(tree_method="hist", device="cuda", random_state=42)
        estimator.fit(x, y)
        selector = SelectFromModel(estimator, threshold=threshold, prefit=True)
        return list(x.columns[selector.get_support()])

    if method == "cbfs":
        # Absolute pairwise correlations; only the strict upper triangle is
        # kept so that each pair is looked at once and, of two highly
        # correlated features, the one appearing later in x is dropped.
        corr_matrix = x.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
        too_correlated = (upper > correlation_threshold).any(axis=0)
        to_drop = set(too_correlated[too_correlated].index)
        return [col for col in x.columns if col not in to_drop]

    if method == "mrmr":
        # Pass the DataFrame itself (not x.values) so that mrmr_classif can
        # return column names.
        return mrmr_classif(x, y, K=num_features)

    raise ValueError("Method must be one of 'model', 'cbfs', or 'mrmr'")

In [12]:
# Module-level accumulators that main() fills in:
#   results         - classification reports (dicts), one entry per dataset and split
#   best_param_dict - hyperparameters returned by hyperparameter_optimization()
# Re-running this cell resets both to empty dicts.
results = {}
best_param_dict = {}

In [16]:
def _run_split_experiment(dataset_name, split_name, x_train, x_test, y_train, y_test,
                          feature_selection_method):
    """
    Run feature selection, hyperparameter search, training and evaluation for
    one train/test split of one dataset.

    The classification report is stored in `results` and the tuned
    hyperparameters in `best_param_dict`, both under the key
    "<dataset_name>_<split_name>_split".
    """
    result_key = f"{dataset_name}_{split_name}_split"

    # Feature selection sees the training rows only, restricted to X_COLUMNS.
    selected_features = select_features(x_train[X_COLUMNS], y_train,
                                        method=feature_selection_method, num_features=50)
    print(f"Selected features for {dataset_name} ({split_name} split): {selected_features}")

    # Optimize hyperparameters on the selected features.
    best_params = hyperparameter_optimization(x_train[selected_features], y_train,
                                              n_splits=3, n_trials=30)

    # The tuned parameters do not carry a seed; fix one so the final model is
    # reproducible (a seed inside best_params, if any, takes precedence).
    model_params = {"random_state": 42, **best_params}
    best_model = XGBClassifier(**model_params, tree_method="hist", device="cuda")
    best_model.fit(x_train[selected_features], y_train)
    y_pred = best_model.predict(x_test[selected_features])

    results[result_key] = classification_report(y_test, y_pred, output_dict=True)
    best_param_dict[result_key] = best_params


def main():
    """
    Evaluate XGBoost on every dataset in DATASETS with a random 80/20 split and
    a chronological 80/20 split, then export the reports and tuned parameters.

    Splits whose report is already present in `results` are not recomputed, so
    an interrupted run can be resumed by calling main() again.

    Side effects: fills the module-level `results` and `best_param_dict`,
    writes "classification_reports.xlsx" and "best_params.json".
    """
    # One of: "mrmr", "cbfs", "model".
    feature_selection_method = "mrmr"

    for dataset in DATASETS:
        # File name of the CSV, e.g. "broadleaf_merged_df.csv".
        dataset_name = dataset.replace("\\", "/").rsplit("/", 1)[-1]

        random_key = f"{dataset_name}_random_split"
        date_key = f"{dataset_name}_date_split"
        if random_key in results and date_key in results:
            print(f"Skipping {dataset_name} (already processed)")
            continue

        print(f"Processing {dataset_name}")
        df = fix_df(pd.read_csv(dataset))
        features = df[X_COLUMNS + INFO_COLUMNS]
        labels = df[Y_COLUMN]

        # -------------------------
        # Random split approach
        # -------------------------
        if random_key not in results:
            x_train, x_test, y_train, y_test = train_test_split(
                features, labels, test_size=0.2, random_state=42)
            _run_split_experiment(dataset_name, "random", x_train, x_test, y_train, y_test,
                                  feature_selection_method)

        # -------------------------
        # Date split approach
        # -------------------------
        if date_key not in results:
            x_train_date, x_test_date, y_train_date, y_test_date = train_test_split_by_date(
                features, labels, test_size=0.2)
            _run_split_experiment(dataset_name, "date", x_train_date, x_test_date,
                                  y_train_date, y_test_date, feature_selection_method)

    # Export the classification reports to Excel.
    export_results_to_excel(results, output_file="classification_reports.xlsx")

    # save best params as json
    with open('best_params.json', 'w') as f:
        json.dump(best_param_dict, f)

In [14]:
# NOTE: the export/save code below is wrapped in a string literal, so it is NOT
# executed -- this cell only evaluates to (and displays) that string.
# main() performs the same export and JSON dump at its end.
"""export_results_to_excel(results, output_file="classification_reports.xlsx")

# save best params as json
with open('best_params.json', 'w') as f:
    json.dump(best_param_dict, f)"""

'export_results_to_excel(results, output_file="classification_reports.xlsx")\n\n# save best params as json\nwith open(\'best_params.json\', \'w\') as f:\n    json.dump(best_param_dict, f)'

In [17]:
# Run the whole pipeline for every entry in DATASETS; see main() for the
# files it writes.
main()

Processing broadleaf_merged_df.csv


100%|██████████| 50/50 [00:13<00:00,  3.77it/s]
[I 2025-02-11 21:43:06,557] A new study created in memory with name: no-name-5a6a4b96-48ef-493b-b549-b47a2a956ba1


Selected features for broadleaf_merged_df.csv (random split): ['entrophy', 'd_synchronizedMethodsQty', 'changed_file_count', 'ld', 'dit', 'la', 'd_dit', 'lines_per_file', 'd_cbo', 'defaultMethodsQty', 'loc', 'd_cboModified', 'd_fanout', 'stringLiteralsQty', 'd_maxNestedBlocksQty', 'd_publicMethodsQty', 'modifiers', 'rfc', 'd_totalFieldsQty', 'anonymousClassesQty', 'd_totalMethodsQty', 'd_tcc', 'd_visibleMethodsQty', 'loopQty', 'd_staticMethodsQty', 'd_anonymousClassesQty', 'd_loc', 'cbo', 'd_fanin', 'variablesQty', 'd_protectedFieldsQty', 'fanout', 'd_stringLiteralsQty', 'd_returnQty', 'd_abstractMethodsQty', 'assignmentsQty', 'd_rfc', 'comparisonsQty', 'd_privateFieldsQty', 'd_modifiers', 'd_lcc', 'd_variablesQty', 'd_lcom*', 'wmc', 'd_finalFieldsQty', 'd_wmc', 'd_defaultMethodsQty', 'd_assignmentsQty', 'innerClassesQty', 'nosi']


[I 2025-02-11 21:43:21,982] Trial 0 finished with value: 0.9892938723112765 and parameters: {'n_estimators': 163, 'max_depth': 13, 'learning_rate': 0.05718652626584726, 'subsample': 0.7701021210956728, 'colsample_bytree': 0.6015063896135228, 'gamma': 0.03787959560764957, 'reg_alpha': 2.4954800957371677e-08, 'reg_lambda': 1.5842985608766603e-07, 'scale_pos_weight': 13.952370862922926}. Best is trial 0 with value: 0.9892938723112765.
[I 2025-02-11 21:43:28,165] Trial 1 finished with value: 0.9896235297816967 and parameters: {'n_estimators': 131, 'max_depth': 10, 'learning_rate': 0.12962155095545388, 'subsample': 0.9473179367014943, 'colsample_bytree': 0.5164951089086431, 'gamma': 4.545893110237135e-06, 'reg_alpha': 0.33464787305029314, 'reg_lambda': 4.5084408442302946e-05, 'scale_pos_weight': 16.64488831799694}. Best is trial 1 with value: 0.9896235297816967.
[I 2025-02-11 21:43:31,257] Trial 2 finished with value: 0.9729044302520343 and parameters: {'n_estimators': 92, 'max_depth': 9, '

Selected features for broadleaf_merged_df.csv (date split): ['entrophy', 'lines_per_file', 'changed_file_count', 'dit', 'ld', 'la', 'd_dit', 'd_cbo', 'd_fanout', 'loc', 'd_cboModified', 'abstractMethodsQty', 'd_staticMethodsQty', 'd_maxNestedBlocksQty', 'd_visibleMethodsQty', 'rfc', 'innerClassesQty', 'd_totalFieldsQty', 'd_totalMethodsQty', 'd_anonymousClassesQty', 'stringLiteralsQty', 'd_privateFieldsQty', 'loopQty', 'd_loc', 'modifiers', 'd_publicMethodsQty', 'protectedMethodsQty', 'd_stringLiteralsQty', 'fanin', 'd_defaultFieldsQty', 'd_synchronizedMethodsQty', 'variablesQty', 'd_protectedFieldsQty', 'd_returnQty', 'anonymousClassesQty', 'd_variablesQty', 'assignmentsQty', 'd_modifiers', 'd_rfc', 'nosi', 'd_assignmentsQty', 'd_fanin', 'comparisonsQty', 'd_wmc', 'tcc', 'wmc', 'tryCatchQty', 'lcom', 'synchronizedMethodsQty', 'lcc']


[I 2025-02-11 21:50:38,611] Trial 0 finished with value: 0.9830008395086764 and parameters: {'n_estimators': 267, 'max_depth': 12, 'learning_rate': 0.19812000938365465, 'subsample': 0.9159835709632226, 'colsample_bytree': 0.6281330851504283, 'gamma': 6.439779582047195e-06, 'reg_alpha': 0.4903084574915911, 'reg_lambda': 1.3650107257190959e-08, 'scale_pos_weight': 2.7538973143157346}. Best is trial 0 with value: 0.9830008395086764.
[I 2025-02-11 21:50:47,078] Trial 1 finished with value: 0.9752213133631304 and parameters: {'n_estimators': 295, 'max_depth': 8, 'learning_rate': 0.03441358825110564, 'subsample': 0.5171380334673845, 'colsample_bytree': 0.8551686186807641, 'gamma': 0.08434034757292588, 'reg_alpha': 2.5628100749843442e-08, 'reg_lambda': 6.44650146548965e-07, 'scale_pos_weight': 1.478734958426069}. Best is trial 0 with value: 0.9830008395086764.
[I 2025-02-11 21:50:53,228] Trial 2 finished with value: 0.978542530258871 and parameters: {'n_estimators': 191, 'max_depth': 8, 'lear

Processing spark_merged_df.csv


100%|██████████| 50/50 [00:04<00:00, 10.14it/s]
[I 2025-02-11 21:59:27,694] A new study created in memory with name: no-name-cc6430ce-7789-4015-aff7-792c54de607c


Selected features for spark_merged_df.csv (random split): ['la', 'ld', 'lines_per_file', 'd_defaultFieldsQty', 'changed_file_count', 'd_dit', 'noc', 'privateFieldsQty', 'd_cbo', 'entrophy', 'd_maxNestedBlocksQty', 'd_fanout', 'fanin', 'd_visibleMethodsQty', 'd_returnQty', 'modifiers', 'anonymousClassesQty', 'd_rfc', 'd_publicMethodsQty', 'd_cboModified', 'dit', 'd_lambdasQty', 'd_wmc', 'd_totalMethodsQty', 'totalFieldsQty', 'd_staticFieldsQty', 'finalFieldsQty', 'd_innerClassesQty', 'd_stringLiteralsQty', 'defaultMethodsQty', 'd_lcom*', 'cboModified', 'd_privateFieldsQty', 'staticMethodsQty', 'd_comparisonsQty', 'd_staticMethodsQty', 'd_loc', 'lcom', 'd_finalFieldsQty', 'd_protectedMethodsQty', 'd_assignmentsQty', 'cbo', 'd_tryCatchQty', 'd_lcom', 'fanout', 'd_variablesQty', 'defaultFieldsQty', 'd_totalFieldsQty', 'returnQty', 'd_noc']


[I 2025-02-11 21:59:31,696] Trial 0 finished with value: 0.9398332298349504 and parameters: {'n_estimators': 196, 'max_depth': 6, 'learning_rate': 0.07283682047473714, 'subsample': 0.6124533931684457, 'colsample_bytree': 0.8770931675185111, 'gamma': 8.117162910329642e-05, 'reg_alpha': 0.6207321729289004, 'reg_lambda': 0.5612626315341509, 'scale_pos_weight': 9.384802662615082}. Best is trial 0 with value: 0.9398332298349504.
[I 2025-02-11 21:59:37,574] Trial 1 finished with value: 0.9681248744450967 and parameters: {'n_estimators': 294, 'max_depth': 7, 'learning_rate': 0.19791548449961022, 'subsample': 0.7775918030165336, 'colsample_bytree': 0.816009229114006, 'gamma': 8.439445470864569e-05, 'reg_alpha': 1.9189910999749906e-05, 'reg_lambda': 5.549262864576904e-06, 'scale_pos_weight': 2.2973947694686467}. Best is trial 1 with value: 0.9681248744450967.
[I 2025-02-11 21:59:44,128] Trial 2 finished with value: 0.972877539426756 and parameters: {'n_estimators': 252, 'max_depth': 9, 'learnin

Selected features for spark_merged_df.csv (date split): ['la', 'maxNestedBlocksQty', 'lines_per_file', 'changed_file_count', 'd_dit', 'ld', 'd_defaultFieldsQty', 'd_cbo', 'fanin', 'd_maxNestedBlocksQty', 'd_fanout', 'finalFieldsQty', 'd_visibleMethodsQty', 'd_returnQty', 'entrophy', 'd_publicMethodsQty', 'totalFieldsQty', 'd_rfc', 'anonymousClassesQty', 'd_cboModified', 'd_totalMethodsQty', 'modifiers', 'privateFieldsQty', 'd_wmc', 'd_innerClassesQty', 'd_lcom*', 'publicFieldsQty', 'cboModified', 'd_comparisonsQty', 'noc', 'defaultFieldsQty', 'd_staticMethodsQty', 'd_staticFieldsQty', 'd_lambdasQty', 'dit', 'd_privateFieldsQty', 'd_loc', 'd_stringLiteralsQty', 'defaultMethodsQty', 'cbo', 'd_finalFieldsQty', 'fanout', 'd_assignmentsQty', 'lcom*', 'nosi', 'd_tryCatchQty', 'd_lcom', 'd_variablesQty', 'lcom', 'd_protectedMethodsQty']


[I 2025-02-11 22:02:26,651] Trial 0 finished with value: 0.9316065945779762 and parameters: {'n_estimators': 215, 'max_depth': 7, 'learning_rate': 0.017792424253524772, 'subsample': 0.7928179475583175, 'colsample_bytree': 0.6870705412735962, 'gamma': 0.012589127222859216, 'reg_alpha': 7.01810564026972e-05, 'reg_lambda': 0.006676883012706368, 'scale_pos_weight': 4.5937612521198385}. Best is trial 0 with value: 0.9316065945779762.
[I 2025-02-11 22:02:33,035] Trial 1 finished with value: 0.968586302351537 and parameters: {'n_estimators': 135, 'max_depth': 11, 'learning_rate': 0.07094179531971678, 'subsample': 0.6551770435743616, 'colsample_bytree': 0.6780562859993768, 'gamma': 0.0012995399367446469, 'reg_alpha': 0.7661230131285317, 'reg_lambda': 1.0392917698824152e-05, 'scale_pos_weight': 16.059405001239973}. Best is trial 1 with value: 0.968586302351537.
[I 2025-02-11 22:02:35,778] Trial 2 finished with value: 0.9627223080840239 and parameters: {'n_estimators': 80, 'max_depth': 8, 'learn

Workbook saved as classification_reports.xlsx
