In [1]:
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import xgboost as xgb
import cupy as cp
import numpy as np
import json

In [2]:
def train_test_split_by_date(x, y, test_size=0.2):
    """
    Chronological train/test split.

    The features and the target are joined column-wise, ordered by the
    'author_date_unix_timestamp' column, and cut once: the oldest rows form
    the training set and the newest `test_size` fraction forms the test set.
    The target is taken to be the last column of the joined frame.

    Returns (x_train, x_test, y_train, y_test).
    """
    ordered = pd.concat([x, y], axis=1).sort_values(by='author_date_unix_timestamp')
    cutoff = int((1 - test_size) * len(ordered))

    # Rows before the cutoff are the older ones, rows from the cutoff on are the newest.
    older, newer = ordered.iloc[:cutoff], ordered.iloc[cutoff:]

    return older.iloc[:, :-1], newer.iloc[:, :-1], older.iloc[:, -1], newer.iloc[:, -1]


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score

def hyperparameter_optimization(x_train, y_train, n_splits=5, n_trials=50):
    """
    Optimize the hyperparameters of the XGBoost model using cross-validation.

    Every Optuna trial is scored with stratified K-fold cross-validation on
    the training data. The score of a fold is the *negative* Brier score of the
    predicted class-1 probabilities, and the study is created with
    direction='maximize', so a higher objective value means a lower Brier score.

    Parameters:
      - x_train: Training features (pandas DataFrame)
      - y_train: Training labels (pandas Series or single-column DataFrame;
                 the values are flattened with ravel()).
                 NOTE(review): the Brier computation subtracts the labels from
                 the class-1 probability, so binary 0/1 labels are assumed.
      - n_splits: Number of cross-validation splits (default: 5)
      - n_trials: Number of hyperparameter search trials (default: 50)

    Returns:
      - The best hyperparameters found by Optuna (study.best_params). Only the
        values suggested through the trial are included; the fixed
        'random_state' and 'n_jobs' entries are not.
    """
    # Convert the entire training set to NumPy arrays once.
    x_train_np = x_train.to_numpy()
    y_train_np = y_train.to_numpy().ravel()

    # Convert the NumPy arrays to GPU arrays (cupy.ndarray).
    x_train_gpu = cp.asarray(x_train_np)
    y_train_gpu = cp.asarray(y_train_np)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def objective(trial):
        # https://xgboosting.com/most-important-xgboost-hyperparameters-to-tune/
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'min_child_weight': trial.suggest_float('min_child_weight', 1.0, 7.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 20),
            'random_state': 42,
            'n_jobs': -1
        }

        early_stopping_rounds = 50

        cv_scores = []
        # Perform stratified K-fold cross-validation using pre-converted arrays.
        # The fold indices are computed on the CPU copies and reused to slice the GPU copies.
        for train_idx, valid_idx in skf.split(x_train_np, y_train_np):
            # Slice the pre-converted GPU arrays.
            X_tr_gpu = x_train_gpu[train_idx]
            y_tr_gpu = y_train_gpu[train_idx]
            X_val_gpu = x_train_gpu[valid_idx]
            # Use the original NumPy array for validation labels.
            y_val = y_train_np[valid_idx]

            # Configure and train the model.
            model = XGBClassifier(
                **params,
                early_stopping_rounds=early_stopping_rounds,
                tree_method="hist",  # histogram-based tree construction
                device="cuda"        # use GPU for training and prediction
            )
            model.fit(X_tr_gpu,
                        y_tr_gpu,
                        eval_set=[(X_val_gpu, y_val)],
                        verbose=False
                      )

            # Predict the class-1 probability on the validation fold.
            y_pred_prob = model.predict_proba(X_val_gpu)[:, 1]
            y_pred_cpu = cp.asnumpy(y_pred_prob)

            # Negative Brier score for the current fold: the study maximizes,
            # so negating turns "lower Brier is better" into "higher is better".
            fold_score = -np.mean((y_pred_cpu - y_val) ** 2)
            cv_scores.append(fold_score)

        # Return the mean negative Brier score across folds.
        return np.mean(cv_scores)

    try:
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)
    finally:
        # Release GPU memory even when the search is interrupted or fails.
        # The references are dropped first because free_all_blocks() only
        # returns blocks that no live array is still using.
        x_train_gpu = None
        y_train_gpu = None
        cp.get_default_memory_pool().free_all_blocks()

    return study.best_params


In [4]:
# Column groups of the merged dataset.
# INFO_COLUMNS: columns that main() selects alongside X_COLUMNS but does not scale
# or offer to feature selection. 'author_date_unix_timestamp' is the sort key of
# the date-based split.
INFO_COLUMNS = [
                "type", #"linked", "fileschanged",
                "ndev", "age", "exp", "rexp", "sexp",
                "glm_probability", "classification",
                "time_of_day", "day_of_week", "is_weekend",
                "author_experience", "author_ownership",
                "hash","file", "author_date_unix_timestamp"]

# Target column (kept as a list, so df[Y_COLUMN] is a one-column DataFrame).
Y_COLUMN = ['contains_bug']

# Candidate model features. Every name has a "d_"-prefixed counterpart
# (presumably the delta of the metric for the change — TODO confirm).
# Entries that are commented out are deliberately excluded.
X_COLUMNS = ["d_cbo","d_cboModified","d_fanin",
             "d_fanout","d_wmc","d_dit","d_noc",
             "d_rfc","d_lcom","d_lcom*","d_tcc",
             "d_lcc","d_totalMethodsQty","d_staticMethodsQty",
             "d_publicMethodsQty","d_privateMethodsQty","d_protectedMethodsQty",
             "d_defaultMethodsQty","d_visibleMethodsQty","d_abstractMethodsQty",
             "d_finalMethodsQty","d_synchronizedMethodsQty","d_totalFieldsQty",
             "d_staticFieldsQty","d_publicFieldsQty","d_privateFieldsQty",
             "d_protectedFieldsQty","d_defaultFieldsQty","d_finalFieldsQty",
             "d_synchronizedFieldsQty","d_nosi","d_loc","d_returnQty","d_loopQty",
             "d_comparisonsQty","d_tryCatchQty","d_parenthesizedExpsQty","d_stringLiteralsQty",
             "d_numbersQty","d_assignmentsQty","d_mathOperationsQty","d_variablesQty",
             "d_maxNestedBlocksQty","d_anonymousClassesQty","d_innerClassesQty",
             "d_lambdasQty",
             #"d_uniqueWordsQty",
             "d_modifiers",
             #"d_logStatementsQty",
             "cbo","cboModified","fanin","fanout","wmc","dit","noc","rfc","lcom","lcom*",
             "tcc","lcc","totalMethodsQty","staticMethodsQty","publicMethodsQty",
             "privateMethodsQty","protectedMethodsQty","defaultMethodsQty",
             "visibleMethodsQty","abstractMethodsQty","finalMethodsQty",
             "synchronizedMethodsQty","totalFieldsQty","staticFieldsQty",
             "publicFieldsQty","privateFieldsQty","protectedFieldsQty",
             "defaultFieldsQty","finalFieldsQty","synchronizedFieldsQty",
             "nosi","loc","returnQty","loopQty","comparisonsQty",
             "tryCatchQty","parenthesizedExpsQty","stringLiteralsQty",
             "numbersQty","assignmentsQty","mathOperationsQty",
             "variablesQty","maxNestedBlocksQty","anonymousClassesQty",
             "innerClassesQty","lambdasQty",
             #"uniqueWordsQty", # Number of unique words in the source code
             "modifiers",
             #"logStatementsQty", # Number of log statements in the source code
             #"fix",
             # NOTE: "entrophy" is the spelling used here for the column name; it must match the CSV header.
             "entrophy",
             "la","ld",
             #"net_lines_changed","absolute_lines_changed",
             "lines_per_file",
             "changed_file_count",
             #"entropy_bucket",
             ]

In [5]:
import pandas as pd
# Load the merged Tomcat dataset and show its row count.
df = pd.read_csv('merged_datasets/new/tomcat_merged_df.csv')

print(len(df))

126165


In [6]:
# Count the missing values per column and show only the columns that have any.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])


d_lcom*            6607
d_tcc             22281
d_lcc             22281
lcom*              6589
tcc               22114
lcc               22114
classification    59600
dtype: int64


In [7]:
# Distinct values that occur in the 'tcc' column.
print(pd.unique(df['tcc']))
# How often each non-null value occurs.
print(df['tcc'].value_counts())

# Number of missing (NaN) entries in the column.
print(df['tcc'].isna().sum())

[-1.  0. nan]
tcc
 0.0    93641
-1.0    10410
Name: count, dtype: int64
22114


# DECISION
Should we delete the columns that may contain N/A values, or delete the rows that contain N/A values?

If the number of visible methods in a class is less than 2, CK sets the value to -1.
For some reason, some rows have NaN values instead; set those to -1 as well, since the metric is not applicable there.

In [8]:
def fix_df(df):
    """
    Prepare a merged dataset for modelling; currently returns it unchanged.

    XGBoost supports missing values, so the NaN entries are not imputed and
    the affected columns are kept.

    Two alternatives were tried earlier and are intentionally not applied:
      - filling the NaN values of 'd_lcom*', 'd_tcc', 'd_lcc', 'lcom*', 'tcc'
        and 'lcc' with -1;
      - dropping those six columns from the frame and from X_COLUMNS.
    They used to sit below the return statement as unreachable code and were
    removed.

    Parameters:
      - df: The merged dataset (pandas DataFrame).

    Returns:
      - The same DataFrame object that was passed in.
    """
    return df

# Run the exploratory frame through fix_df and rebind `df` to the result.
df = fix_df(df)

In [9]:
# CSV files that main() iterates over. Commented-out entries are skipped;
# uncomment a line to include that project in the run.
DATASETS = ["merged_datasets/new/broadleaf_merged_df.csv",
            #"merged_datasets/new/camel_merged_df.csv",
            #"merged_datasets/new/dubbo_merged_df.csv",
            #"merged_datasets/new/elasticsearch_merged_df.csv",
            #"merged_datasets/new/guava_merged_df.csv",
            #"merged_datasets/new/jdk_merged_df.csv",
            #"merged_datasets/new/jgroups_merged_df.csv",
            #"merged_datasets/new/kafka_merged_df.csv",
            "merged_datasets/new/spark_merged_df.csv",
            #"merged_datasets/new/spring-boot_merged_df.csv",
            #"merged_datasets/new/spring-framework_merged_df.csv",
            #"merged_datasets/new/tomcat_merged_df.csv",
            ]

In [10]:
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils import get_column_letter
from openpyxl.formatting.rule import ColorScaleRule

def write_formatted_report(ws, df, start_row, start_col):
    """
    Render the report table `df` into worksheet `ws`, with its top-left corner
    (the header row) at (start_row, start_col).

    Layout rules:
      - Header cells are bold on a light gray background.
      - Ordinary rows are copied cell by cell; float values get the "0.00"
        number format.
      - A row whose first value is "accuracy" (case-insensitive, surrounding
        whitespace ignored) is special: its label goes into the first column,
        the row's second value is placed under the "f1-score" header, the
        "support" cell is not written at all, and every other cell of the row
        is written as an empty string.
      - A red/yellow/green color scale (0 / 0.5 / 1) is attached to the data
        cells from the second column up to the column before "support" (or up
        to the last column when there is no "support" header). Empty cells in
        that range have their fill reset.

    Returns the number of rows written, header included.
    """
    def locate(wanted):
        # Position of the first header equal to `wanted` (case-insensitive), else None.
        return next(
            (pos for pos, title in enumerate(headers) if title.strip().lower() == wanted),
            None,
        )

    headers = list(df.columns)
    row_count = df.shape[0]

    # --- header row ---------------------------------------------------------
    bold = Font(bold=True)
    gray = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")
    for sheet_col, title in enumerate(headers, start=start_col):
        head_cell = ws.cell(row=start_row, column=sheet_col, value=title)
        head_cell.font = bold
        head_cell.fill = gray

    # --- data rows ----------------------------------------------------------
    for offset, record in enumerate(df.values):
        sheet_row = start_row + 1 + offset

        if str(record[0]).strip().lower() != "accuracy":
            # Ordinary row: copy every value as it is.
            for sheet_col, item in enumerate(record, start=start_col):
                target = ws.cell(row=sheet_row, column=sheet_col, value=item)
                if isinstance(item, float):
                    target.number_format = "0.00"
            continue

        # "accuracy" row: label first, then the single value under "f1-score".
        ws.cell(row=sheet_row, column=start_col, value=record[0])
        f1_pos = locate("f1-score")
        f1_col = None if f1_pos is None else start_col + f1_pos
        if f1_col is not None and len(record) > 1:
            accuracy = record[1]  # the accuracy value is taken from the second column
            target = ws.cell(row=sheet_row, column=f1_col, value=accuracy)
            if isinstance(accuracy, float):
                target.number_format = "0.00"
        # Blank the rest, leaving the f1-score and support cells alone.
        for sheet_col, title in enumerate(headers[1:], start=start_col + 1):
            if sheet_col == f1_col or title.strip().lower() == "support":
                continue
            ws.cell(row=sheet_row, column=sheet_col, value="")

    # --- color scale over the score columns ---------------------------------
    support_pos = locate("support")
    first_col = start_col + 1  # the "class" column is never colored
    last_col = start_col + (len(headers) if support_pos is None else support_pos) - 1
    first_row = start_row + 1
    last_row = start_row + row_count
    # Range string such as "B2:D10".
    cell_range = (
        f"{get_column_letter(first_col)}{first_row}"
        f":{get_column_letter(last_col)}{last_row}"
    )

    ws.conditional_formatting.add(
        cell_range,
        ColorScaleRule(
            start_type="num", start_value=0, start_color="F8696B",
            mid_type="num", mid_value=0.5, mid_color="FFEB84",
            end_type="num", end_value=1, end_color="63BE7B"
        ),
    )

    # Reset the fill of every empty cell inside the colored range.
    for cells in ws[cell_range]:
        for cell in cells:
            if cell.value in (None, ""):
                cell.fill = PatternFill(fill_type=None)

    return row_count + 1

###############################################################################
# Main export function.
###############################################################################
def export_results_to_excel(results, output_file):
    """
    Write all classification reports in `results` to one Excel worksheet.

    `results` maps keys of the form "<dataset_name>_random_split" and
    "<dataset_name>_date_split" to classification-report dicts. Keys that do
    not contain at least two underscores are ignored.

    For each dataset the available reports are placed side by side (random
    split first, then date split), each under its own title cell such as
    "<dataset_name> random split". A title is only written when the matching
    report exists, so a missing report no longer leaves a heading without a
    table. Dataset blocks are stacked vertically with two blank rows between
    them. Entries whose split type is neither "random" nor "date" produce no
    output.

    Parameters:
      - results: dict of report dicts, keyed as described above.
      - output_file: path of the .xlsx file to write.

    Side effects:
      - Saves the workbook to `output_file` and prints a confirmation line.
    """
    # Group the reports by dataset name: {dataset_name: {split_type: report}}.
    grouped_results = {}
    for key, report in results.items():
        parts = key.rsplit("_", 2)
        if len(parts) != 3:
            continue  # not "<dataset>_<split>_split"
        ds_name, split_type, _ = parts
        grouped_results.setdefault(ds_name, {})[split_type] = report

    # Create a new workbook with one worksheet.
    wb = Workbook()
    ws = wb.active
    ws.title = "Classification Reports"

    current_row = 1   # first row of the next dataset block
    gap_rows = 2      # blank rows between dataset blocks
    column_gap = 2    # blank columns between the tables of one dataset

    for ds_name, splits in grouped_results.items():
        # Build one table per available split, in a fixed left-to-right order.
        tables = [
            (
                f"{ds_name} {split_type} split",
                pd.DataFrame(splits[split_type]).T.reset_index().rename(columns={"index": "class"}),
            )
            for split_type in ("random", "date")
            if split_type in splits
        ]
        if not tables:
            continue

        next_col = 1
        tallest = 0
        for title, table in tables:
            # Title on the block's first row, table directly below it.
            ws.cell(row=current_row, column=next_col, value=title)
            rows_written = write_formatted_report(ws, table, current_row + 1, next_col)
            tallest = max(tallest, rows_written)
            next_col += table.shape[1] + column_gap

        # Block height = title row + tallest table.
        current_row += 1 + tallest + gap_rows

    wb.save(output_file)
    print(f"Workbook saved as {output_file}")


In [11]:
from sklearn.feature_selection import SelectFromModel
from mrmr import mrmr_classif

def select_features(x, y, method="model", estimator=None, threshold="median",
                    correlation_threshold=0.9, num_features=50):
    """
    Pick a subset of the columns of `x` with one of three strategies.

    Parameters:
      - x: Training features (pandas DataFrame)
      - y: Training labels (pandas Series)
      - method: Strategy name, matched case-insensitively:
                  "model" : keep the features SelectFromModel retains for a
                            fitted estimator.
                  "cbfs"  : correlation-based selection; of every pair whose
                            absolute correlation exceeds the threshold, the
                            column that appears later is dropped.
                  "mrmr"  : Minimum Redundancy Maximum Relevance through
                            mrmr_classif.
      - estimator: ("model" only) Estimator providing the feature importances.
                   An XGBClassifier is created when None is given. The
                   estimator is fitted on (x, y) by this function.
      - threshold: ("model" only) Importance threshold handed to
                   SelectFromModel. Default is "median".
      - correlation_threshold: ("cbfs" only) Absolute correlation above which
                   a feature is dropped.
      - num_features: ("mrmr" only) Number of features to select.

    Returns:
      - A list of selected feature names.

    Raises:
      - ValueError: if `method` is none of the three supported names.
    """
    strategy = method.lower()

    if strategy == "mrmr":
        # The DataFrame itself (not x.values) is passed so that column names are available.
        return mrmr_classif(x, y, K=num_features)

    if strategy == "cbfs":
        abs_corr = x.corr().abs()
        # Keep only the entries strictly above the diagonal so each pair is seen once.
        above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        upper = abs_corr.where(above_diagonal)
        redundant = {
            name for name in upper.columns
            if (upper[name] > correlation_threshold).any()
        }
        return [name for name in x.columns if name not in redundant]

    if strategy == "model":
        fitted = estimator
        if fitted is None:
            fitted = XGBClassifier(tree_method="hist", device="cuda", random_state=42)
        fitted.fit(x, y)
        keep_mask = SelectFromModel(fitted, threshold=threshold, prefit=True).get_support()
        return list(x.columns[keep_mask])

    raise ValueError("Method must be one of 'model', 'cbfs', or 'mrmr'")

In [12]:
# Notebook-level accumulators that main() fills in and exports.
# Classification reports, keyed "<dataset_name>_random_split" / "<dataset_name>_date_split".
results = {}
# Best hyperparameters returned by the Optuna search for the processed datasets.
best_param_dict = {}

In [13]:
from sklearn.preprocessing import MinMaxScaler

def _evaluate_split(x_train, x_test, y_train, y_test, split_label, dataset_name,
                    feature_selection_method, n_features, n_splits, n_trials,
                    correlation_threshold):
    """
    Run scaling, feature selection, hyperparameter search, final fit and
    evaluation for one train/test split.

    Returns a tuple (classification_report_dict, best_params).
    """
    # Work on copies: the incoming frames may be slices of the dataset frame,
    # and assigning the scaled columns into a slice is unreliable in pandas.
    x_train = x_train.copy()
    x_test = x_test.copy()

    # Scale after the split (fit on the training part only) to avoid leakage.
    scaler = MinMaxScaler()
    x_train[X_COLUMNS] = scaler.fit_transform(x_train[X_COLUMNS])
    x_test[X_COLUMNS] = scaler.transform(x_test[X_COLUMNS])

    # Feature selection sees the training set only, restricted to X_COLUMNS.
    selected_features = select_features(x_train[X_COLUMNS], y_train,
                                        method=feature_selection_method,
                                        num_features=n_features,
                                        correlation_threshold=correlation_threshold)
    print(f"Selected features for {dataset_name} ({split_label} split): {selected_features}")

    # Optimize hyperparameters on the selected features, then refit on the full training set.
    best_params = hyperparameter_optimization(x_train[selected_features], y_train,
                                              n_splits=n_splits, n_trials=n_trials)
    best_model = XGBClassifier(**best_params, tree_method="hist", device="cuda")
    best_model.fit(x_train[selected_features], y_train)
    y_pred = best_model.predict(x_test[selected_features])

    return classification_report(y_test, y_pred, output_dict=True), best_params


def main(result_prefix: str, feature_selection_method="mrmr", n_features = 50, n_splits=5, n_trials=50, correlation_threshold=0.9):
    """
    Train and evaluate an XGBoost classifier on every file in DATASETS, once
    with a random 80/20 split and once with a chronological 80/20 split.

    Parameters:
      - result_prefix: Prefix of the two output files
                       ("<prefix>_classification_reports.xlsx" and
                       "<prefix>_best_params.json").
      - feature_selection_method: Passed to select_features as `method`.
      - n_features: Passed to select_features as `num_features`.
      - n_splits / n_trials: Passed to hyperparameter_optimization.
      - correlation_threshold: Passed to select_features.

    Side effects:
      - Fills the notebook-level dicts `results` and `best_param_dict`. Both
        use the keys "<dataset_name>_random_split" and
        "<dataset_name>_date_split", so the parameters of the two splits no
        longer overwrite each other.
      - Writes the Excel report and the JSON file named above.

    TODO (carried over from the original notes):
      - It might not be best to split the dataset here; the split could be
        moved into the hyperparameter optimization step.
      - Feature selection could also be performed inside the hyperparameter
        optimization step.
    """
    pipeline_settings = dict(
        feature_selection_method=feature_selection_method,
        n_features=n_features,
        n_splits=n_splits,
        n_trials=n_trials,
        correlation_threshold=correlation_threshold,
    )

    for dataset in DATASETS:
        # "merged_datasets/new/<file>.csv" -> "<file>.csv"
        dataset_name = dataset.split("new")[1][1:]

        # Every dataset is processed on every call. An earlier loop here tried
        # to skip datasets already present in `results`, but its `continue`
        # only affected that inner loop and never skipped anything. It was
        # removed instead of made to work, because `results` is shared between
        # calls and is not keyed by `result_prefix`: skipping would export the
        # reports of a previous pipeline under the new prefix.
        print(f"Processing {dataset_name}")
        df = pd.read_csv(dataset)
        df = fix_df(df)

        features = df[X_COLUMNS + INFO_COLUMNS]
        target = df[Y_COLUMN]

        # -------------------------
        # Random split approach
        # -------------------------
        x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                            test_size=0.2, random_state=42)
        report, best_params = _evaluate_split(x_train, x_test, y_train, y_test,
                                              "random", dataset_name, **pipeline_settings)
        results[f"{dataset_name}_random_split"] = report
        best_param_dict[f"{dataset_name}_random_split"] = best_params

        # -------------------------
        # Date split approach
        # -------------------------
        # Experimental idea, not applied: drop the oldest 20% of the rows
        # (sorted by 'author_date_unix_timestamp') because the project evolves
        # over time, and use only the remaining 80% for training + testing.
        x_train_date, x_test_date, y_train_date, y_test_date = train_test_split_by_date(
            features, target, test_size=0.2)
        report, best_params = _evaluate_split(x_train_date, x_test_date, y_train_date, y_test_date,
                                              "date", dataset_name, **pipeline_settings)
        results[f"{dataset_name}_date_split"] = report
        best_param_dict[f"{dataset_name}_date_split"] = best_params

    # Export the classification reports to Excel.
    export_results_to_excel(results, output_file=f"{result_prefix}_classification_reports.xlsx")

    # save best params as json
    with open(f'{result_prefix}_best_params.json', 'w') as f:
        json.dump(best_param_dict, f)

In [14]:
# Disabled cell: a manual export of `results` / `best_param_dict`, wrapped in a
# string literal so that it does not execute (the cell only echoes the string).
"""export_results_to_excel(results, output_file="classification_reports.xlsx")

# save best params as json
with open('best_params.json', 'w') as f:
    json.dump(best_param_dict, f)"""

'export_results_to_excel(results, output_file="classification_reports.xlsx")\n\n# save best params as json\nwith open(\'best_params.json\', \'w\') as f:\n    json.dump(best_param_dict, f)'

In [None]:
# Single run: prefix "mrmr50", mRMR selection with 50 features, 5 CV folds,
# 30 Optuna trials; correlation_threshold is passed as None.
main("mrmr50", "mrmr", 50, 5, 30, None)

Processing broadleaf_merged_df.csv


100%|██████████| 50/50 [00:14<00:00,  3.57it/s]


Selected features for broadleaf_merged_df.csv (random split): ['entrophy', 'd_synchronizedMethodsQty', 'changed_file_count', 'ld', 'dit', 'la', 'd_dit', 'lines_per_file', 'd_cbo', 'defaultMethodsQty', 'loc', 'd_lcom*', 'd_cboModified', 'd_fanout', 'stringLiteralsQty', 'd_maxNestedBlocksQty', 'd_publicMethodsQty', 'modifiers', 'rfc', 'anonymousClassesQty', 'd_totalMethodsQty', 'd_totalFieldsQty', 'loopQty', 'd_visibleMethodsQty', 'd_staticMethodsQty', 'cbo', 'd_loc', 'd_anonymousClassesQty', 'variablesQty', 'd_fanin', 'fanout', 'd_protectedFieldsQty', 'd_stringLiteralsQty', 'abstractMethodsQty', 'assignmentsQty', 'd_returnQty', 'd_rfc', 'comparisonsQty', 'd_modifiers', 'innerClassesQty', 'd_privateFieldsQty', 'wmc', 'd_variablesQty', 'd_defaultMethodsQty', 'd_wmc', 'd_finalFieldsQty', 'd_assignmentsQty', 'nosi', 'protectedFieldsQty', 'd_abstractMethodsQty']


[I 2025-02-16 21:49:30,628] A new study created in memory with name: no-name-5665eafc-a7f1-4fd5-b4d4-14db535b480d
[I 2025-02-16 21:49:49,085] Trial 0 finished with value: -0.03696613386273384 and parameters: {'n_estimators': 244, 'min_child_weight': 1.4761138027825007, 'max_depth': 8, 'learning_rate': 0.277141222850642, 'subsample': 0.6055425561695358, 'colsample_bytree': 0.6994870764883856, 'gamma': 2.7983076138440932e-08, 'reg_alpha': 0.4733136042405537, 'reg_lambda': 4.284366767247417e-06, 'scale_pos_weight': 4.356685874573785}. Best is trial 0 with value: -0.03696613386273384.
[I 2025-02-16 21:50:13,476] Trial 1 finished with value: -0.0855434387922287 and parameters: {'n_estimators': 579, 'min_child_weight': 6.449568410144323, 'max_depth': 5, 'learning_rate': 0.01745541961978112, 'subsample': 0.7291357524498089, 'colsample_bytree': 0.856430612115892, 'gamma': 0.6653997847293184, 'reg_alpha': 0.5869559118320139, 'reg_lambda': 0.13955711359607084, 'scale_pos_weight': 3.8395271090322

Selected features for broadleaf_merged_df.csv (date split): ['entrophy', 'lines_per_file', 'changed_file_count', 'dit', 'ld', 'la', 'd_dit', 'd_cbo', 'd_lcom*', 'loc', 'd_fanout', 'abstractMethodsQty', 'd_cboModified', 'd_maxNestedBlocksQty', 'd_staticMethodsQty', 'rfc', 'd_visibleMethodsQty', 'innerClassesQty', 'd_totalMethodsQty', 'stringLiteralsQty', 'd_anonymousClassesQty', 'd_totalFieldsQty', 'loopQty', 'd_loc', 'd_publicMethodsQty', 'modifiers', 'protectedMethodsQty', 'd_privateFieldsQty', 'fanin', 'd_stringLiteralsQty', 'variablesQty', 'd_synchronizedMethodsQty', 'd_defaultFieldsQty', 'anonymousClassesQty', 'd_protectedFieldsQty', 'd_returnQty', 'assignmentsQty', 'd_variablesQty', 'd_modifiers', 'nosi', 'd_rfc', 'd_assignmentsQty', 'comparisonsQty', 'd_fanin', 'd_wmc', 'wmc', 'tryCatchQty', 'lcom', 'synchronizedMethodsQty', 'd_numbersQty']


[I 2025-02-16 22:13:44,737] Trial 0 finished with value: -0.09909402579069138 and parameters: {'n_estimators': 625, 'min_child_weight': 1.4083248914754132, 'max_depth': 3, 'learning_rate': 0.010854399679494468, 'subsample': 0.8509895127257788, 'colsample_bytree': 0.9629419804468828, 'gamma': 0.04521777564136125, 'reg_alpha': 1.1040773234475578e-08, 'reg_lambda': 7.425927153521369e-05, 'scale_pos_weight': 1.5579726389616362}. Best is trial 0 with value: -0.09909402579069138.
[I 2025-02-16 22:13:51,388] Trial 1 finished with value: -0.24796637892723083 and parameters: {'n_estimators': 202, 'min_child_weight': 2.853590616443156, 'max_depth': 3, 'learning_rate': 0.016787549828171572, 'subsample': 0.6908479499264948, 'colsample_bytree': 0.6083387674628218, 'gamma': 1.8768760351964107e-08, 'reg_alpha': 0.0006537848446137127, 'reg_lambda': 3.868806525666607e-06, 'scale_pos_weight': 9.103964189197939}. Best is trial 0 with value: -0.09909402579069138.
[I 2025-02-16 22:15:57,613] Trial 2 finish

In [63]:
# Experiment grid: one main() run per entry. Each tuple lists main()'s
# positional arguments in order; None marks a setting that is passed as None
# for that feature selection method.
pipelines = [
    # (result_prefix, feature_selection_method, n_features, n_splits, n_trials, correlation_threshold)
    ("mrmr100", "mrmr", 100, 5, 100, None), ## almost all columns
    ("mrmr50", "mrmr", 50, 5, 100, None),
    ("mrmr25", "mrmr", 25, 5, 100, None),
    ("mrmr10", "mrmr", 10, 5, 100, None),
    ("cbfs09", "cbfs", None, 5, 100, 0.9),
    ("cbfs05", "cbfs", None, 5, 100, 0.5),
    ("cbfs01", "cbfs", None, 5, 100, 0.1),
    ("model", "model", None, 5, 100, None),
]

# Run the pipelines one after another; each call writes its own
# "<result_prefix>_classification_reports.xlsx" and "<result_prefix>_best_params.json".
for result_prefix, feature_selection_method, n_features, n_splits, n_trials, correlation_threshold in pipelines:
    print(f"Running pipeline: {result_prefix}")
    main(result_prefix, feature_selection_method, n_features, n_splits, n_trials, correlation_threshold)

Running pipeline: mrmr100
Processing broadleaf_merged_df.csv


100%|██████████| 97/97 [00:15<00:00,  6.08it/s]
[I 2025-02-16 18:16:54,244] A new study created in memory with name: no-name-6e5e97e4-2f29-4e65-a48a-e73b50b428db


Selected features for broadleaf_merged_df.csv (random split): ['entrophy', 'd_synchronizedMethodsQty', 'changed_file_count', 'ld', 'dit', 'la', 'd_dit', 'lines_per_file', 'd_cbo', 'defaultMethodsQty', 'loc', 'd_lcom*', 'd_cboModified', 'd_fanout', 'stringLiteralsQty', 'd_maxNestedBlocksQty', 'd_publicMethodsQty', 'modifiers', 'rfc', 'anonymousClassesQty', 'd_totalMethodsQty', 'd_totalFieldsQty', 'loopQty', 'd_visibleMethodsQty', 'd_staticMethodsQty', 'cbo', 'd_loc', 'd_anonymousClassesQty', 'variablesQty', 'd_fanin', 'fanout', 'd_protectedFieldsQty', 'd_stringLiteralsQty', 'abstractMethodsQty', 'assignmentsQty', 'd_returnQty', 'd_rfc', 'comparisonsQty', 'd_modifiers', 'innerClassesQty', 'd_privateFieldsQty', 'wmc', 'd_variablesQty', 'd_defaultMethodsQty', 'd_wmc', 'd_finalFieldsQty', 'd_assignmentsQty', 'nosi', 'protectedFieldsQty', 'd_abstractMethodsQty', 'lcom', 'protectedMethodsQty', 'd_staticFieldsQty', 'numbersQty', 'd_tryCatchQty', 'tryCatchQty', 'd_numbersQty', 'returnQty', 'd_n

[I 2025-02-16 18:17:10,151] Trial 0 finished with value: 0.9816748023567557 and parameters: {'min_child_weight': 3.4606531476395106, 'max_depth': 5, 'learning_rate': 0.060181309844508146, 'subsample': 0.9545287701510309, 'colsample_bytree': 0.8044004039168953, 'gamma': 0.0008984638269355165, 'reg_alpha': 7.79654839200332e-07, 'reg_lambda': 0.1608748127496966, 'scale_pos_weight': 19.607510073579743}. Best is trial 0 with value: 0.9816748023567557.
[I 2025-02-16 18:17:32,651] Trial 1 finished with value: 0.9844891625166454 and parameters: {'min_child_weight': 4.177232306465051, 'max_depth': 8, 'learning_rate': 0.03743053073587268, 'subsample': 0.6325860986800727, 'colsample_bytree': 0.9310879540549891, 'gamma': 0.0002510567401288779, 'reg_alpha': 0.10930810534596618, 'reg_lambda': 9.616431934884078e-05, 'scale_pos_weight': 17.122377948544617}. Best is trial 1 with value: 0.9844891625166454.
[I 2025-02-16 18:18:04,353] Trial 2 finished with value: 0.9901021782070811 and parameters: {'min_

Selected features for broadleaf_merged_df.csv (date split): ['entrophy', 'lines_per_file', 'changed_file_count', 'dit', 'ld', 'la', 'd_dit', 'd_cbo', 'd_lcom*', 'loc', 'd_fanout', 'abstractMethodsQty', 'd_cboModified', 'd_maxNestedBlocksQty', 'd_staticMethodsQty', 'rfc', 'd_visibleMethodsQty', 'innerClassesQty', 'd_totalMethodsQty', 'stringLiteralsQty', 'd_anonymousClassesQty', 'd_totalFieldsQty', 'loopQty', 'd_loc', 'd_publicMethodsQty', 'modifiers', 'protectedMethodsQty', 'd_privateFieldsQty', 'fanin', 'd_stringLiteralsQty', 'variablesQty', 'd_synchronizedMethodsQty', 'd_defaultFieldsQty', 'anonymousClassesQty', 'd_protectedFieldsQty', 'd_returnQty', 'assignmentsQty', 'd_variablesQty', 'd_modifiers', 'nosi', 'd_rfc', 'd_assignmentsQty', 'comparisonsQty', 'd_fanin', 'd_wmc', 'wmc', 'tryCatchQty', 'lcom', 'synchronizedMethodsQty', 'd_numbersQty', 'defaultMethodsQty', 'd_tryCatchQty', 'numbersQty', 'd_abstractMethodsQty', 'cbo', 'd_protectedMethodsQty', 'd_finalFieldsQty', 'fanout', 'd_

[I 2025-02-16 19:20:01,581] Trial 0 finished with value: 0.9752955055399948 and parameters: {'min_child_weight': 1.5586067800077466, 'max_depth': 11, 'learning_rate': 0.020090833287023598, 'subsample': 0.6535792501433972, 'colsample_bytree': 0.9356774798129577, 'gamma': 1.2892196230143814e-06, 'reg_alpha': 1.1236078022909087e-08, 'reg_lambda': 6.608232004948224e-07, 'scale_pos_weight': 18.506287594198266}. Best is trial 0 with value: 0.9752955055399948.
[I 2025-02-16 19:20:23,886] Trial 1 finished with value: 0.9626798197630689 and parameters: {'min_child_weight': 6.935316989682655, 'max_depth': 6, 'learning_rate': 0.03392651474608325, 'subsample': 0.5969765606913369, 'colsample_bytree': 0.8112658125128185, 'gamma': 0.0007901868881262388, 'reg_alpha': 0.0013642447766282343, 'reg_lambda': 0.000732318510973125, 'scale_pos_weight': 14.701425120692189}. Best is trial 0 with value: 0.9752955055399948.
[I 2025-02-16 19:21:03,766] Trial 2 finished with value: 0.9803391846497034 and parameters

Processing camel_merged_df.csv


100%|██████████| 97/97 [00:49<00:00,  1.94it/s]


Selected features for camel_merged_df.csv (random split): ['entrophy', 'd_innerClassesQty', 'changed_file_count', 'ld', 'd_tcc', 'la', 'd_publicMethodsQty', 'stringLiteralsQty', 'lines_per_file', 'nosi', 'dit', 'd_lcc', 'lcom*', 'd_parenthesizedExpsQty', 'd_mathOperationsQty', 'assignmentsQty', 'tcc', 'd_finalFieldsQty', 'd_defaultMethodsQty', 'fanout', 'wmc', 'd_maxNestedBlocksQty', 'maxNestedBlocksQty', 'lcc', 'loc', 'd_visibleMethodsQty', 'cbo', 'defaultFieldsQty', 'variablesQty', 'd_modifiers', 'd_protectedMethodsQty', 'rfc', 'returnQty', 'd_assignmentsQty', 'mathOperationsQty', 'd_totalMethodsQty', 'numbersQty', 'd_dit', 'protectedMethodsQty', 'cboModified', 'd_finalMethodsQty', 'd_lcom*', 'd_comparisonsQty', 'lcom', 'totalMethodsQty', 'comparisonsQty', 'd_staticFieldsQty', 'visibleMethodsQty', 'd_numbersQty', 'staticMethodsQty', 'protectedFieldsQty', 'loopQty', 'd_variablesQty', 'd_tryCatchQty', 'privateMethodsQty', 'defaultMethodsQty', 'd_privateMethodsQty', 'abstractMethodsQty'

[I 2025-02-16 20:47:40,699] A new study created in memory with name: no-name-4eccb81b-e1db-4600-aab5-8378ccda756f
[I 2025-02-16 20:48:51,271] Trial 0 finished with value: 0.9829138010401983 and parameters: {'min_child_weight': 1.6956357177750547, 'max_depth': 10, 'learning_rate': 0.06485476300902969, 'subsample': 0.9600685631999792, 'colsample_bytree': 0.9057854113009062, 'gamma': 1.2641099223549515e-08, 'reg_alpha': 0.9814787812981973, 'reg_lambda': 0.3036349845828794, 'scale_pos_weight': 12.010032763970608}. Best is trial 0 with value: 0.9829138010401983.
[I 2025-02-16 20:49:28,085] Trial 1 finished with value: 0.9082572782832254 and parameters: {'min_child_weight': 6.797169518591958, 'max_depth': 5, 'learning_rate': 0.019764711901302075, 'subsample': 0.5941064946669898, 'colsample_bytree': 0.8288892287645009, 'gamma': 2.7757729501625754e-07, 'reg_alpha': 0.0028543621656269592, 'reg_lambda': 0.020175705951680677, 'scale_pos_weight': 16.853479148665578}. Best is trial 0 with value: 0.

KeyboardInterrupt: 