# Settings

## Constants

In [1]:
import os

PATH_DATA = './Dataset'
sensor_data = ['UserInfo.csv', 'Service.csv', 'ContextualFactor.csv', 'Interruptibility.csv']

# Utility Functions

In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score


# Load the Dataset into Dataframe

In [3]:
dataframes = {
    filename: pd.read_csv(os.path.join(PATH_DATA, filename)).reset_index(drop=True)
    for filename in sensor_data
}
dfService = dataframes['Service.csv']
dfContextualFactor = dataframes['ContextualFactor.csv']
dfUserInfo = dataframes['UserInfo.csv']
dfInterruptibility = dataframes['Interruptibility.csv']

# Preprocessing

In [4]:
# Select specific columns from dfContextualFactor, dfService, and dfUserInfo DataFrames
# Combine relevant columns to create a unified dataset
dfContextualFactor_selected_columns=dfContextualFactor[['uid','sid','activity1','activity2','activity3','userRoom','userPosition',]]
dfService_selected_columns=dfService[['weekOfExperiment','dayOfWeek','startTime', 'activityInquiry','availabilityInquiry','speechShadowing_1','speechShadowing_2','speechShadowing_3','speechShadowing_4','speechShadowing_5','continue-to-nextInquiry_1','continue-to-nextInquiry_2','continue-to-nextInquiry_3','continue-to-nextInquiry_4','endTime','endType']]
dfInterruptibility_selected_columns=dfInterruptibility[['SHORT_INTERACTION_availability', 'LONG_INTERACTION_availability']]

## Create binary columns for interaction types
## SHORT_INTERACTION: True if availabilityInquiry is not NaN
# dfCombinedAll['SHORT_INTERACTION_availability'] = dfCombinedAll['availabilityInquiry'].notna()
## LONG_INTERACTION: True if continue-to-nextInquiry_1 is not NaN
# dfCombinedAll['LONG_INTERACTION_availability'] = dfCombinedAll['continue-to-nextInquiry_1'].notna()

# ! For those who want to redefine LONG_INTERACTION with thresholds longer than 3 minutes,
# you can use the following columns:
# 5 minutes  => use continue-to-nextInquiry_2
# 7 minutes  => use continue-to-nextInquiry_3
# 9 minutes  => use continue-to-nextInquiry_4


dfCombinedAll=pd.concat([dfContextualFactor_selected_columns, dfService_selected_columns,dfInterruptibility_selected_columns], axis=1)

In [5]:
# Concatenate and merge the all columns from dfUserInfo (dfUserInfo.csv)
dfUserInfo_selected_columns = dfUserInfo[['uid', 'settingType', 'speakerRoom', 'speakerPosition']]
dfCombinedAll = pd.merge(dfCombinedAll, dfUserInfo_selected_columns, on='uid', how='left')

## Position Processing

In [6]:
# Define function to calculate proximity between user and speaker
def calculate_proximity(row):
    # Return 0 if user and speaker are in different rooms
    if row['userRoom'] != row['speakerRoom']:
        return 0
    # If in the same room
    elif row['userPosition'] == row['speakerPosition']:
        return 2  # Same position
    else:
        return 1  # Different positions (including missing position)

# Apply proximity calculation to create a new 'proximity' column
dfCombinedAll['proximity'] = dfCombinedAll.apply(calculate_proximity, axis=1)

print(dfCombinedAll[['userRoom', 'userPosition', 'speakerRoom', 'speakerPosition', 'proximity']].head(10))


      userRoom userPosition speakerRoom speakerPosition  proximity
0     Bed Room          Bed    Bed Room            Desk          1
1    Rest Room          NaN    Bed Room            Desk          0
2  Living Room          NaN    Bed Room            Desk          0
3  Living Room          NaN    Bed Room            Desk          0
4     Bed Room          Bed    Bed Room            Desk          1
5     Bed Room          Bed    Bed Room            Desk          1
6     Bed Room          Bed    Bed Room            Desk          1
7     Bed Room         Desk    Bed Room            Desk          2
8     Bed Room          Bed    Bed Room            Desk          1
9     Bed Room          Bed    Bed Room            Desk          1


## Activity and Time Processing

In [7]:
# Process activity columns for one-hot encoding
activity_cols = ['activity1', 'activity2', 'activity3']
df_activity = dfContextualFactor[activity_cols].copy()

# Get unique activities across all activity columns, excluding NaN
all_unique_activities = pd.unique(df_activity.values.ravel())
all_unique_activities = [x for x in all_unique_activities if pd.notna(x)]

# Create a DataFrame for one-hot encoding of activities
dfActivity_one_hot_encoding = pd.DataFrame(0, index=df_activity.index, columns=['act_' + str(val) for val in all_unique_activities])

# Perform one-hot encoding for each activity column
for col in activity_cols:
    for val in all_unique_activities:
        dfActivity_one_hot_encoding['act_' + str(val)] |= (df_activity[col] == val).astype(int)

# Concatenate one-hot encoded activity columns to dfCombinedAll
dfActivity_one_hot_encoding
dfCombinedAll = pd.concat([dfCombinedAll, dfActivity_one_hot_encoding], axis=1)

In [8]:
# Convert startTime to datetime and extract total minutes since midnight
dfCombinedAll['startTime'] = pd.to_datetime(dfCombinedAll['startTime'], format='%H:%M:%S', errors='coerce')
dfCombinedAll['minute'] = dfCombinedAll['startTime'].dt.hour * 60 + dfCombinedAll['startTime'].dt.minute

# Map days of the week to numerical values (MON=0, TUE=1, ..., SUN=6)
day_map = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6}
dfCombinedAll['dayOfWeek'] = dfCombinedAll['dayOfWeek'].map(day_map)

# Bin minutes into 30-minute intervals for temporal analysis
dfCombinedAll['minute_bin'] = (dfCombinedAll['minute'] // 30).astype(int)

In [9]:
dfCombinedAll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2901 entries, 0 to 2900
Data columns (total 43 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   uid                                       2901 non-null   int64         
 1   sid                                       2901 non-null   int64         
 2   activity1                                 2901 non-null   object        
 3   activity2                                 112 non-null    object        
 4   activity3                                 3 non-null      object        
 5   userRoom                                  2901 non-null   object        
 6   userPosition                              2414 non-null   object        
 7   weekOfExperiment                          2901 non-null   int64         
 8   dayOfWeek                                 2901 non-null   int64         
 9   startTime                     

# Feature Extraction

In [10]:
# Select features for response prediction
dfFeatresForResponse = dfCombinedAll[['uid',
    'act_Taking a Nap / Sleeping','act_Hygiene','act_Eating','act_Using Media','act_Social Interaction',
    'act_Returning from Outside / Other Rooms','act_Studying / Working','act_Others','act_House Chores',
    'act_Self Caring','act_Visiting Outside / Other Rooms','act_Resting',
    'settingType','userRoom','userPosition','speakerRoom','speakerPosition',
    'minute_bin','dayOfWeek','SHORT_INTERACTION_availability','LONG_INTERACTION_availability']].copy()

# Encode categorical columns using LabelEncoder
categorical_columns = ['settingType', 'userRoom', 'userPosition', 'speakerRoom', 'speakerPosition', 'minute_bin']
label_encoders = defaultdict(LabelEncoder)

# Apply label encoding to each categorical column
for col in categorical_columns:
    dfFeatresForResponse[col] = label_encoders[col].fit_transform(dfFeatresForResponse[col])

# Create a copy of the encoded data for further processing
encoded_data = dfFeatresForResponse.copy()


encoded_data.info()
encoded_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2901 entries, 0 to 2900
Data columns (total 22 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   uid                                       2901 non-null   int64
 1   act_Taking a Nap / Sleeping               2901 non-null   int64
 2   act_Hygiene                               2901 non-null   int64
 3   act_Eating                                2901 non-null   int64
 4   act_Using Media                           2901 non-null   int64
 5   act_Social Interaction                    2901 non-null   int64
 6   act_Returning from Outside / Other Rooms  2901 non-null   int64
 7   act_Studying / Working                    2901 non-null   int64
 8   act_Others                                2901 non-null   int64
 9   act_House Chores                          2901 non-null   int64
 10  act_Self Caring                           2901 non-null   in

Unnamed: 0,uid,act_Taking a Nap / Sleeping,act_Hygiene,act_Eating,act_Using Media,act_Social Interaction,act_Returning from Outside / Other Rooms,act_Studying / Working,act_Others,act_House Chores,...,act_Resting,settingType,userRoom,userPosition,speakerRoom,speakerPosition,minute_bin,dayOfWeek,SHORT_INTERACTION_availability,LONG_INTERACTION_availability
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,12,0,True,False
1,1,0,1,0,0,0,0,0,0,0,...,0,0,7,3,0,1,15,0,False,False
2,1,0,0,1,0,0,0,0,0,0,...,0,0,5,3,0,1,12,1,False,False
3,1,0,0,1,0,0,0,0,0,0,...,0,0,5,3,0,1,13,1,False,False
4,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,18,1,True,False


## Label: SHORT_INTERACTION

In [11]:
# Label distribution before balancing
print(encoded_data['SHORT_INTERACTION_availability'].value_counts())

SHORT_INTERACTION_availability
True     2158
False     743
Name: count, dtype: int64


### Model Building and LOSO CV

In [12]:
# Prepare features (X) and target variables (y) for SHORT_INTERACTION
X = encoded_data.drop(columns=['SHORT_INTERACTION_availability', 'LONG_INTERACTION_availability', 'uid'], axis=1) # Drop target and unrelated columns
y = encoded_data['SHORT_INTERACTION_availability'] # Target variable
groups = encoded_data['uid'] # Group by user ID for Leave-One-Group-Out CV

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models1 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results1_logo = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)
    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1_logo[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 2104, number of negative: 2104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2120, number of negative: 2120
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4240, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2088, number of negative: 2088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2128, number of negative: 2128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 4256, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2081, number of negative: 2081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4162, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2102, number of negative: 2102
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4204, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1998, number of negative: 1998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 3996, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2120, number of negative: 2120
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4240, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2116, number of negative: 2116
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 4232, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2111, number of negative: 2111
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4222, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2015, number of negative: 2015
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4030, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2107, number of negative: 2107
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4214, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	l

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


79:	learn: 0.3507210	total: 68ms	remaining: 17ms
80:	learn: 0.3503302	total: 68.9ms	remaining: 16.2ms
81:	learn: 0.3500618	total: 70.1ms	remaining: 15.4ms
82:	learn: 0.3493520	total: 71ms	remaining: 14.5ms
83:	learn: 0.3487957	total: 71.8ms	remaining: 13.7ms
84:	learn: 0.3482058	total: 72.7ms	remaining: 12.8ms
85:	learn: 0.3475061	total: 73.6ms	remaining: 12ms
86:	learn: 0.3465514	total: 74.5ms	remaining: 11.1ms
87:	learn: 0.3461146	total: 75.3ms	remaining: 10.3ms
88:	learn: 0.3457076	total: 76.3ms	remaining: 9.43ms
89:	learn: 0.3452931	total: 77.1ms	remaining: 8.57ms
90:	learn: 0.3449917	total: 78.2ms	remaining: 7.73ms
91:	learn: 0.3444687	total: 78.9ms	remaining: 6.86ms
92:	learn: 0.3438488	total: 79.8ms	remaining: 6ms
93:	learn: 0.3434822	total: 80.5ms	remaining: 5.14ms
94:	learn: 0.3430775	total: 81.3ms	remaining: 4.28ms
95:	learn: 0.3421117	total: 82.1ms	remaining: 3.42ms
96:	learn: 0.3414470	total: 82.9ms	remaining: 2.56ms
97:	learn: 0.3410231	total: 83.9ms	remaining: 1.71ms
98:	

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


4:	learn: 0.5481456	total: 4.31ms	remaining: 81.9ms
5:	learn: 0.5305464	total: 5.56ms	remaining: 87.1ms
6:	learn: 0.5161991	total: 6.45ms	remaining: 85.7ms
7:	learn: 0.5031961	total: 7.29ms	remaining: 83.9ms
8:	learn: 0.4886982	total: 8.15ms	remaining: 82.5ms
9:	learn: 0.4785169	total: 9.01ms	remaining: 81.1ms
10:	learn: 0.4649798	total: 9.91ms	remaining: 80.2ms
11:	learn: 0.4544028	total: 10.8ms	remaining: 79.3ms
12:	learn: 0.4441123	total: 11.6ms	remaining: 77.6ms
13:	learn: 0.4362772	total: 12.4ms	remaining: 76.5ms
14:	learn: 0.4300315	total: 13.5ms	remaining: 76.7ms
15:	learn: 0.4225105	total: 14.4ms	remaining: 75.7ms
16:	learn: 0.4156728	total: 15.3ms	remaining: 74.9ms
17:	learn: 0.4103054	total: 16.2ms	remaining: 73.9ms
18:	learn: 0.4055727	total: 17.1ms	remaining: 72.7ms
19:	learn: 0.4020202	total: 18ms	remaining: 71.9ms
20:	learn: 0.3981921	total: 18.8ms	remaining: 70.9ms
21:	learn: 0.3952969	total: 19.7ms	remaining: 69.9ms
22:	learn: 0.3924299	total: 20.6ms	remaining: 69.1ms
2

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6539688	total: 1ms	remaining: 99.3ms
1:	learn: 0.6201180	total: 1.86ms	remaining: 91.1ms
2:	learn: 0.5921067	total: 2.83ms	remaining: 91.6ms
3:	learn: 0.5642549	total: 3.76ms	remaining: 90.1ms
4:	learn: 0.5409878	total: 4.64ms	remaining: 88.1ms
5:	learn: 0.5257600	total: 5.48ms	remaining: 85.9ms
6:	learn: 0.5103737	total: 6.41ms	remaining: 85.1ms
7:	learn: 0.4988321	total: 7.2ms	remaining: 82.8ms
8:	learn: 0.4836065	total: 8ms	remaining: 80.9ms
9:	learn: 0.4718635	total: 8.88ms	remaining: 79.9ms
10:	learn: 0.4604390	total: 9.69ms	remaining: 78.4ms
11:	learn: 0.4530065	total: 10.7ms	remaining: 78.2ms
12:	learn: 0.4464572	total: 11.6ms	remaining: 77.3ms
13:	learn: 0.4391641	total: 12.4ms	remaining: 76.1ms
14:	learn: 0.4322036	total: 13.1ms	remaining: 74.4ms
15:	learn: 0.4276202	total: 14ms	remaining: 73.6ms
16:	learn: 0.4227952	total: 14.8ms	remaining: 72.5ms
17:	learn: 0.4191438	total: 15.7ms	remaining: 71.7ms
18:	learn: 0.4162484	total: 16.6ms	remaining: 70.7ms
19:	learn: 0

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


60:	learn: 0.3557964	total: 58.3ms	remaining: 37.3ms
61:	learn: 0.3550575	total: 59.4ms	remaining: 36.4ms
62:	learn: 0.3546774	total: 60.6ms	remaining: 35.6ms
63:	learn: 0.3540003	total: 61.5ms	remaining: 34.6ms
64:	learn: 0.3535265	total: 62.8ms	remaining: 33.8ms
65:	learn: 0.3529900	total: 63.8ms	remaining: 32.9ms
66:	learn: 0.3521823	total: 64.8ms	remaining: 31.9ms
67:	learn: 0.3518269	total: 65.7ms	remaining: 30.9ms
68:	learn: 0.3513936	total: 66.6ms	remaining: 29.9ms
69:	learn: 0.3504291	total: 67.5ms	remaining: 28.9ms
70:	learn: 0.3494797	total: 68.6ms	remaining: 28ms
71:	learn: 0.3485132	total: 69.4ms	remaining: 27ms
72:	learn: 0.3477403	total: 70.2ms	remaining: 26ms
73:	learn: 0.3472096	total: 71.1ms	remaining: 25ms
74:	learn: 0.3464789	total: 71.9ms	remaining: 24ms
75:	learn: 0.3458540	total: 72.7ms	remaining: 23ms
76:	learn: 0.3455360	total: 73.6ms	remaining: 22ms
77:	learn: 0.3450806	total: 74.4ms	remaining: 21ms
78:	learn: 0.3447760	total: 75.3ms	remaining: 20ms
79:	learn: 

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6511075	total: 927us	remaining: 91.8ms
1:	learn: 0.6164987	total: 1.83ms	remaining: 89.9ms
2:	learn: 0.5851852	total: 2.59ms	remaining: 83.7ms
3:	learn: 0.5618373	total: 3.38ms	remaining: 81.2ms
4:	learn: 0.5419235	total: 4.13ms	remaining: 78.5ms
5:	learn: 0.5235975	total: 5.03ms	remaining: 78.8ms
6:	learn: 0.5088547	total: 5.85ms	remaining: 77.7ms
7:	learn: 0.4914763	total: 6.68ms	remaining: 76.9ms
8:	learn: 0.4752153	total: 7.49ms	remaining: 75.7ms
9:	learn: 0.4616958	total: 8.25ms	remaining: 74.3ms
10:	learn: 0.4510291	total: 9.04ms	remaining: 73.1ms
11:	learn: 0.4418446	total: 9.77ms	remaining: 71.7ms
12:	learn: 0.4330143	total: 10.6ms	remaining: 70.6ms
13:	learn: 0.4247581	total: 11.4ms	remaining: 69.9ms
14:	learn: 0.4200676	total: 12.2ms	remaining: 69.1ms
15:	learn: 0.4156013	total: 12.9ms	remaining: 67.9ms
16:	learn: 0.4095088	total: 13.7ms	remaining: 67.1ms
17:	learn: 0.4038418	total: 14.5ms	remaining: 66ms
18:	learn: 0.3996332	total: 15.3ms	remaining: 65.1ms
19:	le

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


9:	learn: 0.4792264	total: 8.84ms	remaining: 79.6ms
10:	learn: 0.4706169	total: 9.82ms	remaining: 79.5ms
11:	learn: 0.4639315	total: 10.6ms	remaining: 77.9ms
12:	learn: 0.4555868	total: 11.5ms	remaining: 76.9ms
13:	learn: 0.4501081	total: 12.4ms	remaining: 75.9ms
14:	learn: 0.4418611	total: 13.3ms	remaining: 75.5ms
15:	learn: 0.4353924	total: 14.2ms	remaining: 74.5ms
16:	learn: 0.4297994	total: 15.1ms	remaining: 73.7ms
17:	learn: 0.4261484	total: 16ms	remaining: 73ms
18:	learn: 0.4214450	total: 16.9ms	remaining: 72.1ms
19:	learn: 0.4173074	total: 17.8ms	remaining: 71.1ms
20:	learn: 0.4134664	total: 18.8ms	remaining: 70.7ms
21:	learn: 0.4110007	total: 20ms	remaining: 70.8ms
22:	learn: 0.4084425	total: 21ms	remaining: 70.5ms
23:	learn: 0.4061171	total: 21.9ms	remaining: 69.2ms
24:	learn: 0.4042943	total: 22.7ms	remaining: 68.1ms
25:	learn: 0.4014424	total: 23.6ms	remaining: 67.1ms
26:	learn: 0.3994619	total: 24.4ms	remaining: 66ms
27:	learn: 0.3980785	total: 25.2ms	remaining: 64.8ms
28:	

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


87:	learn: 0.3476706	total: 72.1ms	remaining: 9.83ms
88:	learn: 0.3472945	total: 73ms	remaining: 9.03ms
89:	learn: 0.3469436	total: 74ms	remaining: 8.22ms
90:	learn: 0.3463272	total: 74.9ms	remaining: 7.41ms
91:	learn: 0.3458911	total: 75.8ms	remaining: 6.59ms
92:	learn: 0.3454224	total: 76.7ms	remaining: 5.77ms
93:	learn: 0.3449521	total: 77.5ms	remaining: 4.95ms
94:	learn: 0.3440611	total: 78.4ms	remaining: 4.13ms
95:	learn: 0.3437448	total: 79.4ms	remaining: 3.31ms
96:	learn: 0.3428132	total: 80.3ms	remaining: 2.48ms
97:	learn: 0.3417884	total: 81.1ms	remaining: 1.66ms
98:	learn: 0.3411920	total: 82ms	remaining: 828us
99:	learn: 0.3404872	total: 82.9ms	remaining: 0us
0:	learn: 0.6545565	total: 1.06ms	remaining: 105ms
1:	learn: 0.6241633	total: 1.91ms	remaining: 93.6ms
2:	learn: 0.5969089	total: 2.78ms	remaining: 90ms
3:	learn: 0.5732355	total: 3.55ms	remaining: 85.3ms
4:	learn: 0.5550818	total: 4.39ms	remaining: 83.4ms
5:	learn: 0.5401909	total: 5.25ms	remaining: 82.2ms
6:	learn: 0.

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


19:	learn: 0.4166657	total: 18.6ms	remaining: 74.3ms
20:	learn: 0.4138298	total: 19.6ms	remaining: 73.7ms
21:	learn: 0.4099417	total: 20.6ms	remaining: 73ms
22:	learn: 0.4082540	total: 21.6ms	remaining: 72.4ms
23:	learn: 0.4055576	total: 22.5ms	remaining: 71.3ms
24:	learn: 0.4025586	total: 23.4ms	remaining: 70.2ms
25:	learn: 0.4010780	total: 24.3ms	remaining: 69.2ms
26:	learn: 0.3996038	total: 25.2ms	remaining: 68ms
27:	learn: 0.3982322	total: 26ms	remaining: 66.9ms
28:	learn: 0.3947744	total: 27ms	remaining: 66ms
29:	learn: 0.3930053	total: 27.7ms	remaining: 64.7ms
30:	learn: 0.3913654	total: 28.7ms	remaining: 63.8ms
31:	learn: 0.3898291	total: 29.6ms	remaining: 62.8ms
32:	learn: 0.3876651	total: 30.4ms	remaining: 61.6ms
33:	learn: 0.3862298	total: 31.3ms	remaining: 60.7ms
34:	learn: 0.3833349	total: 32.2ms	remaining: 59.9ms
35:	learn: 0.3820288	total: 33.2ms	remaining: 59.1ms
36:	learn: 0.3806306	total: 34.1ms	remaining: 58ms
37:	learn: 0.3797931	total: 34.9ms	remaining: 57ms
38:	lea

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6555754	total: 1.05ms	remaining: 104ms
1:	learn: 0.6241006	total: 2.1ms	remaining: 103ms
2:	learn: 0.5980922	total: 3ms	remaining: 97ms
3:	learn: 0.5754017	total: 3.89ms	remaining: 93.5ms
4:	learn: 0.5568439	total: 4.75ms	remaining: 90.2ms
5:	learn: 0.5348792	total: 5.72ms	remaining: 89.7ms
6:	learn: 0.5208745	total: 6.6ms	remaining: 87.7ms
7:	learn: 0.5078473	total: 7.51ms	remaining: 86.3ms
8:	learn: 0.4954945	total: 8.39ms	remaining: 84.8ms
9:	learn: 0.4865803	total: 9.19ms	remaining: 82.7ms
10:	learn: 0.4746157	total: 9.97ms	remaining: 80.7ms
11:	learn: 0.4646863	total: 10.8ms	remaining: 79.2ms
12:	learn: 0.4567433	total: 11.7ms	remaining: 78.3ms
13:	learn: 0.4490223	total: 12.5ms	remaining: 76.9ms
14:	learn: 0.4419669	total: 13.3ms	remaining: 75.7ms
15:	learn: 0.4378865	total: 14.2ms	remaining: 74.4ms
16:	learn: 0.4333556	total: 15.1ms	remaining: 73.6ms
17:	learn: 0.4285101	total: 15.9ms	remaining: 72.5ms
18:	learn: 0.4237976	total: 16.7ms	remaining: 71.3ms
19:	learn: 0

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


19:	learn: 0.4210047	total: 17.6ms	remaining: 70.4ms
20:	learn: 0.4194196	total: 18.3ms	remaining: 68.8ms
21:	learn: 0.4156883	total: 19.5ms	remaining: 69.1ms
22:	learn: 0.4120444	total: 20.4ms	remaining: 68.4ms
23:	learn: 0.4096410	total: 21.4ms	remaining: 67.8ms
24:	learn: 0.4067295	total: 22.2ms	remaining: 66.7ms
25:	learn: 0.4037889	total: 23.1ms	remaining: 65.7ms
26:	learn: 0.4012963	total: 23.9ms	remaining: 64.6ms
27:	learn: 0.3983484	total: 24.8ms	remaining: 63.8ms
28:	learn: 0.3963301	total: 25.7ms	remaining: 62.9ms
29:	learn: 0.3951055	total: 26.6ms	remaining: 62.1ms
30:	learn: 0.3932242	total: 27.5ms	remaining: 61.2ms
31:	learn: 0.3918935	total: 28.4ms	remaining: 60.2ms
32:	learn: 0.3898359	total: 29.3ms	remaining: 59.5ms
33:	learn: 0.3884491	total: 30.1ms	remaining: 58.4ms
34:	learn: 0.3872304	total: 30.9ms	remaining: 57.5ms
35:	learn: 0.3859143	total: 31.8ms	remaining: 56.5ms
36:	learn: 0.3837333	total: 32.7ms	remaining: 55.7ms
37:	learn: 0.3822696	total: 33.8ms	remaining: 

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


74:	learn: 0.3486951	total: 71.7ms	remaining: 23.9ms
75:	learn: 0.3481953	total: 72.8ms	remaining: 23ms
76:	learn: 0.3476190	total: 73.7ms	remaining: 22ms
77:	learn: 0.3470452	total: 74.9ms	remaining: 21.1ms
78:	learn: 0.3465479	total: 76ms	remaining: 20.2ms
79:	learn: 0.3463637	total: 76.8ms	remaining: 19.2ms
80:	learn: 0.3458557	total: 78.1ms	remaining: 18.3ms
81:	learn: 0.3452338	total: 79ms	remaining: 17.3ms
82:	learn: 0.3444425	total: 80.1ms	remaining: 16.4ms
83:	learn: 0.3437048	total: 81ms	remaining: 15.4ms
84:	learn: 0.3432747	total: 82ms	remaining: 14.5ms
85:	learn: 0.3425466	total: 82.9ms	remaining: 13.5ms
86:	learn: 0.3421537	total: 84ms	remaining: 12.5ms
87:	learn: 0.3417065	total: 85.3ms	remaining: 11.6ms
88:	learn: 0.3411427	total: 86.6ms	remaining: 10.7ms
89:	learn: 0.3405726	total: 87.8ms	remaining: 9.75ms
90:	learn: 0.3399640	total: 89ms	remaining: 8.8ms
91:	learn: 0.3392894	total: 90.1ms	remaining: 7.84ms
92:	learn: 0.3387391	total: 91.3ms	remaining: 6.87ms
93:	learn:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6539924	total: 1.05ms	remaining: 104ms
1:	learn: 0.6247162	total: 1.89ms	remaining: 92.5ms
2:	learn: 0.5950981	total: 2.63ms	remaining: 85.1ms
3:	learn: 0.5743052	total: 3.51ms	remaining: 84.2ms
4:	learn: 0.5560487	total: 4.38ms	remaining: 83.3ms
5:	learn: 0.5401743	total: 5.13ms	remaining: 80.4ms
6:	learn: 0.5206125	total: 6.01ms	remaining: 79.9ms
7:	learn: 0.5042045	total: 6.86ms	remaining: 78.9ms
8:	learn: 0.4910786	total: 7.66ms	remaining: 77.5ms
9:	learn: 0.4796726	total: 8.49ms	remaining: 76.4ms
10:	learn: 0.4683951	total: 9.41ms	remaining: 76.1ms
11:	learn: 0.4590041	total: 10.7ms	remaining: 78.1ms
12:	learn: 0.4510206	total: 11.8ms	remaining: 79ms
13:	learn: 0.4427894	total: 12.7ms	remaining: 78ms
14:	learn: 0.4381560	total: 13.7ms	remaining: 77.4ms
15:	learn: 0.4324047	total: 14.5ms	remaining: 76.1ms
16:	learn: 0.4268654	total: 15.3ms	remaining: 74.9ms
17:	learn: 0.4219865	total: 16.2ms	remaining: 73.9ms
18:	learn: 0.4181830	total: 17.1ms	remaining: 72.9ms
19:	lear

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6564588	total: 966us	remaining: 95.7ms
1:	learn: 0.6235117	total: 1.84ms	remaining: 90.3ms
2:	learn: 0.5928634	total: 2.58ms	remaining: 83.3ms
3:	learn: 0.5715604	total: 3.43ms	remaining: 82.3ms
4:	learn: 0.5515084	total: 4.23ms	remaining: 80.4ms
5:	learn: 0.5328525	total: 5.23ms	remaining: 81.9ms
6:	learn: 0.5138486	total: 6.35ms	remaining: 84.3ms
7:	learn: 0.4988853	total: 7.22ms	remaining: 83ms
8:	learn: 0.4844758	total: 8ms	remaining: 80.9ms
9:	learn: 0.4771154	total: 8.55ms	remaining: 76.9ms
10:	learn: 0.4690129	total: 9.34ms	remaining: 75.6ms
11:	learn: 0.4598755	total: 10.2ms	remaining: 75.1ms
12:	learn: 0.4507046	total: 11.1ms	remaining: 74.5ms
13:	learn: 0.4430011	total: 12ms	remaining: 73.6ms
14:	learn: 0.4366333	total: 12.9ms	remaining: 72.9ms
15:	learn: 0.4309415	total: 13.7ms	remaining: 72.2ms
16:	learn: 0.4244507	total: 14.6ms	remaining: 71.1ms
17:	learn: 0.4196403	total: 15.4ms	remaining: 70.3ms
18:	learn: 0.4150978	total: 16.2ms	remaining: 69.2ms
19:	learn: 

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

### K-Fold CV

In [13]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results1_kfold = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1_kfold[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1716, number of negative: 1716
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 3432, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1778, number of negative: 1778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 3556, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6536531	total: 1.21ms	remaining: 119ms
1:	learn: 0.6227726	total: 2.68ms	remaining: 131ms
2:	learn: 0.5884056	total: 3.6ms	remaining: 116ms
3:	learn: 0.5639487	total: 4.54ms	remaining: 109ms
4:	learn: 0.5394201	total: 5.58ms	remaining: 106ms
5:	learn: 0.5217324	total: 6.67ms	remaining: 104ms
6:	learn: 0.5069026	total: 7.73ms	remaining: 103ms
7:	learn: 0.4902600	total: 8.81ms	remaining: 101ms
8:	learn: 0.4794030	total: 9.97ms	remaining: 101ms
9:	learn: 0.4703887	total: 11.2ms	remaining: 100ms
10:	learn: 0.4618540	total: 12.3ms	remaining: 99.7ms
11:	learn: 0.4519918	total: 13.3ms	remaining: 97.2ms
12:	learn: 0.4435051	total: 14.2ms	remaining: 94.9ms
13:	learn: 0.4333382	total: 15ms	remaining: 92.2ms
14:	learn: 0.4237951	total: 16ms	remaining: 90.6ms
15:	learn: 0.4157161	total: 16.9ms	remaining: 88.5ms
16:	learn: 0.4089142	total: 17.7ms	remaining: 86.6ms
17:	learn: 0.4035767	total: 18.6ms	remaining: 84.9ms
18:	learn: 0.3988060	total: 19.6ms	remaining: 83.4ms
19:	learn: 0.39494

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6529720	total: 797us	remaining: 78.9ms
1:	learn: 0.6170222	total: 1.74ms	remaining: 85.1ms
2:	learn: 0.5874868	total: 2.5ms	remaining: 80.8ms
3:	learn: 0.5622815	total: 3.26ms	remaining: 78.1ms
4:	learn: 0.5436234	total: 3.99ms	remaining: 75.7ms
5:	learn: 0.5282429	total: 4.75ms	remaining: 74.3ms
6:	learn: 0.5077982	total: 5.49ms	remaining: 72.9ms
7:	learn: 0.4919153	total: 6.23ms	remaining: 71.7ms
8:	learn: 0.4823170	total: 7.03ms	remaining: 71ms
9:	learn: 0.4678202	total: 7.81ms	remaining: 70.3ms
10:	learn: 0.4587949	total: 8.57ms	remaining: 69.4ms
11:	learn: 0.4504857	total: 9.31ms	remaining: 68.3ms
12:	learn: 0.4399476	total: 10.1ms	remaining: 67.6ms
13:	learn: 0.4298470	total: 10.9ms	remaining: 66.7ms
14:	learn: 0.4216719	total: 11.6ms	remaining: 65.7ms
15:	learn: 0.4143371	total: 12.3ms	remaining: 64.8ms
16:	learn: 0.4103875	total: 13.1ms	remaining: 64ms
17:	learn: 0.4067765	total: 13.9ms	remaining: 63.1ms
18:	learn: 0.4027866	total: 14.6ms	remaining: 62.4ms
19:	learn

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


50:	learn: 0.3556333	total: 39.6ms	remaining: 38.1ms
51:	learn: 0.3550356	total: 40.4ms	remaining: 37.3ms
52:	learn: 0.3545033	total: 41.2ms	remaining: 36.6ms
53:	learn: 0.3533526	total: 42.1ms	remaining: 35.9ms
54:	learn: 0.3521534	total: 42.9ms	remaining: 35.1ms
55:	learn: 0.3513438	total: 43.7ms	remaining: 34.3ms
56:	learn: 0.3509386	total: 44.5ms	remaining: 33.6ms
57:	learn: 0.3500363	total: 45.4ms	remaining: 32.8ms
58:	learn: 0.3491584	total: 46.1ms	remaining: 32.1ms
59:	learn: 0.3483795	total: 47ms	remaining: 31.4ms
60:	learn: 0.3477386	total: 47.9ms	remaining: 30.6ms
61:	learn: 0.3465586	total: 49.1ms	remaining: 30.1ms
62:	learn: 0.3461872	total: 50.1ms	remaining: 29.4ms
63:	learn: 0.3452851	total: 51.2ms	remaining: 28.8ms
64:	learn: 0.3444363	total: 52.2ms	remaining: 28.1ms
65:	learn: 0.3431280	total: 53ms	remaining: 27.3ms
66:	learn: 0.3424332	total: 53.7ms	remaining: 26.5ms
67:	learn: 0.3413738	total: 54.5ms	remaining: 25.7ms
68:	learn: 0.3406611	total: 55.3ms	remaining: 24.8

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


## Label: LONG_INTERACTION

In [14]:
# Label distribution before balancing
print(encoded_data['LONG_INTERACTION_availability'].value_counts())

LONG_INTERACTION_availability
False    1468
True     1433
Name: count, dtype: int64


### Model building and LOSO CV

In [15]:
# Prepare features (X) and target variable (y) for predicting LONG_INTERACTION
X = encoded_data.drop(columns=['LONG_INTERACTION_availability', 'SHORT_INTERACTION_availability','uid'], axis=1)
y = encoded_data['LONG_INTERACTION_availability']
groups = encoded_data['uid']

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models2 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results2_logo = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2_logo[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1425, number of negative: 1425
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2850, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1444, number of negative: 1444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2888, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1365, number of negative: 1365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2730, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1443, number of negative: 1443
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2886, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1458, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2916, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1452, number of negative: 1452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2904, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1402, number of negative: 1402
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2804, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1406, number of negative: 1406
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2812, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1407, number of negative: 1407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2814, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1456, number of negative: 1456
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2912, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1445, number of negative: 1445
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2890, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1423, number of negative: 1423
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2846, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1417, number of negative: 1417
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 2834, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1424, number of negative: 1424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2848, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1424, number of negative: 1424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2848, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1445, number of negative: 1445
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2890, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1410, number of negative: 1410
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2820, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6728474	total: 947us	remaining: 93.8ms
1:	learn: 0.6569232	total: 2.02ms	remaining: 98.8ms
2:	learn: 0.6434571	total: 2.95ms	remaining: 95.3ms
3:	learn: 0.6337466	total: 3.97ms	remaining: 95.2ms
4:	learn: 0.6217621	total: 5.17ms	remaining: 98.2ms
5:	learn: 0.6149952	total: 6.24ms	remaining: 97.8ms
6:	learn: 0.6061168	total: 7.26ms	remaining: 96.4ms
7:	learn: 0.5991345	total: 8.32ms	remaining: 95.6ms
8:	learn: 0.5924261	total: 9.31ms	remaining: 94.2ms
9:	learn: 0.5863084	total: 10.3ms	r

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6735247	total: 1.25ms	remaining: 123ms
1:	learn: 0.6560164	total: 2.48ms	remaining: 122ms
2:	learn: 0.6416915	total: 3.62ms	remaining: 117ms
3:	learn: 0.6296269	total: 5.62ms	remaining: 135ms
4:	learn: 0.6203130	total: 6.85ms	remaining: 130ms
5:	learn: 0.6100476	total: 8.28ms	remaining: 130ms
6:	learn: 0.6025061	total: 9.39ms	remaining: 125ms
7:	learn: 0.5951878	total: 10.5ms	remaining: 120ms
8:	learn: 0.5892003	total: 11.5ms	remaining: 116ms
9:	learn: 0.5821452	total: 12.6ms	remaining: 113ms
10:	learn: 0.5767876	total: 13.7ms	remaining: 111ms
11:	learn: 0.5725558	total: 14.7ms	remaining: 108ms
12:	learn: 0.5691174	total: 16ms	remaining: 107ms
13:	learn: 0.5661488	total: 17.1ms	remaining: 105ms
14:	learn: 0.5630470	total: 18.2ms	remaining: 103ms
15:	learn: 0.5597161	total: 19.3ms	remaining: 101ms
16:	learn: 0.5562396	total: 20.8ms	remaining: 101ms
17:	learn: 0.5540058	total: 22.2ms	remaining: 101ms
18:	learn: 0.5530561	total: 23ms	remaining: 97.9ms
19:	learn: 0.5512618	tota

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


4:	learn: 0.6202614	total: 4.16ms	remaining: 79ms
5:	learn: 0.6121645	total: 5.22ms	remaining: 81.8ms
6:	learn: 0.6034897	total: 5.97ms	remaining: 79.3ms
7:	learn: 0.5976901	total: 6.5ms	remaining: 74.8ms
8:	learn: 0.5903029	total: 7.25ms	remaining: 73.3ms
9:	learn: 0.5838691	total: 8.03ms	remaining: 72.2ms
10:	learn: 0.5787111	total: 8.77ms	remaining: 70.9ms
11:	learn: 0.5748820	total: 9.55ms	remaining: 70ms
12:	learn: 0.5705860	total: 10.3ms	remaining: 68.9ms
13:	learn: 0.5669498	total: 11.1ms	remaining: 68.1ms
14:	learn: 0.5632034	total: 11.8ms	remaining: 67.1ms
15:	learn: 0.5599262	total: 12.6ms	remaining: 66.1ms
16:	learn: 0.5568026	total: 13.3ms	remaining: 64.8ms
17:	learn: 0.5538432	total: 14ms	remaining: 63.9ms
18:	learn: 0.5517841	total: 14.7ms	remaining: 62.8ms
19:	learn: 0.5497205	total: 15.5ms	remaining: 61.9ms
20:	learn: 0.5480061	total: 16.3ms	remaining: 61.3ms
21:	learn: 0.5455385	total: 17ms	remaining: 60.4ms
22:	learn: 0.5437689	total: 17.8ms	remaining: 59.4ms
23:	lear

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


85:	learn: 0.5012128	total: 63.4ms	remaining: 10.3ms
86:	learn: 0.5007915	total: 64.2ms	remaining: 9.59ms
87:	learn: 0.5005799	total: 65ms	remaining: 8.87ms
88:	learn: 0.5003772	total: 66.1ms	remaining: 8.17ms
89:	learn: 0.4999771	total: 67.1ms	remaining: 7.45ms
90:	learn: 0.4997219	total: 68.1ms	remaining: 6.73ms
91:	learn: 0.4992525	total: 69.1ms	remaining: 6.01ms
92:	learn: 0.4990412	total: 70ms	remaining: 5.27ms
93:	learn: 0.4984383	total: 70.9ms	remaining: 4.52ms
94:	learn: 0.4982220	total: 71.7ms	remaining: 3.77ms
95:	learn: 0.4975875	total: 72.4ms	remaining: 3.02ms
96:	learn: 0.4971693	total: 73.1ms	remaining: 2.26ms
97:	learn: 0.4967961	total: 73.9ms	remaining: 1.51ms
98:	learn: 0.4961670	total: 74.7ms	remaining: 754us
99:	learn: 0.4958072	total: 75.4ms	remaining: 0us
0:	learn: 0.6737706	total: 1.09ms	remaining: 108ms
1:	learn: 0.6603303	total: 1.64ms	remaining: 80.4ms
2:	learn: 0.6467684	total: 2.38ms	remaining: 77.1ms
3:	learn: 0.6356922	total: 3.1ms	remaining: 74.3ms
4:	lear

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6746781	total: 1.15ms	remaining: 113ms
1:	learn: 0.6574521	total: 2.09ms	remaining: 102ms
2:	learn: 0.6422497	total: 2.92ms	remaining: 94.5ms
3:	learn: 0.6319662	total: 3.8ms	remaining: 91.3ms
4:	learn: 0.6224665	total: 4.66ms	remaining: 88.5ms
5:	learn: 0.6113831	total: 5.57ms	remaining: 87.2ms
6:	learn: 0.6040724	total: 6.41ms	remaining: 85.2ms
7:	learn: 0.5967782	total: 7.2ms	remaining: 82.8ms
8:	learn: 0.5908748	total: 7.95ms	remaining: 80.3ms
9:	learn: 0.5835709	total: 8.67ms	remaining: 78ms
10:	learn: 0.5776205	total: 9.4ms	remaining: 76ms
11:	learn: 0.5717681	total: 10.2ms	remaining: 74.5ms
12:	learn: 0.5685123	total: 10.9ms	remaining: 73ms
13:	learn: 0.5658185	total: 11.7ms	remaining: 71.6ms
14:	learn: 0.5621214	total: 12.4ms	remaining: 70.3ms
15:	learn: 0.5585388	total: 13.3ms	remaining: 69.7ms
16:	learn: 0.5557762	total: 14.1ms	remaining: 68.9ms
17:	learn: 0.5536294	total: 14.9ms	remaining: 68.1ms
18:	learn: 0.5526334	total: 15.4ms	remaining: 65.9ms
19:	learn: 0.5

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6746947	total: 850us	remaining: 84.2ms
1:	learn: 0.6573923	total: 1.78ms	remaining: 87.2ms
2:	learn: 0.6422426	total: 2.61ms	remaining: 84.5ms
3:	learn: 0.6295911	total: 3.34ms	remaining: 80.2ms
4:	learn: 0.6194261	total: 4.21ms	remaining: 79.9ms
5:	learn: 0.6104675	total: 4.9ms	remaining: 76.8ms
6:	learn: 0.6020905	total: 5.6ms	remaining: 74.4ms
7:	learn: 0.5946437	total: 6.3ms	remaining: 72.5ms
8:	learn: 0.5885906	total: 7.01ms	remaining: 70.9ms
9:	learn: 0.5820377	total: 7.75ms	remaining: 69.8ms
10:	learn: 0.5759542	total: 8.44ms	remaining: 68.3ms
11:	learn: 0.5731522	total: 9ms	remaining: 66ms
12:	learn: 0.5679960	total: 9.7ms	remaining: 64.9ms
13:	learn: 0.5652933	total: 10.4ms	remaining: 64ms
14:	learn: 0.5612146	total: 11.1ms	remaining: 63ms
15:	learn: 0.5583449	total: 11.8ms	remaining: 62.2ms
16:	learn: 0.5556422	total: 12.6ms	remaining: 61.3ms
17:	learn: 0.5530919	total: 13.3ms	remaining: 60.5ms
18:	learn: 0.5493503	total: 14ms	remaining: 59.7ms
19:	learn: 0.546834

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


99:	learn: 0.4926842	total: 71.6ms	remaining: 0us
0:	learn: 0.6748476	total: 899us	remaining: 89.1ms
1:	learn: 0.6572455	total: 1.59ms	remaining: 78ms
2:	learn: 0.6435172	total: 2.26ms	remaining: 73.2ms
3:	learn: 0.6301061	total: 2.94ms	remaining: 70.5ms
4:	learn: 0.6173365	total: 3.64ms	remaining: 69.2ms
5:	learn: 0.6070298	total: 4.32ms	remaining: 67.7ms
6:	learn: 0.5987754	total: 4.97ms	remaining: 66.1ms
7:	learn: 0.5907567	total: 5.65ms	remaining: 65ms
8:	learn: 0.5844378	total: 6.34ms	remaining: 64.1ms
9:	learn: 0.5786278	total: 7ms	remaining: 63ms
10:	learn: 0.5724043	total: 7.66ms	remaining: 62ms
11:	learn: 0.5698764	total: 8.24ms	remaining: 60.4ms
12:	learn: 0.5648238	total: 8.99ms	remaining: 60.1ms
13:	learn: 0.5613392	total: 9.92ms	remaining: 60.9ms
14:	learn: 0.5584593	total: 10.9ms	remaining: 62ms
15:	learn: 0.5546959	total: 11.8ms	remaining: 61.8ms
16:	learn: 0.5522554	total: 12.5ms	remaining: 60.9ms
17:	learn: 0.5493932	total: 13.2ms	remaining: 60.1ms
18:	learn: 0.5464846

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


2:	learn: 0.6435126	total: 3.18ms	remaining: 103ms
3:	learn: 0.6331943	total: 4.14ms	remaining: 99.4ms
4:	learn: 0.6214315	total: 5.08ms	remaining: 96.5ms
5:	learn: 0.6116167	total: 6ms	remaining: 94ms
6:	learn: 0.6036884	total: 7.05ms	remaining: 93.7ms
7:	learn: 0.5960313	total: 8.14ms	remaining: 93.7ms
8:	learn: 0.5897789	total: 8.99ms	remaining: 90.9ms
9:	learn: 0.5847535	total: 9.72ms	remaining: 87.5ms
10:	learn: 0.5787925	total: 10.5ms	remaining: 84.7ms
11:	learn: 0.5741101	total: 11.3ms	remaining: 82.7ms
12:	learn: 0.5698550	total: 12.2ms	remaining: 81.9ms
13:	learn: 0.5670281	total: 13.2ms	remaining: 81ms
14:	learn: 0.5639403	total: 14.1ms	remaining: 80ms
15:	learn: 0.5611994	total: 15ms	remaining: 78.8ms
16:	learn: 0.5581649	total: 15.9ms	remaining: 77.6ms
17:	learn: 0.5550112	total: 16.6ms	remaining: 75.7ms
18:	learn: 0.5524010	total: 17.3ms	remaining: 73.8ms
19:	learn: 0.5498369	total: 18.1ms	remaining: 72.4ms
20:	learn: 0.5476266	total: 19.2ms	remaining: 72.3ms
21:	learn: 0.

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


2:	learn: 0.6434758	total: 3.02ms	remaining: 97.6ms
3:	learn: 0.6325670	total: 3.96ms	remaining: 94.9ms
4:	learn: 0.6184659	total: 4.83ms	remaining: 91.8ms
5:	learn: 0.6102935	total: 5.65ms	remaining: 88.5ms
6:	learn: 0.6028184	total: 6.53ms	remaining: 86.8ms
7:	learn: 0.5954728	total: 7.42ms	remaining: 85.4ms
8:	learn: 0.5891020	total: 8.32ms	remaining: 84.2ms
9:	learn: 0.5816283	total: 9.03ms	remaining: 81.3ms
10:	learn: 0.5741992	total: 9.77ms	remaining: 79ms
11:	learn: 0.5687855	total: 10.6ms	remaining: 77.6ms
12:	learn: 0.5652814	total: 11.6ms	remaining: 77.7ms
13:	learn: 0.5607407	total: 12.5ms	remaining: 76.7ms
14:	learn: 0.5575863	total: 13.3ms	remaining: 75.3ms
15:	learn: 0.5549009	total: 14.1ms	remaining: 73.9ms
16:	learn: 0.5520903	total: 15ms	remaining: 73.1ms
17:	learn: 0.5491349	total: 15.9ms	remaining: 72.2ms
18:	learn: 0.5464677	total: 16.6ms	remaining: 71ms
19:	learn: 0.5444772	total: 17.4ms	remaining: 69.6ms
20:	learn: 0.5419119	total: 18.2ms	remaining: 68.4ms
21:	lea

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


89:	learn: 0.4986635	total: 68.5ms	remaining: 7.62ms
90:	learn: 0.4983879	total: 69.4ms	remaining: 6.86ms
91:	learn: 0.4979787	total: 70.2ms	remaining: 6.1ms
92:	learn: 0.4974495	total: 71.1ms	remaining: 5.35ms
93:	learn: 0.4970255	total: 71.9ms	remaining: 4.59ms
94:	learn: 0.4966578	total: 72.7ms	remaining: 3.82ms
95:	learn: 0.4962707	total: 73.4ms	remaining: 3.06ms
96:	learn: 0.4955750	total: 74.1ms	remaining: 2.29ms
97:	learn: 0.4951806	total: 74.8ms	remaining: 1.53ms
98:	learn: 0.4947706	total: 75.5ms	remaining: 762us
99:	learn: 0.4943733	total: 76.3ms	remaining: 0us
0:	learn: 0.6710035	total: 1.2ms	remaining: 119ms
1:	learn: 0.6545732	total: 1.95ms	remaining: 95.3ms
2:	learn: 0.6422235	total: 2.63ms	remaining: 85.1ms
3:	learn: 0.6315317	total: 3.33ms	remaining: 80ms
4:	learn: 0.6228412	total: 3.97ms	remaining: 75.5ms
5:	learn: 0.6153912	total: 4.63ms	remaining: 72.5ms
6:	learn: 0.6070677	total: 5.33ms	remaining: 70.8ms
7:	learn: 0.6003154	total: 6.01ms	remaining: 69.1ms
8:	learn: 

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6708933	total: 1.04ms	remaining: 104ms
1:	learn: 0.6554796	total: 2.05ms	remaining: 101ms
2:	learn: 0.6418966	total: 2.94ms	remaining: 95.2ms
3:	learn: 0.6289007	total: 3.89ms	remaining: 93.5ms
4:	learn: 0.6198123	total: 4.85ms	remaining: 92.2ms
5:	learn: 0.6123786	total: 5.78ms	remaining: 90.6ms
6:	learn: 0.6023939	total: 6.76ms	remaining: 89.9ms
7:	learn: 0.5956772	total: 7.49ms	remaining: 86.2ms
8:	learn: 0.5894870	total: 8.21ms	remaining: 83.1ms
9:	learn: 0.5834599	total: 8.97ms	remaining: 80.7ms
10:	learn: 0.5772831	total: 9.64ms	remaining: 78ms
11:	learn: 0.5732325	total: 10.4ms	remaining: 76.1ms
12:	learn: 0.5676417	total: 11.1ms	remaining: 74.2ms
13:	learn: 0.5638405	total: 11.8ms	remaining: 72.5ms
14:	learn: 0.5599196	total: 12.5ms	remaining: 71ms
15:	learn: 0.5568653	total: 13.2ms	remaining: 69.4ms
16:	learn: 0.5543577	total: 13.9ms	remaining: 68ms
17:	learn: 0.5519960	total: 14.6ms	remaining: 66.7ms
18:	learn: 0.5491424	total: 15.4ms	remaining: 65.5ms
19:	learn: 

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6739167	total: 962us	remaining: 95.3ms
1:	learn: 0.6569857	total: 1.74ms	remaining: 85.1ms
2:	learn: 0.6447385	total: 2.62ms	remaining: 84.8ms
3:	learn: 0.6335758	total: 3.54ms	remaining: 85ms
4:	learn: 0.6221434	total: 4.49ms	remaining: 85.3ms
5:	learn: 0.6131105	total: 5.3ms	remaining: 83.1ms
6:	learn: 0.6030607	total: 6.25ms	remaining: 83ms
7:	learn: 0.5973908	total: 7.17ms	remaining: 82.5ms
8:	learn: 0.5898852	total: 7.87ms	remaining: 79.6ms
9:	learn: 0.5820247	total: 8.55ms	remaining: 76.9ms
10:	learn: 0.5766723	total: 9.26ms	remaining: 75ms
11:	learn: 0.5712420	total: 9.99ms	remaining: 73.3ms
12:	learn: 0.5669776	total: 10.7ms	remaining: 71.6ms
13:	learn: 0.5628364	total: 11.4ms	remaining: 70ms
14:	learn: 0.5592393	total: 12.1ms	remaining: 68.7ms
15:	learn: 0.5563815	total: 12.9ms	remaining: 67.8ms
16:	learn: 0.5537473	total: 13.7ms	remaining: 66.9ms
17:	learn: 0.5516063	total: 14.5ms	remaining: 66.1ms
18:	learn: 0.5492925	total: 15.3ms	remaining: 65.2ms
19:	learn: 0.

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


73:	learn: 0.5030455	total: 55.1ms	remaining: 19.3ms
74:	learn: 0.5026702	total: 56ms	remaining: 18.7ms
75:	learn: 0.5022147	total: 57ms	remaining: 18ms
76:	learn: 0.5017477	total: 57.7ms	remaining: 17.2ms
77:	learn: 0.5014188	total: 58.5ms	remaining: 16.5ms
78:	learn: 0.5012624	total: 59.2ms	remaining: 15.7ms
79:	learn: 0.5006921	total: 59.9ms	remaining: 15ms
80:	learn: 0.5003697	total: 60.6ms	remaining: 14.2ms
81:	learn: 0.5000732	total: 61.3ms	remaining: 13.5ms
82:	learn: 0.4993752	total: 62.1ms	remaining: 12.7ms
83:	learn: 0.4991525	total: 63ms	remaining: 12ms
84:	learn: 0.4987317	total: 63.9ms	remaining: 11.3ms
85:	learn: 0.4983181	total: 64.7ms	remaining: 10.5ms
86:	learn: 0.4977586	total: 65.6ms	remaining: 9.81ms
87:	learn: 0.4974926	total: 66.3ms	remaining: 9.05ms
88:	learn: 0.4972103	total: 67.1ms	remaining: 8.29ms
89:	learn: 0.4967334	total: 67.8ms	remaining: 7.53ms
90:	learn: 0.4959122	total: 68.5ms	remaining: 6.77ms
91:	learn: 0.4951555	total: 69.2ms	remaining: 6.02ms
92:	l

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6754471	total: 1.01ms	remaining: 99.8ms
1:	learn: 0.6566711	total: 1.79ms	remaining: 87.8ms
2:	learn: 0.6439721	total: 2.65ms	remaining: 85.6ms
3:	learn: 0.6308258	total: 3.51ms	remaining: 84.2ms
4:	learn: 0.6211945	total: 4.24ms	remaining: 80.5ms
5:	learn: 0.6113630	total: 5.2ms	remaining: 81.5ms
6:	learn: 0.6042297	total: 6.06ms	remaining: 80.5ms
7:	learn: 0.5964287	total: 6.99ms	remaining: 80.3ms
8:	learn: 0.5908933	total: 7.91ms	remaining: 80ms
9:	learn: 0.5838353	total: 8.77ms	remaining: 79ms
10:	learn: 0.5783850	total: 9.78ms	remaining: 79.1ms
11:	learn: 0.5740401	total: 10.7ms	remaining: 78.4ms
12:	learn: 0.5704543	total: 11.5ms	remaining: 77.2ms
13:	learn: 0.5676013	total: 12.3ms	remaining: 75.4ms
14:	learn: 0.5644495	total: 13ms	remaining: 73.6ms
15:	learn: 0.5612579	total: 13.7ms	remaining: 72ms
16:	learn: 0.5581067	total: 14.5ms	remaining: 70.6ms
17:	learn: 0.5552219	total: 15.2ms	remaining: 69ms
18:	learn: 0.5533608	total: 15.9ms	remaining: 67.8ms
19:	learn: 0.5

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

### K-Fold CV

In [16]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results2_kfold = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2_kfold[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1188, number of negative: 1188
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2376, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1173, number of negative: 1173
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2346, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1201, number of negative: 1201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2402, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6715702	total: 1.18ms	remaining: 117ms
1:	learn: 0.6538396	total: 2.12ms	remaining: 104ms
2:	learn: 0.6392834	total: 3ms	remaining: 97.1ms
3:	learn: 0.6272745	total: 3.91ms	remaining: 93.8ms
4:	learn: 0.6170082	total: 4.94ms	remaining: 93.9ms
5:	learn: 0.6092339	total: 5.7ms	remaining: 89.2ms
6:	learn: 0.6027320	total: 6.64ms	remaining: 88.2ms
7:	learn: 0.5971089	total: 7.54ms	remaining: 86.7ms
8:	learn: 0.5889696	total: 8.53ms	remaining: 86.2ms
9:	learn: 0.5837884	total: 9.42ms	remain

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6709850	total: 800us	remaining: 79.2ms
1:	learn: 0.6530890	total: 1.57ms	remaining: 76.8ms
2:	learn: 0.6385725	total: 2.25ms	remaining: 72.9ms
3:	learn: 0.6239373	total: 2.96ms	remaining: 71ms
4:	learn: 0.6103134	total: 3.62ms	remaining: 68.8ms
5:	learn: 0.6008130	total: 4.24ms	remaining: 66.4ms
6:	learn: 0.5925854	total: 4.86ms	remaining: 64.5ms
7:	learn: 0.5861364	total: 5.48ms	remaining: 63.1ms
8:	learn: 0.5789559	total: 6.2ms	remaining: 62.6ms
9:	learn: 0.5715430	total: 6.9ms	remaining: 62.1ms
10:	learn: 0.5669046	total: 7.51ms	remaining: 60.7ms
11:	learn: 0.5635532	total: 8.18ms	remaining: 60ms
12:	learn: 0.5596333	total: 8.8ms	remaining: 58.9ms
13:	learn: 0.5562383	total: 9.45ms	remaining: 58.1ms
14:	learn: 0.5515176	total: 10.1ms	remaining: 57.3ms
15:	learn: 0.5482678	total: 10.8ms	remaining: 56.6ms
16:	learn: 0.5446591	total: 11.4ms	remaining: 55.8ms
17:	learn: 0.5421223	total: 12ms	remaining: 54.8ms
18:	learn: 0.5396655	total: 12.7ms	remaining: 54.2ms
19:	learn: 0.

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6715660	total: 841us	remaining: 83.3ms
1:	learn: 0.6555662	total: 1.61ms	remaining: 79ms
2:	learn: 0.6381368	total: 2.31ms	remaining: 74.6ms
3:	learn: 0.6248515	total: 2.93ms	remaining: 70.3ms
4:	learn: 0.6128915	total: 3.56ms	remaining: 67.6ms
5:	learn: 0.6055659	total: 4.24ms	remaining: 66.4ms
6:	learn: 0.5984308	total: 4.94ms	remaining: 65.6ms
7:	learn: 0.5916192	total: 5.64ms	remaining: 64.9ms
8:	learn: 0.5851955	total: 6.29ms	remaining: 63.6ms
9:	learn: 0.5796052	total: 6.94ms	remaining: 62.5ms
10:	learn: 0.5756915	total: 7.56ms	remaining: 61.2ms
11:	learn: 0.5702822	total: 8.21ms	remaining: 60.2ms
12:	learn: 0.5645748	total: 8.84ms	remaining: 59.2ms
13:	learn: 0.5608572	total: 9.51ms	remaining: 58.4ms
14:	learn: 0.5560588	total: 10.2ms	remaining: 57.6ms
15:	learn: 0.5532133	total: 10.9ms	remaining: 57.1ms
16:	learn: 0.5492534	total: 11.6ms	remaining: 56.4ms
17:	learn: 0.5464919	total: 12.2ms	remaining: 55.7ms
18:	learn: 0.5429196	total: 12.9ms	remaining: 55.2ms
19:	le

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


In [17]:
from tabulate import tabulate

# Models list
models = [
    'Random Forest', 'Gradient Boosting', 'XGBoost',
    'LightGBM', 'CatBoost', 'SVM', 'Dummy'
]

# Prepare table data
table_data = []
for model in models:
    row = [model]
    
    # Shell 1: SHORT_INTERACTION, Leave-One-Group-Out
    row.append(results1_logo.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results1_logo.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    # Shell 3: LONG_INTERACTION, Leave-One-Group-Out
    row.append(results2_logo.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results2_logo.get(model, {}).get('Average F1-Score (macro)', 0.0))

    # Shell 2: SHORT_INTERACTION, 5-fold
    row.append(results1_kfold.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results1_kfold.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    # Shell 4: LONG_INTERACTION, 5-fold
    row.append(results2_kfold.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results2_kfold.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    table_data.append(row)

# Define headers
headers = [
    'Model',
    'LOSO CV\nShort\nInteraction\nAcc', 'LOSO CV\nShort\nInteraction\nF1',
    'LOSO CV\nLong\nInteraction\nAcc', 'LOSO CV\nLong\nInteraction\nF1',
    '5-fold CV\nShort\nInteraction\nAcc', '5-fold CV\nShort\nInteraction\nF1',
    '5-fold CV\nLong\nInteraction\nAcc', '5-fold CV\nLong\nInteraction\nF1'
]

# Print the table
print("Machine learning model performance")
print(tabulate(table_data, headers=headers, tablefmt='fancy_grid', floatfmt='.3f', numalign="decimal"))


Machine learning model performance
╒═══════════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╕
│ Model             │       LOSO CV │       LOSO CV │       LOSO CV │       LOSO CV │     5-fold CV │     5-fold CV │     5-fold CV │     5-fold CV │
│                   │         Short │         Short │          Long │          Long │         Short │         Short │          Long │          Long │
│                   │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │
│                   │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │
╞═══════════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Random Forest     │         0.834 │         0.738 │         0.6