# Settings

## Constants

In [1]:
import os

PATH_DATA = './Dataset'
sensor_data = ['UserInfo.csv', 'Service.csv', 'ContextualFactor.csv', 'Interruptibility.csv']

# Utility Functions

In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score


# Load the Dataset into Dataframe

In [3]:
dataframes = {
    filename: pd.read_csv(os.path.join(PATH_DATA, filename)).reset_index(drop=True)
    for filename in sensor_data
}
dfService = dataframes['Service.csv']
dfContextualFactor = dataframes['ContextualFactor.csv']
dfUserInfo = dataframes['UserInfo.csv']
dfInterruptibility = dataframes['Interruptibility.csv']

# Preprocessing

In [4]:
# Select specific columns from dfContextualFactor, dfService, and dfUserInfo DataFrames
# Combine relevant columns to create a unified dataset
dfContextualFactor_selected_columns=dfContextualFactor[['uid','sid','activity1','activity2','activity3','userLocation','userPosition',]]
dfService_selected_columns=dfService[['weekOfExperiment','dayOfWeek','startTime', 'activityInquiry','availabilityInquiry','speechShadowing_1','speechShadowing_2','speechShadowing_3','speechShadowing_4','speechShadowing_5','continue-to-nextInquiry_1','continue-to-nextInquiry_2','continue-to-nextInquiry_3','continue-to-nextInquiry_4','endTime','endType']]
dfInterruptibility_selected_columns=dfInterruptibility[['SHORT_INTERACTION_interruptibility', 'LONG_INTERACTION_interruptibility']]

## Create binary columns for interaction types
## SHORT_INTERACTION: True if availabilityInquiry is not NaN
# dfCombinedAll['SHORT_INTERACTION_interruptibility'] = dfCombinedAll['availabilityInquiry'].notna()
## LONG_INTERACTION: True if continue-to-nextInquiry_1 is not NaN
# dfCombinedAll['LONG_INTERACTION_interruptibility'] = dfCombinedAll['continue-to-nextInquiry_1'].notna()

# ! For those who want to redefine LONG_INTERACTION with thresholds longer than 3 minutes,
# you can use the following columns:
# 5 minutes  => use continue-to-nextInquiry_2
# 7 minutes  => use continue-to-nextInquiry_3
# 9 minutes  => use continue-to-nextInquiry_4


dfCombinedAll=pd.concat([dfContextualFactor_selected_columns, dfService_selected_columns,dfInterruptibility_selected_columns], axis=1)

In [5]:
# Concatenate and merge the all columns from dfUserInfo (dfUserInfo.csv)
dfUserInfo_selected_columns = dfUserInfo[['uid', 'homeType', 'speakerLocation', 'speakerPosition']]
dfCombinedAll = pd.merge(dfCombinedAll, dfUserInfo_selected_columns, on='uid', how='left')

## Position Processing

In [6]:
# Define function to calculate proximity between user and speaker
def calculate_proximity(row):
    # Return 0 if user and speaker are in different locations
    if row['userLocation'] != row['speakerLocation']:
        return 0
    # If in the same locations
    elif row['userPosition'] == row['speakerPosition']:
        return 2  # Same position
    else:
        return 1  # Different positions (including missing position)

# Apply proximity calculation to create a new 'proximity' column
dfCombinedAll['proximity'] = dfCombinedAll.apply(calculate_proximity, axis=1)

print(dfCombinedAll[['userLocation', 'userPosition', 'speakerLocation', 'speakerPosition', 'proximity']].head(10))


  userLocation userPosition speakerLocation speakerPosition  proximity
0     Bed Room          Bed        Bed Room            Desk          1
1    Rest Room          NaN        Bed Room            Desk          0
2  Living Room          NaN        Bed Room            Desk          0
3  Living Room          NaN        Bed Room            Desk          0
4     Bed Room          Bed        Bed Room            Desk          1
5     Bed Room          Bed        Bed Room            Desk          1
6     Bed Room          Bed        Bed Room            Desk          1
7     Bed Room         Desk        Bed Room            Desk          2
8     Bed Room          Bed        Bed Room            Desk          1
9     Bed Room          Bed        Bed Room            Desk          1


## Activity and Time Processing

In [7]:
# Process activity columns for one-hot encoding
activity_cols = ['activity1', 'activity2', 'activity3']
df_activity = dfContextualFactor[activity_cols].copy()

# Get unique activities across all activity columns, excluding NaN
all_unique_activities = pd.unique(df_activity.values.ravel())
all_unique_activities = [x for x in all_unique_activities if pd.notna(x)]

# Create a DataFrame for one-hot encoding of activities
dfActivity_one_hot_encoding = pd.DataFrame(0, index=df_activity.index, columns=['act_' + str(val) for val in all_unique_activities])

# Perform one-hot encoding for each activity column
for col in activity_cols:
    for val in all_unique_activities:
        dfActivity_one_hot_encoding['act_' + str(val)] |= (df_activity[col] == val).astype(int)

# Concatenate one-hot encoded activity columns to dfCombinedAll
dfActivity_one_hot_encoding
dfCombinedAll = pd.concat([dfCombinedAll, dfActivity_one_hot_encoding], axis=1)

In [8]:
# Convert startTime to datetime and extract total minutes since midnight
dfCombinedAll['startTime'] = pd.to_datetime(dfCombinedAll['startTime'], format='%H:%M:%S', errors='coerce')
dfCombinedAll['minute'] = dfCombinedAll['startTime'].dt.hour * 60 + dfCombinedAll['startTime'].dt.minute

# Map days of the week to numerical values (MON=0, TUE=1, ..., SUN=6)
day_map = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6}
dfCombinedAll['dayOfWeek'] = dfCombinedAll['dayOfWeek'].map(day_map)

# Bin minutes into 30-minute intervals for temporal analysis
dfCombinedAll['minute_bin'] = (dfCombinedAll['minute'] // 30).astype(int)

In [9]:
dfCombinedAll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830 entries, 0 to 2829
Data columns (total 43 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   uid                                       2830 non-null   int64         
 1   sid                                       2830 non-null   int64         
 2   activity1                                 2830 non-null   object        
 3   activity2                                 110 non-null    object        
 4   activity3                                 3 non-null      object        
 5   userLocation                              2830 non-null   object        
 6   userPosition                              2343 non-null   object        
 7   weekOfExperiment                          2830 non-null   int64         
 8   dayOfWeek                                 2830 non-null   int64         
 9   startTime                     

# Feature Extraction

In [15]:
# Select features for response prediction
dfFeatresForResponse = dfCombinedAll[['uid',
    'act_Taking a Nap / Sleeping','act_Hygiene','act_Eating','act_Using Media','act_Social Interaction',
    'act_Returning from Outside / Other Rooms','act_Studying / Working','act_Others','act_House Chores',
    'act_Self Caring','act_Visiting Outside / Other Rooms','act_Resting',
    'homeType','userLocation','userPosition','speakerLocation','speakerPosition',
    'minute_bin','dayOfWeek','SHORT_INTERACTION_interruptibility','LONG_INTERACTION_interruptibility']].copy()

# Encode categorical columns using LabelEncoder
categorical_columns = ['homeType', 'userLocation', 'userPosition', 'speakerLocation', 'speakerPosition', 'minute_bin','SHORT_INTERACTION_interruptibility','LONG_INTERACTION_interruptibility']
label_encoders = defaultdict(LabelEncoder)

# Apply label encoding to each categorical column
for col in categorical_columns:
    dfFeatresForResponse[col] = label_encoders[col].fit_transform(dfFeatresForResponse[col])

# Create a copy of the encoded data for further processing
encoded_data = dfFeatresForResponse.copy()


encoded_data.info()
encoded_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830 entries, 0 to 2829
Data columns (total 22 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   uid                                       2830 non-null   int64
 1   act_Taking a Nap / Sleeping               2830 non-null   int64
 2   act_Hygiene                               2830 non-null   int64
 3   act_Eating                                2830 non-null   int64
 4   act_Using Media                           2830 non-null   int64
 5   act_Social Interaction                    2830 non-null   int64
 6   act_Returning from Outside / Other Rooms  2830 non-null   int64
 7   act_Studying / Working                    2830 non-null   int64
 8   act_Others                                2830 non-null   int64
 9   act_House Chores                          2830 non-null   int64
 10  act_Self Caring                           2830 non-null   in

Unnamed: 0,uid,act_Taking a Nap / Sleeping,act_Hygiene,act_Eating,act_Using Media,act_Social Interaction,act_Returning from Outside / Other Rooms,act_Studying / Working,act_Others,act_House Chores,...,act_Resting,homeType,userLocation,userPosition,speakerLocation,speakerPosition,minute_bin,dayOfWeek,SHORT_INTERACTION_interruptibility,LONG_INTERACTION_interruptibility
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,12,0,0,1
1,1,0,1,0,0,0,0,0,0,0,...,0,0,3,3,0,1,15,0,1,1
2,1,0,0,1,0,0,0,0,0,0,...,0,0,1,3,0,1,12,1,1,1
3,1,0,0,1,0,0,0,0,0,0,...,0,0,1,3,0,1,13,1,1,1
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,18,1,0,1


## Label: SHORT_INTERACTION

In [16]:
# Label distribution before balancing
print(encoded_data['SHORT_INTERACTION_interruptibility'].value_counts())

SHORT_INTERACTION_interruptibility
0    2088
1     742
Name: count, dtype: int64


### Model Building and LOSO CV

In [17]:
# Prepare features (X) and target variables (y) for SHORT_INTERACTION
X = encoded_data.drop(columns=['SHORT_INTERACTION_interruptibility', 'LONG_INTERACTION_interruptibility', 'uid'], axis=1) # Drop target and unrelated columns
y = encoded_data['SHORT_INTERACTION_interruptibility'] # Target variable
groups = encoded_data['uid'] # Group by user ID for Leave-One-Group-Out CV

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models1 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results1_logo = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)
    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1_logo[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 2034, number of negative: 2034
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 4068, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2050, number of negative: 2050
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 4100, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1974, number of negative: 1974
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 3948, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2044, number of negative: 2044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 4088, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2026, number of negative: 2026
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 4052, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1984, number of negative: 1984
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 3968, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1986, number of negative: 1986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 3972, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1969, number of negative: 1969
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 3938, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2050, number of negative: 2050
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 4100, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2046, number of negative: 2046
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 4092, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2041, number of negative: 2041
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 4082, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1936, number of negative: 1936
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 3872, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2037, number of negative: 2037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 4074, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6538049	total: 142ms	remaining: 14s
1:	learn: 0.6209557	total: 143ms	remaining: 7.02s
2:	learn: 0.5955366	total: 145ms	remaining: 4.68s
3:	learn: 0.5744659	total: 146ms	remaining: 3.5s
4:	learn: 0.5578011	total: 147ms	remaining: 2.79s
5:	learn: 0.5426109	total: 148ms	remaining: 2.32s
6:	learn: 0.5268839	total: 149ms	remaining: 1.98s
7:	learn: 0.5087759	total: 150ms	remaining: 1.73s
8:	learn: 0.4934059	total: 151ms	remaining: 1.53s
9:	learn: 0.4805050	total: 152ms	remaining: 1.37s
10:	l

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


51:	learn: 0.3750508	total: 43.4ms	remaining: 40.1ms
52:	learn: 0.3745020	total: 44.3ms	remaining: 39.3ms
53:	learn: 0.3735739	total: 45.2ms	remaining: 38.5ms
54:	learn: 0.3728167	total: 46ms	remaining: 37.6ms
55:	learn: 0.3717077	total: 46.9ms	remaining: 36.8ms
56:	learn: 0.3711587	total: 47.7ms	remaining: 36ms
57:	learn: 0.3704560	total: 48.6ms	remaining: 35.2ms
58:	learn: 0.3695699	total: 49.5ms	remaining: 34.4ms
59:	learn: 0.3687136	total: 50.4ms	remaining: 33.6ms
60:	learn: 0.3680116	total: 51.3ms	remaining: 32.8ms
61:	learn: 0.3673180	total: 52.3ms	remaining: 32.1ms
62:	learn: 0.3667499	total: 53.3ms	remaining: 31.3ms
63:	learn: 0.3660141	total: 54.2ms	remaining: 30.5ms
64:	learn: 0.3650474	total: 55ms	remaining: 29.6ms
65:	learn: 0.3645594	total: 55.9ms	remaining: 28.8ms
66:	learn: 0.3640317	total: 56.7ms	remaining: 27.9ms
67:	learn: 0.3635384	total: 57.5ms	remaining: 27ms
68:	learn: 0.3627954	total: 58.4ms	remaining: 26.2ms
69:	learn: 0.3620459	total: 59.2ms	remaining: 25.4ms
7

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6569628	total: 903us	remaining: 89.4ms
1:	learn: 0.6192980	total: 1.72ms	remaining: 84.4ms
2:	learn: 0.5927637	total: 2.48ms	remaining: 80ms
3:	learn: 0.5695522	total: 3.26ms	remaining: 78.2ms
4:	learn: 0.5471581	total: 4.01ms	remaining: 76.2ms
5:	learn: 0.5299836	total: 4.77ms	remaining: 74.8ms
6:	learn: 0.5161869	total: 5.54ms	remaining: 73.5ms
7:	learn: 0.4995754	total: 6.29ms	remaining: 72.3ms
8:	learn: 0.4846708	total: 7.03ms	remaining: 71.1ms
9:	learn: 0.4725771	total: 7.78ms	remaining: 70ms
10:	learn: 0.4634851	total: 8.58ms	remaining: 69.4ms
11:	learn: 0.4562664	total: 9.34ms	remaining: 68.5ms
12:	learn: 0.4472927	total: 10.1ms	remaining: 67.8ms
13:	learn: 0.4415561	total: 10.9ms	remaining: 67ms
14:	learn: 0.4364838	total: 11.7ms	remaining: 66.5ms
15:	learn: 0.4303000	total: 12.5ms	remaining: 65.6ms
16:	learn: 0.4250963	total: 13.3ms	remaining: 64.9ms
17:	learn: 0.4215521	total: 14.1ms	remaining: 64.1ms
18:	learn: 0.4176563	total: 14.8ms	remaining: 63.3ms
19:	learn:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6560235	total: 1.69ms	remaining: 167ms
1:	learn: 0.6244673	total: 2.53ms	remaining: 124ms
2:	learn: 0.5985210	total: 3.46ms	remaining: 112ms
3:	learn: 0.5750324	total: 4.33ms	remaining: 104ms
4:	learn: 0.5543673	total: 5.19ms	remaining: 98.6ms
5:	learn: 0.5390228	total: 6.07ms	remaining: 95.1ms
6:	learn: 0.5251281	total: 6.91ms	remaining: 91.8ms
7:	learn: 0.5075731	total: 7.76ms	remaining: 89.3ms
8:	learn: 0.4957946	total: 8.69ms	remaining: 87.9ms
9:	learn: 0.4827633	total: 9.57ms	remaining: 86.2ms
10:	learn: 0.4761932	total: 10.5ms	remaining: 84.8ms
11:	learn: 0.4690705	total: 11.4ms	remaining: 83.7ms
12:	learn: 0.4606881	total: 12.2ms	remaining: 81.5ms
13:	learn: 0.4524680	total: 13ms	remaining: 80ms
14:	learn: 0.4456319	total: 13.9ms	remaining: 78.5ms
15:	learn: 0.4416264	total: 14.7ms	remaining: 77.1ms
16:	learn: 0.4355563	total: 15.6ms	remaining: 76ms
17:	learn: 0.4318271	total: 16.4ms	remaining: 74.8ms
18:	learn: 0.4256403	total: 17.2ms	remaining: 73.3ms
19:	learn: 0.

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


51:	learn: 0.3765660	total: 51.6ms	remaining: 47.7ms
52:	learn: 0.3759805	total: 52.5ms	remaining: 46.6ms
53:	learn: 0.3755223	total: 53.4ms	remaining: 45.5ms
54:	learn: 0.3746809	total: 54.4ms	remaining: 44.5ms
55:	learn: 0.3737108	total: 55.2ms	remaining: 43.4ms
56:	learn: 0.3727513	total: 56.1ms	remaining: 42.3ms
57:	learn: 0.3719813	total: 56.9ms	remaining: 41.2ms
58:	learn: 0.3709414	total: 57.7ms	remaining: 40.1ms
59:	learn: 0.3698586	total: 58.5ms	remaining: 39ms
60:	learn: 0.3691002	total: 59.4ms	remaining: 38ms
61:	learn: 0.3686462	total: 60.1ms	remaining: 36.8ms
62:	learn: 0.3681756	total: 61ms	remaining: 35.8ms
63:	learn: 0.3675122	total: 61.8ms	remaining: 34.8ms
64:	learn: 0.3667453	total: 62.7ms	remaining: 33.7ms
65:	learn: 0.3657932	total: 63.5ms	remaining: 32.7ms
66:	learn: 0.3650639	total: 64.4ms	remaining: 31.7ms
67:	learn: 0.3639777	total: 65.2ms	remaining: 30.7ms
68:	learn: 0.3631623	total: 66.1ms	remaining: 29.7ms
69:	learn: 0.3624963	total: 66.9ms	remaining: 28.7ms

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6562240	total: 898us	remaining: 88.9ms
1:	learn: 0.6222441	total: 1.74ms	remaining: 85.5ms
2:	learn: 0.5963689	total: 2.52ms	remaining: 81.5ms
3:	learn: 0.5729618	total: 3.32ms	remaining: 79.8ms
4:	learn: 0.5524940	total: 4.1ms	remaining: 78ms
5:	learn: 0.5368208	total: 4.87ms	remaining: 76.3ms
6:	learn: 0.5243666	total: 5.66ms	remaining: 75.1ms
7:	learn: 0.5089660	total: 6.46ms	remaining: 74.3ms
8:	learn: 0.4987089	total: 7.24ms	remaining: 73.2ms
9:	learn: 0.4844502	total: 8.08ms	remaining: 72.7ms
10:	learn: 0.4722589	total: 8.92ms	remaining: 72.2ms
11:	learn: 0.4646444	total: 9.72ms	remaining: 71.3ms
12:	learn: 0.4565881	total: 10.5ms	remaining: 70.5ms
13:	learn: 0.4484137	total: 11.3ms	remaining: 69.5ms
14:	learn: 0.4452372	total: 12ms	remaining: 68.2ms
15:	learn: 0.4406256	total: 13.1ms	remaining: 68.8ms
16:	learn: 0.4340956	total: 14.2ms	remaining: 69.2ms
17:	learn: 0.4286368	total: 15.2ms	remaining: 69.2ms
18:	learn: 0.4244340	total: 16.3ms	remaining: 69.5ms
19:	learn

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


7:	learn: 0.5081137	total: 6.87ms	remaining: 79ms
8:	learn: 0.4952955	total: 7.9ms	remaining: 79.9ms
9:	learn: 0.4832148	total: 8.74ms	remaining: 78.7ms
10:	learn: 0.4750189	total: 9.55ms	remaining: 77.3ms
11:	learn: 0.4678822	total: 10.4ms	remaining: 76ms
12:	learn: 0.4595964	total: 11.2ms	remaining: 74.9ms
13:	learn: 0.4537853	total: 12ms	remaining: 73.8ms
14:	learn: 0.4462659	total: 12.8ms	remaining: 72.7ms
15:	learn: 0.4410297	total: 13.7ms	remaining: 71.8ms
16:	learn: 0.4362086	total: 14.5ms	remaining: 71ms
17:	learn: 0.4313570	total: 15.5ms	remaining: 70.5ms
18:	learn: 0.4275728	total: 16.4ms	remaining: 70.1ms
19:	learn: 0.4235655	total: 17.3ms	remaining: 69.3ms
20:	learn: 0.4210994	total: 18.2ms	remaining: 68.5ms
21:	learn: 0.4182044	total: 19ms	remaining: 67.5ms
22:	learn: 0.4158961	total: 19.9ms	remaining: 66.5ms
23:	learn: 0.4135268	total: 20.8ms	remaining: 65.8ms
24:	learn: 0.4104541	total: 21.7ms	remaining: 65ms
25:	learn: 0.4085333	total: 22.5ms	remaining: 64ms
26:	learn: 

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


77:	learn: 0.3504153	total: 65.6ms	remaining: 18.5ms
78:	learn: 0.3498943	total: 66.5ms	remaining: 17.7ms
79:	learn: 0.3492710	total: 67.3ms	remaining: 16.8ms
80:	learn: 0.3488137	total: 68.3ms	remaining: 16ms
81:	learn: 0.3481833	total: 69.3ms	remaining: 15.2ms
82:	learn: 0.3476707	total: 70.3ms	remaining: 14.4ms
83:	learn: 0.3471145	total: 71.4ms	remaining: 13.6ms
84:	learn: 0.3465608	total: 72.3ms	remaining: 12.8ms
85:	learn: 0.3462105	total: 73.1ms	remaining: 11.9ms
86:	learn: 0.3455674	total: 73.9ms	remaining: 11ms
87:	learn: 0.3451056	total: 74.7ms	remaining: 10.2ms
88:	learn: 0.3446722	total: 75.5ms	remaining: 9.33ms
89:	learn: 0.3442069	total: 76.3ms	remaining: 8.48ms
90:	learn: 0.3435748	total: 77.2ms	remaining: 7.63ms
91:	learn: 0.3430661	total: 78ms	remaining: 6.78ms
92:	learn: 0.3424932	total: 78.9ms	remaining: 5.94ms
93:	learn: 0.3419805	total: 79.7ms	remaining: 5.09ms
94:	learn: 0.3416461	total: 80.6ms	remaining: 4.24ms
95:	learn: 0.3412830	total: 81.3ms	remaining: 3.39ms

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


13:	learn: 0.4649639	total: 11.1ms	remaining: 68.4ms
14:	learn: 0.4581044	total: 12.1ms	remaining: 68.6ms
15:	learn: 0.4532654	total: 12.9ms	remaining: 68ms
16:	learn: 0.4469045	total: 13.8ms	remaining: 67.4ms
17:	learn: 0.4421328	total: 14.7ms	remaining: 66.9ms
18:	learn: 0.4379041	total: 15.6ms	remaining: 66.6ms
19:	learn: 0.4327148	total: 16.5ms	remaining: 66ms
20:	learn: 0.4295664	total: 17.4ms	remaining: 65.4ms
21:	learn: 0.4251710	total: 18.3ms	remaining: 64.7ms
22:	learn: 0.4240502	total: 18.9ms	remaining: 63.2ms
23:	learn: 0.4216094	total: 19.7ms	remaining: 62.2ms
24:	learn: 0.4179273	total: 20.5ms	remaining: 61.5ms
25:	learn: 0.4166632	total: 21.4ms	remaining: 61ms
26:	learn: 0.4152234	total: 22.3ms	remaining: 60.3ms
27:	learn: 0.4128302	total: 23.1ms	remaining: 59.3ms
28:	learn: 0.4114567	total: 23.9ms	remaining: 58.5ms
29:	learn: 0.4093869	total: 24.8ms	remaining: 57.9ms
30:	learn: 0.4085485	total: 25.5ms	remaining: 56.7ms
31:	learn: 0.4073835	total: 26.3ms	remaining: 56ms
3

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


11:	learn: 0.4632459	total: 10.2ms	remaining: 74.4ms
12:	learn: 0.4571355	total: 11.1ms	remaining: 74.2ms
13:	learn: 0.4492524	total: 11.9ms	remaining: 73.4ms
14:	learn: 0.4422205	total: 12.8ms	remaining: 72.8ms
15:	learn: 0.4356313	total: 13.7ms	remaining: 71.8ms
16:	learn: 0.4294268	total: 14.5ms	remaining: 70.6ms
17:	learn: 0.4251082	total: 15.3ms	remaining: 69.6ms
18:	learn: 0.4220378	total: 16.1ms	remaining: 68.6ms
19:	learn: 0.4177670	total: 16.9ms	remaining: 67.6ms
20:	learn: 0.4138136	total: 17.7ms	remaining: 66.6ms
21:	learn: 0.4114897	total: 18.6ms	remaining: 66.1ms
22:	learn: 0.4096169	total: 19.6ms	remaining: 65.6ms
23:	learn: 0.4072070	total: 20.7ms	remaining: 65.5ms
24:	learn: 0.4050400	total: 21.6ms	remaining: 64.8ms
25:	learn: 0.4024990	total: 22.4ms	remaining: 63.9ms
26:	learn: 0.4006778	total: 23.3ms	remaining: 63ms
27:	learn: 0.3987995	total: 24.2ms	remaining: 62.2ms
28:	learn: 0.3978179	total: 25.1ms	remaining: 61.4ms
29:	learn: 0.3969270	total: 25.9ms	remaining: 60

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


80:	learn: 0.3439129	total: 71.1ms	remaining: 16.7ms
81:	learn: 0.3434324	total: 72.2ms	remaining: 15.8ms
82:	learn: 0.3426560	total: 73.2ms	remaining: 15ms
83:	learn: 0.3422028	total: 74.2ms	remaining: 14.1ms
84:	learn: 0.3415758	total: 75.1ms	remaining: 13.2ms
85:	learn: 0.3412035	total: 75.9ms	remaining: 12.4ms
86:	learn: 0.3401923	total: 76.7ms	remaining: 11.5ms
87:	learn: 0.3394084	total: 77.6ms	remaining: 10.6ms
88:	learn: 0.3389742	total: 78.4ms	remaining: 9.69ms
89:	learn: 0.3385464	total: 79.5ms	remaining: 8.83ms
90:	learn: 0.3382145	total: 80.5ms	remaining: 7.96ms
91:	learn: 0.3378231	total: 81.7ms	remaining: 7.1ms
92:	learn: 0.3369842	total: 82.8ms	remaining: 6.23ms
93:	learn: 0.3363803	total: 83.7ms	remaining: 5.34ms
94:	learn: 0.3359409	total: 84.6ms	remaining: 4.45ms
95:	learn: 0.3351191	total: 85.6ms	remaining: 3.56ms
96:	learn: 0.3345893	total: 86.5ms	remaining: 2.67ms
97:	learn: 0.3338363	total: 87.4ms	remaining: 1.78ms
98:	learn: 0.3334920	total: 88.2ms	remaining: 890

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6555376	total: 1ms	remaining: 99.5ms
1:	learn: 0.6235844	total: 1.81ms	remaining: 88.5ms
2:	learn: 0.5933827	total: 2.58ms	remaining: 83.5ms
3:	learn: 0.5710237	total: 3.41ms	remaining: 81.8ms
4:	learn: 0.5522010	total: 4.22ms	remaining: 80.1ms
5:	learn: 0.5363706	total: 5.01ms	remaining: 78.5ms
6:	learn: 0.5224840	total: 5.9ms	remaining: 78.3ms
7:	learn: 0.5086320	total: 6.73ms	remaining: 77.4ms
8:	learn: 0.4997637	total: 7.59ms	remaining: 76.8ms
9:	learn: 0.4858290	total: 8.58ms	remaining: 77.2ms
10:	learn: 0.4773480	total: 9.58ms	remaining: 77.5ms
11:	learn: 0.4710727	total: 10.5ms	remaining: 77.2ms
12:	learn: 0.4611057	total: 11.4ms	remaining: 76.1ms
13:	learn: 0.4526533	total: 12.2ms	remaining: 74.8ms
14:	learn: 0.4451799	total: 13ms	remaining: 73.8ms
15:	learn: 0.4400126	total: 13.8ms	remaining: 72.6ms
16:	learn: 0.4339540	total: 14.6ms	remaining: 71.4ms
17:	learn: 0.4289856	total: 15.6ms	remaining: 70.9ms
18:	learn: 0.4257573	total: 16.4ms	remaining: 69.9ms
19:	learn

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6544117	total: 927us	remaining: 91.8ms
1:	learn: 0.6249201	total: 1.71ms	remaining: 84ms
2:	learn: 0.5963593	total: 2.55ms	remaining: 82.6ms
3:	learn: 0.5738709	total: 3.35ms	remaining: 80.3ms
4:	learn: 0.5533208	total: 4.14ms	remaining: 78.8ms
5:	learn: 0.5366001	total: 5.13ms	remaining: 80.4ms
6:	learn: 0.5178801	total: 6.03ms	remaining: 80.1ms
7:	learn: 0.5033031	total: 6.84ms	remaining: 78.7ms
8:	learn: 0.4884807	total: 7.8ms	remaining: 78.9ms
9:	learn: 0.4789881	total: 8.6ms	remaining: 77.4ms
10:	learn: 0.4685460	total: 9.41ms	remaining: 76.2ms
11:	learn: 0.4609102	total: 10.2ms	remaining: 74.8ms
12:	learn: 0.4515636	total: 11.1ms	remaining: 74.1ms
13:	learn: 0.4432493	total: 11.9ms	remaining: 73.1ms
14:	learn: 0.4384408	total: 12.8ms	remaining: 72.6ms
15:	learn: 0.4311124	total: 13.6ms	remaining: 71.6ms
16:	learn: 0.4266337	total: 14.6ms	remaining: 71.4ms
17:	learn: 0.4208247	total: 15.5ms	remaining: 70.5ms
18:	learn: 0.4161441	total: 16.4ms	remaining: 69.8ms
19:	lear

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


54:	learn: 0.3587207	total: 50.6ms	remaining: 41.4ms
55:	learn: 0.3579581	total: 51.6ms	remaining: 40.6ms
56:	learn: 0.3571185	total: 52.5ms	remaining: 39.6ms
57:	learn: 0.3566297	total: 53.4ms	remaining: 38.7ms
58:	learn: 0.3556686	total: 54.4ms	remaining: 37.8ms
59:	learn: 0.3548915	total: 55.2ms	remaining: 36.8ms
60:	learn: 0.3542693	total: 56.3ms	remaining: 36ms
61:	learn: 0.3535535	total: 57.4ms	remaining: 35.2ms
62:	learn: 0.3527940	total: 58.4ms	remaining: 34.3ms
63:	learn: 0.3521635	total: 59.6ms	remaining: 33.5ms
64:	learn: 0.3513379	total: 60.6ms	remaining: 32.6ms
65:	learn: 0.3496508	total: 61.6ms	remaining: 31.7ms
66:	learn: 0.3489191	total: 62.6ms	remaining: 30.8ms
67:	learn: 0.3479739	total: 63.5ms	remaining: 29.9ms
68:	learn: 0.3475503	total: 64.6ms	remaining: 29ms
69:	learn: 0.3469609	total: 65.6ms	remaining: 28.1ms
70:	learn: 0.3464168	total: 66.7ms	remaining: 27.2ms
71:	learn: 0.3454552	total: 67.8ms	remaining: 26.4ms
72:	learn: 0.3449885	total: 68.8ms	remaining: 25.5

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

### K-Fold CV

In [18]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results1_kfold = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1_kfold[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1695, number of negative: 1695
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 3390, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1699, number of negative: 1699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6496598	total: 1.14ms	remaining: 113ms
1:	learn: 0.6191865	total: 2.23ms	remaining: 109ms
2:	learn: 0.5929624	total: 3.04ms	remaining: 98.4ms
3:	learn: 0.5713090	total: 3.98ms	remaining: 95.7ms
4:	learn: 0.5539271	total: 5.06ms	remaining: 96.2ms
5:	learn: 0.5366551	total: 6.15ms	remaining: 96.3ms
6:	learn: 0.5198465	total: 7.2ms	remaining: 95.6ms
7:	learn: 0.5045378	total: 8.3ms	remaining: 95.4ms
8:	learn: 0.4932969	total: 9.35ms	remaining: 94.6ms
9:	learn: 0.4796024	total: 10.4ms	remaining: 93.2ms
10:	learn: 0.4704427	total: 11.3ms	remaining: 91.8ms
11:	learn: 0.4624609	total: 12.6ms	remaining: 92.5ms
12:	learn: 0.4554347	total: 13.7ms	remaining: 91.7ms
13:	learn: 0.4472477	total: 14.8ms	remaining: 90.9ms
14:	learn: 0.4413655	total: 15.8ms	remaining: 89.7ms
15:	learn: 0.4327582	total: 16.8ms	remaining: 88.4ms
16:	learn: 0.4265463	total: 17.9ms	remaining: 87.2ms
17:	learn: 0.4213755	total: 19.2ms	remaining: 87.5ms
18:	learn: 0.4164125	total: 20.3ms	remaining: 86.6ms
19:	lea

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6547612	total: 986us	remaining: 97.7ms
1:	learn: 0.6216123	total: 1.91ms	remaining: 93.6ms
2:	learn: 0.5940315	total: 2.68ms	remaining: 86.8ms
3:	learn: 0.5710338	total: 3.49ms	remaining: 83.7ms
4:	learn: 0.5533896	total: 4.25ms	remaining: 80.7ms
5:	learn: 0.5288291	total: 5.07ms	remaining: 79.5ms
6:	learn: 0.5085043	total: 5.88ms	remaining: 78.1ms
7:	learn: 0.4914158	total: 6.68ms	remaining: 76.8ms
8:	learn: 0.4817332	total: 7.28ms	remaining: 73.6ms
9:	learn: 0.4680142	total: 8.1ms	remaining: 72.9ms
10:	learn: 0.4567986	total: 8.91ms	remaining: 72.1ms
11:	learn: 0.4465935	total: 9.7ms	remaining: 71.1ms
12:	learn: 0.4386785	total: 10.7ms	remaining: 71.8ms
13:	learn: 0.4312835	total: 11.6ms	remaining: 71.3ms
14:	learn: 0.4258857	total: 12.4ms	remaining: 70.4ms
15:	learn: 0.4191189	total: 13.2ms	remaining: 69.2ms
16:	learn: 0.4157029	total: 14ms	remaining: 68.3ms
17:	learn: 0.4106709	total: 14.8ms	remaining: 67.4ms
18:	learn: 0.4069488	total: 15.6ms	remaining: 66.7ms
19:	lear

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


24:	learn: 0.3932717	total: 20ms	remaining: 60.1ms
25:	learn: 0.3916236	total: 20.8ms	remaining: 59.1ms
26:	learn: 0.3896862	total: 21.5ms	remaining: 58.2ms
27:	learn: 0.3876057	total: 22.3ms	remaining: 57.4ms
28:	learn: 0.3854368	total: 23.1ms	remaining: 56.6ms
29:	learn: 0.3838894	total: 23.9ms	remaining: 55.7ms
30:	learn: 0.3823975	total: 24.7ms	remaining: 54.9ms
31:	learn: 0.3807000	total: 25.5ms	remaining: 54.2ms
32:	learn: 0.3794693	total: 26.3ms	remaining: 53.3ms
33:	learn: 0.3779279	total: 27.1ms	remaining: 52.6ms
34:	learn: 0.3764627	total: 27.9ms	remaining: 51.9ms
35:	learn: 0.3749861	total: 28.8ms	remaining: 51.1ms
36:	learn: 0.3736661	total: 29.6ms	remaining: 50.3ms
37:	learn: 0.3723536	total: 30.3ms	remaining: 49.5ms
38:	learn: 0.3703241	total: 31.1ms	remaining: 48.7ms
39:	learn: 0.3687490	total: 31.9ms	remaining: 47.8ms
40:	learn: 0.3679698	total: 32.8ms	remaining: 47.2ms
41:	learn: 0.3667706	total: 33.6ms	remaining: 46.4ms
42:	learn: 0.3659412	total: 34.4ms	remaining: 45

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


## Label: LONG_INTERACTION

In [19]:
# Label distribution before balancing
print(encoded_data['LONG_INTERACTION_interruptibility'].value_counts())

LONG_INTERACTION_interruptibility
1    1443
0    1387
Name: count, dtype: int64


### Model building and LOSO CV

In [20]:
# Prepare features (X) and target variable (y) for predicting LONG_INTERACTION
X = encoded_data.drop(columns=['LONG_INTERACTION_interruptibility', 'SHORT_INTERACTION_interruptibility','uid'], axis=1)
y = encoded_data['LONG_INTERACTION_interruptibility']
groups = encoded_data['uid']

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models2 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results2_logo = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2_logo[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1400, number of negative: 1400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 2800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1419, number of negative: 1419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2838, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1340, number of negative: 1340
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2680, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1371, number of negative: 1371
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2742, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1359, number of negative: 1359
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2718, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1360, number of negative: 1360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000284 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2720, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1361, number of negative: 1361
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2722, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1431, number of negative: 1431
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2862, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1420, number of negative: 1420
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1398, number of negative: 1398
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2796, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1371, number of negative: 1371
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 2742, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1399, number of negative: 1399
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2798, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1389, number of negative: 1389
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2778, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1399, number of negative: 1399
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2798, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1364, number of negative: 1364
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2728, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6728183	total: 1.23ms	remaining: 122ms
1:	learn: 0.6566377	total: 2.2ms	remaining: 108ms
2:	learn: 0.6424171	total: 3.23ms	remaining: 104ms
3:	learn: 0.6299668	total: 4.23ms	remaining: 102ms
4:	learn: 0.6190853	total: 5.14ms	remaining: 97.7ms
5:	learn: 0.6102043	total: 5.92ms	remaining: 92.8ms
6:	learn: 0.6004116	total: 6.75ms	remaining: 89.7ms
7:	learn: 0.5958645	total: 7.37ms	remaining: 84.7ms
8:	learn: 0.5898240	total: 8.37ms	remaining: 84.7ms
9:	learn: 0.5839849	total: 9.38ms	remai

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6720109	total: 934us	remaining: 92.5ms
1:	learn: 0.6572850	total: 2.09ms	remaining: 102ms
2:	learn: 0.6416920	total: 3.16ms	remaining: 102ms
3:	learn: 0.6304563	total: 4.09ms	remaining: 98.1ms
4:	learn: 0.6222018	total: 4.92ms	remaining: 93.5ms
5:	learn: 0.6119510	total: 5.68ms	remaining: 89ms
6:	learn: 0.6039384	total: 6.57ms	remaining: 87.3ms
7:	learn: 0.5944641	total: 7.35ms	remaining: 84.5ms
8:	learn: 0.5882213	total: 8.36ms	remaining: 84.5ms
9:	learn: 0.5852903	total: 8.84ms	remaining: 79.6ms
10:	learn: 0.5803769	total: 9.58ms	remaining: 77.5ms
11:	learn: 0.5764297	total: 10.4ms	remaining: 76.2ms
12:	learn: 0.5717316	total: 11.3ms	remaining: 75.5ms
13:	learn: 0.5668285	total: 12.1ms	remaining: 74.2ms
14:	learn: 0.5633751	total: 12.9ms	remaining: 73.3ms
15:	learn: 0.5597429	total: 13.7ms	remaining: 71.9ms
16:	learn: 0.5560620	total: 14.5ms	remaining: 70.6ms
17:	learn: 0.5532531	total: 15.4ms	remaining: 70ms
18:	learn: 0.5504114	total: 16.1ms	remaining: 68.8ms
19:	learn:

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6753332	total: 886us	remaining: 87.7ms
1:	learn: 0.6574408	total: 1.72ms	remaining: 84.4ms
2:	learn: 0.6429770	total: 2.43ms	remaining: 78.7ms
3:	learn: 0.6325616	total: 3.27ms	remaining: 78.5ms
4:	learn: 0.6202603	total: 4.14ms	remaining: 78.7ms
5:	learn: 0.6125059	total: 4.87ms	remaining: 76.2ms
6:	learn: 0.6034526	total: 5.57ms	remaining: 74ms
7:	learn: 0.5961893	total: 6.31ms	remaining: 72.5ms
8:	learn: 0.5900579	total: 6.89ms	remaining: 69.7ms
9:	learn: 0.5823727	total: 7.58ms	remaining: 68.2ms
10:	learn: 0.5765667	total: 8.31ms	remaining: 67.3ms
11:	learn: 0.5721109	total: 9.03ms	remaining: 66.2ms
12:	learn: 0.5674805	total: 9.86ms	remaining: 66ms
13:	learn: 0.5641616	total: 10.7ms	remaining: 65.4ms
14:	learn: 0.5619387	total: 11.2ms	remaining: 63.5ms
15:	learn: 0.5583696	total: 11.9ms	remaining: 62.7ms
16:	learn: 0.5551981	total: 12.7ms	remaining: 62.2ms
17:	learn: 0.5526505	total: 13.6ms	remaining: 61.8ms
18:	learn: 0.5509668	total: 14.3ms	remaining: 61.1ms
19:	lear

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


73:	learn: 0.4996554	total: 55.7ms	remaining: 19.6ms
74:	learn: 0.4991884	total: 56.8ms	remaining: 18.9ms
75:	learn: 0.4988158	total: 57.6ms	remaining: 18.2ms
76:	learn: 0.4984433	total: 58.3ms	remaining: 17.4ms
77:	learn: 0.4979251	total: 59.1ms	remaining: 16.7ms
78:	learn: 0.4973141	total: 59.8ms	remaining: 15.9ms
79:	learn: 0.4968606	total: 60.6ms	remaining: 15.1ms
80:	learn: 0.4964231	total: 61.4ms	remaining: 14.4ms
81:	learn: 0.4959903	total: 62.1ms	remaining: 13.6ms
82:	learn: 0.4952894	total: 62.9ms	remaining: 12.9ms
83:	learn: 0.4948666	total: 63.6ms	remaining: 12.1ms
84:	learn: 0.4942397	total: 64.4ms	remaining: 11.4ms
85:	learn: 0.4938639	total: 65.1ms	remaining: 10.6ms
86:	learn: 0.4933316	total: 65.8ms	remaining: 9.84ms
87:	learn: 0.4929988	total: 66.6ms	remaining: 9.08ms
88:	learn: 0.4924859	total: 67.3ms	remaining: 8.32ms
89:	learn: 0.4920249	total: 68.2ms	remaining: 7.57ms
90:	learn: 0.4914603	total: 68.9ms	remaining: 6.82ms
91:	learn: 0.4909891	total: 69.7ms	remaining: 

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6762896	total: 950us	remaining: 94.1ms
1:	learn: 0.6603025	total: 1.89ms	remaining: 92.4ms
2:	learn: 0.6449141	total: 2.82ms	remaining: 91.3ms
3:	learn: 0.6315906	total: 3.61ms	remaining: 86.6ms
4:	learn: 0.6200549	total: 4.34ms	remaining: 82.5ms
5:	learn: 0.6100585	total: 5.13ms	remaining: 80.3ms
6:	learn: 0.5999096	total: 5.91ms	remaining: 78.5ms
7:	learn: 0.5947018	total: 6.43ms	remaining: 73.9ms
8:	learn: 0.5883390	total: 7.18ms	remaining: 72.6ms
9:	learn: 0.5819925	total: 7.99ms	remaining: 71.9ms
10:	learn: 0.5768189	total: 8.91ms	remaining: 72.1ms
11:	learn: 0.5740767	total: 9.45ms	remaining: 69.3ms
12:	learn: 0.5687477	total: 10.2ms	remaining: 68.6ms
13:	learn: 0.5659440	total: 11.1ms	remaining: 68.1ms
14:	learn: 0.5612961	total: 12ms	remaining: 68ms
15:	learn: 0.5579652	total: 12.7ms	remaining: 66.9ms
16:	learn: 0.5547375	total: 13.5ms	remaining: 66ms
17:	learn: 0.5515273	total: 14.3ms	remaining: 65.1ms
18:	learn: 0.5497184	total: 15ms	remaining: 63.9ms
19:	learn: 0

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6721623	total: 1.2ms	remaining: 119ms
1:	learn: 0.6548125	total: 2.3ms	remaining: 113ms
2:	learn: 0.6387750	total: 3.27ms	remaining: 106ms
3:	learn: 0.6272403	total: 4.25ms	remaining: 102ms
4:	learn: 0.6167297	total: 5.01ms	remaining: 95.2ms
5:	learn: 0.6098647	total: 5.76ms	remaining: 90.2ms
6:	learn: 0.6009960	total: 6.51ms	remaining: 86.5ms
7:	learn: 0.5948590	total: 7.46ms	remaining: 85.7ms
8:	learn: 0.5894983	total: 8.26ms	remaining: 83.5ms
9:	learn: 0.5837002	total: 9.3ms	remaining: 83.7ms
10:	learn: 0.5786163	total: 10.3ms	remaining: 83.7ms
11:	learn: 0.5729156	total: 11.4ms	remaining: 83.4ms
12:	learn: 0.5678740	total: 12.5ms	remaining: 83.4ms
13:	learn: 0.5640801	total: 13.9ms	remaining: 85.3ms
14:	learn: 0.5619339	total: 14.8ms	remaining: 83.7ms
15:	learn: 0.5578834	total: 15.9ms	remaining: 83.3ms
16:	learn: 0.5548622	total: 16.7ms	remaining: 81.7ms
17:	learn: 0.5526451	total: 17.6ms	remaining: 80.2ms
18:	learn: 0.5503288	total: 18.5ms	remaining: 78.7ms
19:	learn:

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


48:	learn: 0.5103930	total: 38ms	remaining: 39.6ms
49:	learn: 0.5097353	total: 38.8ms	remaining: 38.8ms
50:	learn: 0.5088679	total: 39.5ms	remaining: 38ms
51:	learn: 0.5087095	total: 40ms	remaining: 36.9ms
52:	learn: 0.5086765	total: 40.4ms	remaining: 35.8ms
53:	learn: 0.5082281	total: 41.1ms	remaining: 35ms
54:	learn: 0.5075459	total: 41.8ms	remaining: 34.2ms
55:	learn: 0.5067023	total: 42.6ms	remaining: 33.4ms
56:	learn: 0.5060882	total: 43.3ms	remaining: 32.7ms
57:	learn: 0.5055356	total: 44.1ms	remaining: 31.9ms
58:	learn: 0.5049876	total: 45.2ms	remaining: 31.4ms
59:	learn: 0.5047629	total: 45.8ms	remaining: 30.6ms
60:	learn: 0.5041541	total: 46.7ms	remaining: 29.8ms
61:	learn: 0.5039587	total: 47.4ms	remaining: 29ms
62:	learn: 0.5035590	total: 48.1ms	remaining: 28.3ms
63:	learn: 0.5033259	total: 48.9ms	remaining: 27.5ms
64:	learn: 0.5028640	total: 49.7ms	remaining: 26.8ms
65:	learn: 0.5021817	total: 50.5ms	remaining: 26ms
66:	learn: 0.5017594	total: 51.3ms	remaining: 25.3ms
67:	l

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6705556	total: 923us	remaining: 91.4ms
1:	learn: 0.6549343	total: 1.89ms	remaining: 92.8ms
2:	learn: 0.6406438	total: 2.97ms	remaining: 96.2ms
3:	learn: 0.6275491	total: 4.1ms	remaining: 98.4ms
4:	learn: 0.6156448	total: 4.94ms	remaining: 94ms
5:	learn: 0.6061276	total: 5.76ms	remaining: 90.2ms
6:	learn: 0.5992461	total: 6.56ms	remaining: 87.2ms
7:	learn: 0.5922053	total: 7.29ms	remaining: 83.9ms
8:	learn: 0.5851495	total: 8.07ms	remaining: 81.6ms
9:	learn: 0.5786420	total: 9ms	remaining: 81ms
10:	learn: 0.5744940	total: 9.86ms	remaining: 79.8ms
11:	learn: 0.5705324	total: 10.6ms	remaining: 77.5ms
12:	learn: 0.5662316	total: 11.3ms	remaining: 75.5ms
13:	learn: 0.5623190	total: 12ms	remaining: 73.9ms
14:	learn: 0.5572927	total: 12.8ms	remaining: 72.4ms
15:	learn: 0.5538421	total: 13.5ms	remaining: 70.9ms
16:	learn: 0.5510848	total: 14.2ms	remaining: 69.4ms
17:	learn: 0.5482463	total: 14.9ms	remaining: 68.1ms
18:	learn: 0.5457593	total: 15.7ms	remaining: 66.8ms
19:	learn: 0.5

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6700695	total: 1.17ms	remaining: 116ms
1:	learn: 0.6531879	total: 1.99ms	remaining: 97.4ms
2:	learn: 0.6375645	total: 2.99ms	remaining: 96.6ms
3:	learn: 0.6249381	total: 4ms	remaining: 96.1ms
4:	learn: 0.6175600	total: 4.53ms	remaining: 86ms
5:	learn: 0.6099663	total: 5.18ms	remaining: 81.2ms
6:	learn: 0.6004372	total: 5.99ms	remaining: 79.6ms
7:	learn: 0.5934770	total: 6.78ms	remaining: 78ms
8:	learn: 0.5873787	total: 7.42ms	remaining: 75ms
9:	learn: 0.5816996	total: 8.2ms	remaining: 73.8ms
10:	learn: 0.5762744	total: 8.94ms	remaining: 72.4ms
11:	learn: 0.5719790	total: 9.69ms	remaining: 71.1ms
12:	learn: 0.5671381	total: 10.4ms	remaining: 69.8ms
13:	learn: 0.5637490	total: 11.1ms	remaining: 68.4ms
14:	learn: 0.5598846	total: 11.9ms	remaining: 67.4ms
15:	learn: 0.5564175	total: 12.7ms	remaining: 66.5ms
16:	learn: 0.5530572	total: 13.4ms	remaining: 65.6ms
17:	learn: 0.5503371	total: 14.2ms	remaining: 64.8ms
18:	learn: 0.5476069	total: 15ms	remaining: 63.9ms
19:	learn: 0.545

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


64:	learn: 0.5075638	total: 53.5ms	remaining: 28.8ms
65:	learn: 0.5070456	total: 54.5ms	remaining: 28.1ms
66:	learn: 0.5067510	total: 55.3ms	remaining: 27.2ms
67:	learn: 0.5061426	total: 56.1ms	remaining: 26.4ms
68:	learn: 0.5054865	total: 56.9ms	remaining: 25.6ms
69:	learn: 0.5045912	total: 57.7ms	remaining: 24.7ms
70:	learn: 0.5040462	total: 58.5ms	remaining: 23.9ms
71:	learn: 0.5034267	total: 59.2ms	remaining: 23ms
72:	learn: 0.5029842	total: 60ms	remaining: 22.2ms
73:	learn: 0.5025584	total: 60.9ms	remaining: 21.4ms
74:	learn: 0.5022239	total: 61.7ms	remaining: 20.6ms
75:	learn: 0.5018984	total: 62.6ms	remaining: 19.8ms
76:	learn: 0.5014773	total: 63.4ms	remaining: 18.9ms
77:	learn: 0.5007436	total: 64.2ms	remaining: 18.1ms
78:	learn: 0.5000936	total: 65ms	remaining: 17.3ms
79:	learn: 0.4995425	total: 65.8ms	remaining: 16.4ms
80:	learn: 0.4990654	total: 66.6ms	remaining: 15.6ms
81:	learn: 0.4985809	total: 67.5ms	remaining: 14.8ms
82:	learn: 0.4982059	total: 68.4ms	remaining: 14ms
8

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6728479	total: 1.04ms	remaining: 103ms
1:	learn: 0.6537534	total: 2.07ms	remaining: 101ms
2:	learn: 0.6400439	total: 3.31ms	remaining: 107ms
3:	learn: 0.6289257	total: 4.4ms	remaining: 106ms
4:	learn: 0.6173845	total: 5.51ms	remaining: 105ms
5:	learn: 0.6084204	total: 6.44ms	remaining: 101ms
6:	learn: 0.6009392	total: 7.33ms	remaining: 97.4ms
7:	learn: 0.5939268	total: 8.22ms	remaining: 94.6ms
8:	learn: 0.5894824	total: 8.95ms	remaining: 90.5ms
9:	learn: 0.5839200	total: 9.89ms	remaining: 89ms
10:	learn: 0.5781786	total: 10.9ms	remaining: 88.5ms
11:	learn: 0.5736935	total: 11.8ms	remaining: 86.2ms
12:	learn: 0.5683664	total: 12.5ms	remaining: 84ms
13:	learn: 0.5641832	total: 13.4ms	remaining: 82ms
14:	learn: 0.5603681	total: 14.3ms	remaining: 80.9ms
15:	learn: 0.5570738	total: 15.1ms	remaining: 79.1ms
16:	learn: 0.5543988	total: 15.9ms	remaining: 77.6ms
17:	learn: 0.5523251	total: 16.8ms	remaining: 76.7ms
18:	learn: 0.5505761	total: 17.8ms	remaining: 75.8ms
19:	learn: 0.547

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


65:	learn: 0.4987014	total: 69.2ms	remaining: 35.7ms
66:	learn: 0.4981335	total: 70.5ms	remaining: 34.7ms
67:	learn: 0.4978779	total: 71.6ms	remaining: 33.7ms
68:	learn: 0.4976335	total: 72.7ms	remaining: 32.7ms
69:	learn: 0.4968999	total: 73.8ms	remaining: 31.6ms
70:	learn: 0.4965274	total: 74.8ms	remaining: 30.6ms
71:	learn: 0.4963014	total: 75.9ms	remaining: 29.5ms
72:	learn: 0.4955505	total: 76.9ms	remaining: 28.5ms
73:	learn: 0.4953042	total: 78ms	remaining: 27.4ms
74:	learn: 0.4947002	total: 79ms	remaining: 26.3ms
75:	learn: 0.4942159	total: 80.1ms	remaining: 25.3ms
76:	learn: 0.4939059	total: 81.1ms	remaining: 24.2ms
77:	learn: 0.4934846	total: 82ms	remaining: 23.1ms
78:	learn: 0.4930778	total: 83ms	remaining: 22.1ms
79:	learn: 0.4924765	total: 84ms	remaining: 21ms
80:	learn: 0.4920637	total: 85ms	remaining: 19.9ms
81:	learn: 0.4916300	total: 85.9ms	remaining: 18.9ms
82:	learn: 0.4911505	total: 86.9ms	remaining: 17.8ms
83:	learn: 0.4905734	total: 88ms	remaining: 16.8ms
84:	learn

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


79:	learn: 0.5032813	total: 84.8ms	remaining: 21.2ms
80:	learn: 0.5028117	total: 86.1ms	remaining: 20.2ms
81:	learn: 0.5021335	total: 87.2ms	remaining: 19.1ms
82:	learn: 0.5015836	total: 88.3ms	remaining: 18.1ms
83:	learn: 0.5013004	total: 89.4ms	remaining: 17ms
84:	learn: 0.5010841	total: 90.4ms	remaining: 16ms
85:	learn: 0.5008312	total: 91.4ms	remaining: 14.9ms
86:	learn: 0.5003544	total: 92.4ms	remaining: 13.8ms
87:	learn: 0.4999723	total: 93.4ms	remaining: 12.7ms
88:	learn: 0.4991539	total: 94.4ms	remaining: 11.7ms
89:	learn: 0.4986172	total: 95.6ms	remaining: 10.6ms
90:	learn: 0.4980368	total: 96.7ms	remaining: 9.56ms
91:	learn: 0.4975104	total: 97.7ms	remaining: 8.49ms
92:	learn: 0.4969565	total: 98.6ms	remaining: 7.42ms
93:	learn: 0.4965132	total: 99.6ms	remaining: 6.36ms
94:	learn: 0.4960033	total: 101ms	remaining: 5.3ms
95:	learn: 0.4954534	total: 102ms	remaining: 4.24ms
96:	learn: 0.4951154	total: 103ms	remaining: 3.19ms
97:	learn: 0.4946922	total: 104ms	remaining: 2.12ms
98

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6712263	total: 1.57ms	remaining: 156ms
1:	learn: 0.6540370	total: 2.83ms	remaining: 139ms
2:	learn: 0.6412677	total: 3.77ms	remaining: 122ms
3:	learn: 0.6318057	total: 4.58ms	remaining: 110ms
4:	learn: 0.6212531	total: 5.45ms	remaining: 104ms
5:	learn: 0.6122526	total: 6.63ms	remaining: 104ms
6:	learn: 0.6034983	total: 7.58ms	remaining: 101ms
7:	learn: 0.5956829	total: 8.54ms	remaining: 98.2ms
8:	learn: 0.5896781	total: 9.49ms	remaining: 96ms
9:	learn: 0.5847986	total: 10.4ms	remaining: 93.3ms
10:	learn: 0.5797798	total: 11.3ms	remaining: 91.5ms
11:	learn: 0.5755800	total: 12.2ms	remaining: 89.7ms
12:	learn: 0.5708757	total: 13.1ms	remaining: 87.6ms
13:	learn: 0.5678643	total: 14.3ms	remaining: 87.9ms
14:	learn: 0.5654359	total: 15.4ms	remaining: 87.3ms
15:	learn: 0.5615104	total: 16.6ms	remaining: 87.2ms
16:	learn: 0.5586611	total: 17.5ms	remaining: 85.2ms
17:	learn: 0.5567203	total: 18.3ms	remaining: 83.6ms
18:	learn: 0.5542684	total: 19.2ms	remaining: 81.9ms
19:	learn: 0

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

### K-Fold CV

In [21]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results2_kfold = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2_kfold[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1144, number of negative: 1144
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 2288, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1158, number of negative: 1158
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 2316, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1159, number of negative: 1159
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 2318, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1182, number of negative: 1182
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 2364, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	l

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


85:	learn: 0.4770393	total: 78.9ms	remaining: 12.8ms
86:	learn: 0.4766165	total: 80ms	remaining: 12ms
87:	learn: 0.4762308	total: 81ms	remaining: 11ms
88:	learn: 0.4756301	total: 82.3ms	remaining: 10.2ms
89:	learn: 0.4748644	total: 83.3ms	remaining: 9.25ms
90:	learn: 0.4742615	total: 84.2ms	remaining: 8.32ms
91:	learn: 0.4738147	total: 85.2ms	remaining: 7.41ms
92:	learn: 0.4734412	total: 86.1ms	remaining: 6.48ms
93:	learn: 0.4731078	total: 87ms	remaining: 5.55ms
94:	learn: 0.4718148	total: 87.9ms	remaining: 4.62ms
95:	learn: 0.4713760	total: 88.7ms	remaining: 3.7ms
96:	learn: 0.4707598	total: 89.6ms	remaining: 2.77ms
97:	learn: 0.4701679	total: 90.5ms	remaining: 1.85ms
98:	learn: 0.4696999	total: 91.4ms	remaining: 923us
99:	learn: 0.4689998	total: 92.5ms	remaining: 0us
0:	learn: 0.6715613	total: 1.23ms	remaining: 121ms
1:	learn: 0.6516646	total: 2.38ms	remaining: 117ms
2:	learn: 0.6364084	total: 3.45ms	remaining: 112ms
3:	learn: 0.6247511	total: 4.49ms	remaining: 108ms
4:	learn: 0.6124

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6725680	total: 1.27ms	remaining: 126ms
1:	learn: 0.6578199	total: 2.34ms	remaining: 115ms
2:	learn: 0.6400877	total: 19.4ms	remaining: 626ms
3:	learn: 0.6272510	total: 20.5ms	remaining: 491ms
4:	learn: 0.6203358	total: 21.2ms	remaining: 402ms
5:	learn: 0.6112040	total: 22.5ms	remaining: 352ms
6:	learn: 0.6018572	total: 23.5ms	remaining: 312ms
7:	learn: 0.5940544	total: 24.6ms	remaining: 282ms
8:	learn: 0.5876107	total: 25.7ms	remaining: 260ms
9:	learn: 0.5798972	total: 26.8ms	remaining: 241ms
10:	learn: 0.5735825	total: 27.8ms	remaining: 225ms
11:	learn: 0.5684253	total: 29ms	remaining: 212ms
12:	learn: 0.5635740	total: 30ms	remaining: 201ms
13:	learn: 0.5576682	total: 30.8ms	remaining: 189ms
14:	learn: 0.5531824	total: 31.8ms	remaining: 180ms
15:	learn: 0.5498463	total: 32.8ms	remaining: 172ms
16:	learn: 0.5455603	total: 33.8ms	remaining: 165ms
17:	learn: 0.5424235	total: 34.8ms	remaining: 158ms
18:	learn: 0.5395239	total: 35.6ms	remaining: 152ms
19:	learn: 0.5366744	total

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


In [23]:
from tabulate import tabulate

# Models list
models = [
    'Random Forest', 'Gradient Boosting', 'XGBoost',
    'LightGBM', 'CatBoost', 'SVM', 'Dummy'
]

# Prepare table data
table_data = []
for model in models:
    row = [model]
    
    # Shell 1: SHORT_INTERACTION, Leave-One-Group-Out
    row.append(results1_logo.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results1_logo.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    # Shell 3: LONG_INTERACTION, Leave-One-Group-Out
    row.append(results2_logo.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results2_logo.get(model, {}).get('Average F1-Score (macro)', 0.0))

    # Shell 2: SHORT_INTERACTION, 5-fold
    row.append(results1_kfold.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results1_kfold.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    # Shell 4: LONG_INTERACTION, 5-fold
    row.append(results2_kfold.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results2_kfold.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    table_data.append(row)

# Define headers
headers = [
    'Model',
    'LOSO CV\nShort\nInteraction\nAccuracy', 'LOSO CV\nShort\nInteraction\nF1',
    'LOSO CV\nLong\nInteraction\nAccuracy', 'LOSO CV\nLong\nInteraction\nF1',
    '5-fold CV\nShort\nInteraction\nAccuracy', '5-fold CV\nShort\nInteraction\nF1',
    '5-fold CV\nLong\nInteraction\nAccuracy', '5-fold CV\nLong\nInteraction\nF1'
]

# Print the table
print("Machine learning model performance")
print(tabulate(table_data, headers=headers, tablefmt='fancy_grid', floatfmt='.3f', numalign="decimal"))


Machine learning model performance
╒═══════════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╕
│ Model             │       LOSO CV │       LOSO CV │       LOSO CV │       LOSO CV │     5-fold CV │     5-fold CV │     5-fold CV │     5-fold CV │
│                   │         Short │         Short │          Long │          Long │         Short │         Short │          Long │          Long │
│                   │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │
│                   │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │
╞═══════════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Random Forest     │         0.828 │         0.746 │         0.6