# Settings

## Constants

In [1]:
import os

PATH_DATA = './Dataset'
sensor_data = ['UserInfo.csv', 'Service.csv', 'ContextualFactor.csv', 'Interruptibility.csv']

# Utility Functions

In [30]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score


# Load the Dataset into Dataframe

In [3]:
dataframes = {
    filename: pd.read_csv(os.path.join(PATH_DATA, filename)).reset_index(drop=True)
    for filename in sensor_data
}
dfService = dataframes['Service.csv']
dfContextualFactor = dataframes['ContextualFactor.csv']
dfUserInfo = dataframes['UserInfo.csv']
dfInterruptibility = dataframes['Interruptibility.csv']

# Preprocessing

In [4]:
# Select specific columns from dfContextualFactor, dfService, and dfUserInfo DataFrames
# Combine relevant columns to create a unified dataset
dfContextualFactor_selected_columns=dfContextualFactor[['uid','sid','activity1','activity2','activity3','userRoom','userPosition',]]
dfService_selected_columns=dfService[['weekOfExperiment','dayOfWeek','startTime', 'activityInquiry','availabilityInquiry','speechShadowing_1','speechShadowing_2','speechShadowing_3','speechShadowing_4','speechShadowing_5','continue-to-nextInquiry_1','continue-to-nextInquiry_2','continue-to-nextInquiry_3','continue-to-nextInquiry_4','endTime','endType']]
dfInterruptibility_selected_columns=dfInterruptibility[['SHORT_INTERACTION_availability', 'LONG_INTERACTION_availability']]

## Create binary columns for interaction types
## SHORT_INTERACTION: True if availabilityInquiry is not NaN
# dfCombinedAll['SHORT_INTERACTION_availability'] = dfCombinedAll['availabilityInquiry'].notna()
## LONG_INTERACTION: True if continue-to-nextInquiry_1 is not NaN
# dfCombinedAll['LONG_INTERACTION_availability'] = dfCombinedAll['continue-to-nextInquiry_1'].notna()

# ! For those who want to redefine LONG_INTERACTION with thresholds longer than 3 minutes,
# you can use the following columns:
# 5 minutes  => use continue-to-nextInquiry_2
# 7 minutes  => use continue-to-nextInquiry_3
# 9 minutes  => use continue-to-nextInquiry_4


dfCombinedAll=pd.concat([dfContextualFactor_selected_columns, dfService_selected_columns,dfInterruptibility_selected_columns], axis=1)

In [5]:
# Concatenate and merge the all columns from dfUserInfo (dfUserInfo.csv)
dfUserInfo_selected_columns = dfUserInfo[['uid', 'settingType', 'speakerRoom', 'speakerPosition']]
dfCombinedAll = pd.merge(dfCombinedAll, dfUserInfo_selected_columns, on='uid', how='left')

## Position Processing

In [6]:
# Define function to calculate proximity between user and speaker
def calculate_proximity(row):
    # Return 0 if user and speaker are in different rooms
    if row['userRoom'] != row['speakerRoom']:
        return 0
    # If in the same room
    elif row['userPosition'] == row['speakerPosition']:
        return 2  # Same position
    else:
        return 1  # Different positions (including missing position)

# Apply proximity calculation to create a new 'proximity' column
dfCombinedAll['proximity'] = dfCombinedAll.apply(calculate_proximity, axis=1)

print(dfCombinedAll[['userRoom', 'userPosition', 'speakerRoom', 'speakerPosition', 'proximity']].head(10))


      userRoom userPosition speakerRoom speakerPosition  proximity
0     Bed Room          Bed    Bed Room            Desk          1
1    Rest Room          NaN    Bed Room            Desk          0
2  Living Room          NaN    Bed Room            Desk          0
3  Living Room          NaN    Bed Room            Desk          0
4     Bed Room          Bed    Bed Room            Desk          1
5     Bed Room          Bed    Bed Room            Desk          1
6     Bed Room          Bed    Bed Room            Desk          1
7     Bed Room         Desk    Bed Room            Desk          2
8     Bed Room          Bed    Bed Room            Desk          1
9     Bed Room          Bed    Bed Room            Desk          1


## Activity and Time Processing

In [7]:
# Process activity columns for one-hot encoding
activity_cols = ['activity1', 'activity2', 'activity3']
df_activity = dfContextualFactor[activity_cols].copy()

# Get unique activities across all activity columns, excluding NaN
all_unique_activities = pd.unique(df_activity.values.ravel())
all_unique_activities = [x for x in all_unique_activities if pd.notna(x)]

# Create a DataFrame for one-hot encoding of activities
dfActivity_one_hot_encoding = pd.DataFrame(0, index=df_activity.index, columns=['act_' + str(val) for val in all_unique_activities])

# Perform one-hot encoding for each activity column
for col in activity_cols:
    for val in all_unique_activities:
        dfActivity_one_hot_encoding['act_' + str(val)] |= (df_activity[col] == val).astype(int)

# Concatenate one-hot encoded activity columns to dfCombinedAll
dfActivity_one_hot_encoding
dfCombinedAll = pd.concat([dfCombinedAll, dfActivity_one_hot_encoding], axis=1)

In [8]:
# Convert startTime to datetime and extract total minutes since midnight
dfCombinedAll['startTime'] = pd.to_datetime(dfCombinedAll['startTime'], format='%H:%M:%S', errors='coerce')
dfCombinedAll['minute'] = dfCombinedAll['startTime'].dt.hour * 60 + dfCombinedAll['startTime'].dt.minute

# Map days of the week to numerical values (MON=0, TUE=1, ..., SUN=6)
day_map = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6}
dfCombinedAll['dayOfWeek'] = dfCombinedAll['dayOfWeek'].map(day_map)

# Bin minutes into 30-minute intervals for temporal analysis
dfCombinedAll['minute_bin'] = (dfCombinedAll['minute'] // 30).astype(int)

In [9]:
dfCombinedAll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2901 entries, 0 to 2900
Data columns (total 43 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   uid                                       2901 non-null   int64         
 1   sid                                       2901 non-null   int64         
 2   activity1                                 2901 non-null   object        
 3   activity2                                 112 non-null    object        
 4   activity3                                 3 non-null      object        
 5   userRoom                                  2901 non-null   object        
 6   userPosition                              2414 non-null   object        
 7   weekOfExperiment                          2901 non-null   int64         
 8   dayOfWeek                                 2901 non-null   int64         
 9   startTime                     

# Feature Extraction

In [10]:
# Select features for response prediction
dfFeatresForResponse = dfCombinedAll[['uid',
    'act_Taking a Nap / Sleeping','act_Hygiene','act_Eating','act_Using Media','act_Social Interaction',
    'act_Returning from Outside / Other Rooms','act_Studying / Working','act_Others','act_House Chores',
    'act_Self Caring','act_Visiting Outside / Other Rooms','act_Resting',
    'settingType','userRoom','userPosition','speakerRoom','speakerPosition',
    'minute_bin','dayOfWeek','SHORT_INTERACTION_availability','LONG_INTERACTION_availability']].copy()

# Encode categorical columns using LabelEncoder
categorical_columns = ['settingType', 'userRoom', 'userPosition', 'speakerRoom', 'speakerPosition', 'minute_bin']
label_encoders = defaultdict(LabelEncoder)

# Apply label encoding to each categorical column
for col in categorical_columns:
    dfFeatresForResponse[col] = label_encoders[col].fit_transform(dfFeatresForResponse[col])

# Create a copy of the encoded data for further processing
encoded_data = dfFeatresForResponse.copy()


encoded_data.info()
encoded_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2901 entries, 0 to 2900
Data columns (total 22 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   uid                                       2901 non-null   int64
 1   act_Taking a Nap / Sleeping               2901 non-null   int64
 2   act_Hygiene                               2901 non-null   int64
 3   act_Eating                                2901 non-null   int64
 4   act_Using Media                           2901 non-null   int64
 5   act_Social Interaction                    2901 non-null   int64
 6   act_Returning from Outside / Other Rooms  2901 non-null   int64
 7   act_Studying / Working                    2901 non-null   int64
 8   act_Others                                2901 non-null   int64
 9   act_House Chores                          2901 non-null   int64
 10  act_Self Caring                           2901 non-null   in

Unnamed: 0,uid,act_Taking a Nap / Sleeping,act_Hygiene,act_Eating,act_Using Media,act_Social Interaction,act_Returning from Outside / Other Rooms,act_Studying / Working,act_Others,act_House Chores,...,act_Resting,settingType,userRoom,userPosition,speakerRoom,speakerPosition,minute_bin,dayOfWeek,SHORT_INTERACTION_availability,LONG_INTERACTION_availability
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,12,0,True,False
1,1,0,1,0,0,0,0,0,0,0,...,0,0,7,3,0,1,15,0,False,False
2,1,0,0,1,0,0,0,0,0,0,...,0,0,5,3,0,1,12,1,False,False
3,1,0,0,1,0,0,0,0,0,0,...,0,0,5,3,0,1,13,1,False,False
4,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,18,1,True,False


## Label: SHORT_INTERACTION

In [11]:
# Label distribution before balancing
print(encoded_data['SHORT_INTERACTION_availability'].value_counts())

SHORT_INTERACTION_availability
True     2158
False     743
Name: count, dtype: int64


### Model Building and LOSO CV

In [40]:
# Prepare features (X) and target variables (y) for SHORT_INTERACTION
X = encoded_data.drop(columns=['SHORT_INTERACTION_availability', 'LONG_INTERACTION_availability', 'uid'], axis=1) # Drop target and unrelated columns
y = encoded_data['SHORT_INTERACTION_availability'] # Target variable
groups = encoded_data['uid'] # Group by user ID for Leave-One-Group-Out CV

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models1 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results1_logo = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)
    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1_logo[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 2104, number of negative: 2104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4208, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2120, number of negative: 2120
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4240, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2088, number of negative: 2088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4176, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2128, number of negative: 2128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 4256, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2081, number of negative: 2081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4162, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2102, number of negative: 2102
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4204, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2058, number of negative: 2058
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4116, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2056, number of negative: 2056
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4112, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1998, number of negative: 1998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 3996, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2120, number of negative: 2120
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4240, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2015, number of negative: 2015
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 4030, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2107, number of negative: 2107
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000313 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4214, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	l

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


27:	learn: 0.3930711	total: 31.8ms	remaining: 81.8ms
28:	learn: 0.3908774	total: 33.4ms	remaining: 81.7ms
29:	learn: 0.3889027	total: 34.6ms	remaining: 80.7ms
30:	learn: 0.3879162	total: 36.2ms	remaining: 80.5ms
31:	learn: 0.3862507	total: 37.6ms	remaining: 79.8ms
32:	learn: 0.3847787	total: 38.9ms	remaining: 79ms
33:	learn: 0.3829621	total: 40.3ms	remaining: 78.2ms
34:	learn: 0.3821934	total: 41.8ms	remaining: 77.6ms
35:	learn: 0.3806552	total: 43.1ms	remaining: 76.6ms
36:	learn: 0.3789264	total: 44.5ms	remaining: 75.7ms
37:	learn: 0.3769745	total: 45.6ms	remaining: 74.5ms
38:	learn: 0.3759756	total: 46.8ms	remaining: 73.2ms
39:	learn: 0.3753039	total: 48ms	remaining: 72ms
40:	learn: 0.3740083	total: 49.1ms	remaining: 70.6ms
41:	learn: 0.3726401	total: 50.3ms	remaining: 69.5ms
42:	learn: 0.3714265	total: 51.5ms	remaining: 68.2ms
43:	learn: 0.3706684	total: 52.6ms	remaining: 66.9ms
44:	learn: 0.3698616	total: 53.7ms	remaining: 65.7ms
45:	learn: 0.3688983	total: 55ms	remaining: 64.5ms
4

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6495044	total: 922us	remaining: 91.3ms
1:	learn: 0.6167502	total: 1.95ms	remaining: 95.3ms
2:	learn: 0.5889530	total: 2.79ms	remaining: 90.1ms
3:	learn: 0.5667000	total: 3.66ms	remaining: 87.8ms
4:	learn: 0.5483847	total: 4.46ms	remaining: 84.7ms
5:	learn: 0.5308677	total: 5.25ms	remaining: 82.3ms
6:	learn: 0.5158887	total: 6.06ms	remaining: 80.5ms
7:	learn: 0.5038435	total: 6.92ms	remaining: 79.6ms
8:	learn: 0.4908485	total: 7.95ms	remaining: 80.4ms
9:	learn: 0.4765186	total: 8.83ms	remaining: 79.5ms
10:	learn: 0.4641438	total: 9.69ms	remaining: 78.4ms
11:	learn: 0.4543736	total: 10.6ms	remaining: 77.4ms
12:	learn: 0.4465678	total: 11.3ms	remaining: 75.9ms
13:	learn: 0.4406373	total: 12.2ms	remaining: 74.7ms
14:	learn: 0.4357069	total: 13ms	remaining: 73.6ms
15:	learn: 0.4308051	total: 13.9ms	remaining: 72.8ms
16:	learn: 0.4246559	total: 14.8ms	remaining: 72.1ms
17:	learn: 0.4196560	total: 15.5ms	remaining: 70.7ms
18:	learn: 0.4145751	total: 16.3ms	remaining: 69.7ms
19:	le

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


72:	learn: 0.3377040	total: 66.6ms	remaining: 24.6ms
73:	learn: 0.3370907	total: 67.7ms	remaining: 23.8ms
74:	learn: 0.3363184	total: 68.6ms	remaining: 22.9ms
75:	learn: 0.3359034	total: 69.6ms	remaining: 22ms
76:	learn: 0.3352830	total: 70.6ms	remaining: 21.1ms
77:	learn: 0.3345599	total: 71.6ms	remaining: 20.2ms
78:	learn: 0.3339161	total: 72.5ms	remaining: 19.3ms
79:	learn: 0.3335199	total: 73.5ms	remaining: 18.4ms
80:	learn: 0.3331144	total: 74.4ms	remaining: 17.5ms
81:	learn: 0.3326957	total: 75.3ms	remaining: 16.5ms
82:	learn: 0.3323495	total: 76.2ms	remaining: 15.6ms
83:	learn: 0.3318783	total: 77.1ms	remaining: 14.7ms
84:	learn: 0.3315173	total: 78.2ms	remaining: 13.8ms
85:	learn: 0.3310745	total: 79.1ms	remaining: 12.9ms
86:	learn: 0.3302907	total: 80ms	remaining: 11.9ms
87:	learn: 0.3298105	total: 81.1ms	remaining: 11.1ms
88:	learn: 0.3293928	total: 82ms	remaining: 10.1ms
89:	learn: 0.3287237	total: 82.9ms	remaining: 9.21ms
90:	learn: 0.3282443	total: 83.8ms	remaining: 8.29ms

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6544721	total: 952us	remaining: 94.3ms
1:	learn: 0.6197306	total: 1.96ms	remaining: 96ms
2:	learn: 0.5935470	total: 2.92ms	remaining: 94.5ms
3:	learn: 0.5717301	total: 4.01ms	remaining: 96.2ms
4:	learn: 0.5458614	total: 5.06ms	remaining: 96.1ms
5:	learn: 0.5243390	total: 6.11ms	remaining: 95.7ms
6:	learn: 0.5108979	total: 7.06ms	remaining: 93.8ms
7:	learn: 0.4984290	total: 7.86ms	remaining: 90.4ms
8:	learn: 0.4867842	total: 8.66ms	remaining: 87.6ms
9:	learn: 0.4776980	total: 9.45ms	remaining: 85.1ms
10:	learn: 0.4652109	total: 10.3ms	remaining: 83.7ms
11:	learn: 0.4542510	total: 11.3ms	remaining: 82.7ms
12:	learn: 0.4445346	total: 12.3ms	remaining: 82.2ms
13:	learn: 0.4368646	total: 13.2ms	remaining: 81.3ms
14:	learn: 0.4299325	total: 14.1ms	remaining: 80.1ms
15:	learn: 0.4240402	total: 15ms	remaining: 78.5ms
16:	learn: 0.4177844	total: 15.8ms	remaining: 77.2ms
17:	learn: 0.4135594	total: 16.7ms	remaining: 76ms
18:	learn: 0.4104693	total: 17.5ms	remaining: 74.7ms
19:	learn:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6571607	total: 1.01ms	remaining: 100ms
1:	learn: 0.6247087	total: 1.92ms	remaining: 93.8ms
2:	learn: 0.5951631	total: 2.81ms	remaining: 90.8ms
3:	learn: 0.5736417	total: 3.62ms	remaining: 86.8ms
4:	learn: 0.5536987	total: 4.41ms	remaining: 83.7ms
5:	learn: 0.5379548	total: 5.25ms	remaining: 82.2ms
6:	learn: 0.5192911	total: 6.07ms	remaining: 80.7ms
7:	learn: 0.5036263	total: 7.04ms	remaining: 81ms
8:	learn: 0.4924652	total: 8.11ms	remaining: 82ms
9:	learn: 0.4805939	total: 9.12ms	remaining: 82.1ms
10:	learn: 0.4701855	total: 10ms	remaining: 81ms
11:	learn: 0.4630785	total: 11ms	remaining: 80.3ms
12:	learn: 0.4553374	total: 12.1ms	remaining: 81.2ms
13:	learn: 0.4470501	total: 12.9ms	remaining: 79.5ms
14:	learn: 0.4422814	total: 13.8ms	remaining: 78ms
15:	learn: 0.4377419	total: 14.6ms	remaining: 76.5ms
16:	learn: 0.4318409	total: 15.4ms	remaining: 75.2ms
17:	learn: 0.4270773	total: 16.2ms	remaining: 73.9ms
18:	learn: 0.4232881	total: 17ms	remaining: 72.7ms
19:	learn: 0.42005

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


76:	learn: 0.3313860	total: 66ms	remaining: 19.7ms
77:	learn: 0.3307577	total: 67ms	remaining: 18.9ms
78:	learn: 0.3304482	total: 67.9ms	remaining: 18ms
79:	learn: 0.3300154	total: 68.8ms	remaining: 17.2ms
80:	learn: 0.3293592	total: 69.6ms	remaining: 16.3ms
81:	learn: 0.3288866	total: 70.5ms	remaining: 15.5ms
82:	learn: 0.3282907	total: 71.3ms	remaining: 14.6ms
83:	learn: 0.3279578	total: 72.2ms	remaining: 13.7ms
84:	learn: 0.3275546	total: 73ms	remaining: 12.9ms
85:	learn: 0.3271217	total: 73.8ms	remaining: 12ms
86:	learn: 0.3267298	total: 74.7ms	remaining: 11.2ms
87:	learn: 0.3261620	total: 75.5ms	remaining: 10.3ms
88:	learn: 0.3254689	total: 76.4ms	remaining: 9.44ms
89:	learn: 0.3249353	total: 77.2ms	remaining: 8.58ms
90:	learn: 0.3241060	total: 78.1ms	remaining: 7.72ms
91:	learn: 0.3237610	total: 79ms	remaining: 6.87ms
92:	learn: 0.3231320	total: 79.9ms	remaining: 6.01ms
93:	learn: 0.3222034	total: 80.9ms	remaining: 5.17ms
94:	learn: 0.3216434	total: 81.8ms	remaining: 4.3ms
95:	le

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


14:	learn: 0.4432926	total: 12.6ms	remaining: 71.2ms
15:	learn: 0.4365972	total: 13.6ms	remaining: 71.3ms
16:	learn: 0.4301656	total: 14.5ms	remaining: 70.9ms
17:	learn: 0.4259170	total: 15.3ms	remaining: 69.7ms
18:	learn: 0.4211554	total: 16.1ms	remaining: 68.6ms
19:	learn: 0.4182131	total: 17ms	remaining: 68.2ms
20:	learn: 0.4141543	total: 17.8ms	remaining: 67ms
21:	learn: 0.4105267	total: 18.7ms	remaining: 66.2ms
22:	learn: 0.4071419	total: 19.5ms	remaining: 65.3ms
23:	learn: 0.4052543	total: 20.3ms	remaining: 64.3ms
24:	learn: 0.4027380	total: 21ms	remaining: 63ms
25:	learn: 0.4005487	total: 21.8ms	remaining: 62ms
26:	learn: 0.3987586	total: 22.6ms	remaining: 61ms
27:	learn: 0.3972924	total: 23.5ms	remaining: 60.3ms
28:	learn: 0.3957609	total: 24.3ms	remaining: 59.4ms
29:	learn: 0.3947456	total: 25.1ms	remaining: 58.5ms
30:	learn: 0.3933693	total: 25.9ms	remaining: 57.7ms
31:	learn: 0.3911747	total: 26.9ms	remaining: 57.2ms
32:	learn: 0.3897201	total: 27.8ms	remaining: 56.4ms
33:	l

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


30:	learn: 0.3888523	total: 26.4ms	remaining: 58.7ms
31:	learn: 0.3869747	total: 27.3ms	remaining: 57.9ms
32:	learn: 0.3847449	total: 28.1ms	remaining: 57ms
33:	learn: 0.3835316	total: 28.9ms	remaining: 56.2ms
34:	learn: 0.3819497	total: 29.7ms	remaining: 55.1ms
35:	learn: 0.3804645	total: 30.4ms	remaining: 54ms
36:	learn: 0.3793327	total: 31.3ms	remaining: 53.3ms
37:	learn: 0.3778677	total: 32.1ms	remaining: 52.4ms
38:	learn: 0.3766874	total: 33ms	remaining: 51.6ms
39:	learn: 0.3755786	total: 33.8ms	remaining: 50.8ms
40:	learn: 0.3740367	total: 34.7ms	remaining: 49.9ms
41:	learn: 0.3730620	total: 35.7ms	remaining: 49.2ms
42:	learn: 0.3724272	total: 36.7ms	remaining: 48.7ms
43:	learn: 0.3712661	total: 37.5ms	remaining: 47.7ms
44:	learn: 0.3706877	total: 38.4ms	remaining: 46.9ms
45:	learn: 0.3700637	total: 39.1ms	remaining: 45.9ms
46:	learn: 0.3688240	total: 40ms	remaining: 45ms
47:	learn: 0.3678638	total: 40.7ms	remaining: 44.1ms
48:	learn: 0.3672299	total: 41.5ms	remaining: 43.2ms
49:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6544010	total: 1.09ms	remaining: 108ms
1:	learn: 0.6226168	total: 2.08ms	remaining: 102ms
2:	learn: 0.5910155	total: 2.81ms	remaining: 90.9ms
3:	learn: 0.5677308	total: 3.63ms	remaining: 87.1ms
4:	learn: 0.5480512	total: 4.39ms	remaining: 83.5ms
5:	learn: 0.5331134	total: 5.05ms	remaining: 79.1ms
6:	learn: 0.5194300	total: 5.85ms	remaining: 77.7ms
7:	learn: 0.5073464	total: 6.62ms	remaining: 76.2ms
8:	learn: 0.4944736	total: 7.46ms	remaining: 75.5ms
9:	learn: 0.4856878	total: 8.18ms	remaining: 73.6ms
10:	learn: 0.4747789	total: 8.99ms	remaining: 72.7ms
11:	learn: 0.4644131	total: 9.84ms	remaining: 72.2ms
12:	learn: 0.4579907	total: 10.6ms	remaining: 71ms
13:	learn: 0.4532641	total: 11.4ms	remaining: 70ms
14:	learn: 0.4454584	total: 12.1ms	remaining: 68.8ms
15:	learn: 0.4401620	total: 12.9ms	remaining: 67.9ms
16:	learn: 0.4344662	total: 13.7ms	remaining: 66.8ms
17:	learn: 0.4308699	total: 14.5ms	remaining: 65.9ms
18:	learn: 0.4270832	total: 15.3ms	remaining: 65.1ms
19:	learn

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


73:	learn: 0.3514957	total: 64ms	remaining: 22.5ms
74:	learn: 0.3511434	total: 64.9ms	remaining: 21.6ms
75:	learn: 0.3508715	total: 66.1ms	remaining: 20.9ms
76:	learn: 0.3504705	total: 67.4ms	remaining: 20.1ms
77:	learn: 0.3499391	total: 68.6ms	remaining: 19.4ms
78:	learn: 0.3493486	total: 69.7ms	remaining: 18.5ms
79:	learn: 0.3488031	total: 70.5ms	remaining: 17.6ms
80:	learn: 0.3483174	total: 71.4ms	remaining: 16.8ms
81:	learn: 0.3480370	total: 72.3ms	remaining: 15.9ms
82:	learn: 0.3469128	total: 73.1ms	remaining: 15ms
83:	learn: 0.3463160	total: 74ms	remaining: 14.1ms
84:	learn: 0.3455775	total: 74.8ms	remaining: 13.2ms
85:	learn: 0.3449280	total: 75.7ms	remaining: 12.3ms
86:	learn: 0.3436881	total: 76.6ms	remaining: 11.4ms
87:	learn: 0.3431120	total: 77.4ms	remaining: 10.6ms
88:	learn: 0.3424810	total: 78.4ms	remaining: 9.68ms
89:	learn: 0.3420940	total: 79.2ms	remaining: 8.79ms
90:	learn: 0.3416419	total: 79.9ms	remaining: 7.9ms
91:	learn: 0.3411697	total: 80.7ms	remaining: 7.02ms


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


8:	learn: 0.4964458	total: 8.28ms	remaining: 83.7ms
9:	learn: 0.4840278	total: 9.44ms	remaining: 85ms
10:	learn: 0.4723800	total: 10.4ms	remaining: 83.9ms
11:	learn: 0.4654881	total: 11.8ms	remaining: 86.3ms
12:	learn: 0.4587558	total: 13.2ms	remaining: 88.3ms
13:	learn: 0.4509717	total: 14.2ms	remaining: 87.4ms
14:	learn: 0.4460766	total: 15.1ms	remaining: 85.4ms
15:	learn: 0.4396100	total: 15.9ms	remaining: 83.4ms
16:	learn: 0.4346171	total: 17.4ms	remaining: 85.1ms
17:	learn: 0.4287901	total: 19.2ms	remaining: 87.7ms
18:	learn: 0.4250312	total: 21ms	remaining: 89.5ms
19:	learn: 0.4212411	total: 22.4ms	remaining: 89.7ms
20:	learn: 0.4171274	total: 23.9ms	remaining: 89.8ms
21:	learn: 0.4137248	total: 25.1ms	remaining: 89.1ms
22:	learn: 0.4107107	total: 26.3ms	remaining: 88.2ms
23:	learn: 0.4086420	total: 27.7ms	remaining: 87.7ms
24:	learn: 0.4060659	total: 28.9ms	remaining: 86.7ms
25:	learn: 0.4042215	total: 30.1ms	remaining: 85.7ms
26:	learn: 0.4013342	total: 31.4ms	remaining: 84.8ms

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6563287	total: 1.47ms	remaining: 146ms
1:	learn: 0.6223003	total: 2.55ms	remaining: 125ms
2:	learn: 0.5961929	total: 3.48ms	remaining: 113ms
3:	learn: 0.5685119	total: 4.29ms	remaining: 103ms
4:	learn: 0.5506872	total: 5.13ms	remaining: 97.5ms
5:	learn: 0.5338392	total: 6.01ms	remaining: 94.1ms
6:	learn: 0.5210480	total: 6.89ms	remaining: 91.6ms
7:	learn: 0.5078624	total: 7.71ms	remaining: 88.7ms
8:	learn: 0.4973310	total: 8.64ms	remaining: 87.4ms
9:	learn: 0.4862239	total: 9.43ms	remaining: 84.9ms
10:	learn: 0.4779945	total: 10.2ms	remaining: 82.8ms
11:	learn: 0.4700756	total: 11ms	remaining: 81ms
12:	learn: 0.4604217	total: 11.9ms	remaining: 79.8ms
13:	learn: 0.4523843	total: 12.7ms	remaining: 78.3ms
14:	learn: 0.4474760	total: 13.9ms	remaining: 78.5ms
15:	learn: 0.4401434	total: 14.7ms	remaining: 77.1ms
16:	learn: 0.4344762	total: 15.5ms	remaining: 75.9ms
17:	learn: 0.4285958	total: 16.4ms	remaining: 74.6ms
18:	learn: 0.4252558	total: 17.2ms	remaining: 73.4ms
19:	learn: 

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


60:	learn: 0.3549733	total: 58.7ms	remaining: 37.6ms
61:	learn: 0.3543056	total: 59.7ms	remaining: 36.6ms
62:	learn: 0.3534605	total: 60.5ms	remaining: 35.5ms
63:	learn: 0.3528434	total: 61.4ms	remaining: 34.5ms
64:	learn: 0.3524377	total: 62.2ms	remaining: 33.5ms
65:	learn: 0.3513330	total: 63ms	remaining: 32.5ms
66:	learn: 0.3504575	total: 63.9ms	remaining: 31.5ms
67:	learn: 0.3498451	total: 64.9ms	remaining: 30.5ms
68:	learn: 0.3491423	total: 65.8ms	remaining: 29.6ms
69:	learn: 0.3484449	total: 66.8ms	remaining: 28.6ms
70:	learn: 0.3478139	total: 67.6ms	remaining: 27.6ms
71:	learn: 0.3471575	total: 68.5ms	remaining: 26.6ms
72:	learn: 0.3464471	total: 69.4ms	remaining: 25.7ms
73:	learn: 0.3457857	total: 70.3ms	remaining: 24.7ms
74:	learn: 0.3450337	total: 71ms	remaining: 23.7ms
75:	learn: 0.3441561	total: 71.9ms	remaining: 22.7ms
76:	learn: 0.3437701	total: 72.7ms	remaining: 21.7ms
77:	learn: 0.3429213	total: 73.6ms	remaining: 20.8ms
78:	learn: 0.3423761	total: 74.6ms	remaining: 19.8

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6537023	total: 905us	remaining: 89.7ms
1:	learn: 0.6197268	total: 1.81ms	remaining: 88.7ms
2:	learn: 0.5874531	total: 2.6ms	remaining: 84ms
3:	learn: 0.5653608	total: 3.44ms	remaining: 82.5ms
4:	learn: 0.5452165	total: 4.23ms	remaining: 80.4ms
5:	learn: 0.5294835	total: 5ms	remaining: 78.4ms
6:	learn: 0.5126903	total: 5.83ms	remaining: 77.4ms
7:	learn: 0.5021556	total: 6.64ms	remaining: 76.4ms
8:	learn: 0.4895882	total: 7.5ms	remaining: 75.9ms
9:	learn: 0.4758397	total: 8.33ms	remaining: 75ms
10:	learn: 0.4641077	total: 9.1ms	remaining: 73.7ms
11:	learn: 0.4569236	total: 9.92ms	remaining: 72.8ms
12:	learn: 0.4487490	total: 10.8ms	remaining: 72.2ms
13:	learn: 0.4411182	total: 11.6ms	remaining: 71.2ms
14:	learn: 0.4335757	total: 12.5ms	remaining: 70.6ms
15:	learn: 0.4279861	total: 13.2ms	remaining: 69.4ms
16:	learn: 0.4219181	total: 14.1ms	remaining: 68.6ms
17:	learn: 0.4177543	total: 14.9ms	remaining: 67.8ms
18:	learn: 0.4132819	total: 15.7ms	remaining: 66.8ms
19:	learn: 0.4

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

Random Forest - Accuracy: 0.8336, F1-Score: 0.7385
Gradient Boosting - Accuracy: 0.8124, F1-Score: 0.6996
XGBoost - Accuracy: 0.8235, F1-Score: 0.7137
LightGBM - Accuracy: 0.8228, F1-Score: 0.7148
CatBoost - Accuracy: 0.8252, F1-Score: 0.7278
SVM - Accuracy: 0.7653, F1-Score: 0.6822
Dummy - Accuracy: 0.5071, F1-Score: 0.4416


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


### K-Fold CV

In [41]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results1_kfold = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1_kfold[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1716, number of negative: 1716
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 3432, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1778, number of negative: 1778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 3556, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1728, number of negative: 1728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 3456, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6536531	total: 1.1ms	remaining: 109ms
1:	learn: 0.6227726	total: 1.89ms	remaining: 92.4ms
2:	learn: 0.5884056	total: 2.66ms	remaining: 86ms
3:	learn: 0.5639487	total: 3.42ms	remaining: 82ms
4:	learn: 0.5394201	total: 4.15ms	remaining: 78.9ms
5:	learn: 0.5217324	total: 4.93ms	remaining: 77.2ms
6:	learn: 0.5069026	total: 5.66ms	remaining: 75.2ms
7:	learn: 0.4902600	total: 6.47ms	remaining: 74.4ms
8:	learn: 0.4794030	total: 7.24ms	remaining: 73.2ms
9:	learn: 0.4703887	total: 8.03ms	remain

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6529720	total: 791us	remaining: 78.4ms
1:	learn: 0.6170222	total: 1.58ms	remaining: 77.5ms
2:	learn: 0.5874868	total: 2.51ms	remaining: 81.3ms
3:	learn: 0.5622815	total: 3.25ms	remaining: 77.9ms
4:	learn: 0.5436234	total: 3.95ms	remaining: 75ms
5:	learn: 0.5282429	total: 4.65ms	remaining: 72.8ms
6:	learn: 0.5077982	total: 5.38ms	remaining: 71.5ms
7:	learn: 0.4919153	total: 6.08ms	remaining: 69.9ms
8:	learn: 0.4823170	total: 6.87ms	remaining: 69.4ms
9:	learn: 0.4678202	total: 7.55ms	remaining: 68ms
10:	learn: 0.4587949	total: 8.34ms	remaining: 67.5ms
11:	learn: 0.4504857	total: 9.08ms	remaining: 66.6ms
12:	learn: 0.4399476	total: 9.82ms	remaining: 65.7ms
13:	learn: 0.4298470	total: 10.5ms	remaining: 64.7ms
14:	learn: 0.4216719	total: 11.2ms	remaining: 63.7ms
15:	learn: 0.4143371	total: 12ms	remaining: 62.8ms
16:	learn: 0.4103875	total: 12.7ms	remaining: 61.9ms
17:	learn: 0.4067765	total: 13.4ms	remaining: 61.2ms
18:	learn: 0.4027866	total: 14.2ms	remaining: 60.4ms
19:	learn:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


38:	learn: 0.3708285	total: 31.2ms	remaining: 48.9ms
39:	learn: 0.3689674	total: 32.1ms	remaining: 48.1ms
40:	learn: 0.3682638	total: 32.9ms	remaining: 47.4ms
41:	learn: 0.3663470	total: 33.8ms	remaining: 46.6ms
42:	learn: 0.3648100	total: 34.6ms	remaining: 45.8ms
43:	learn: 0.3632607	total: 35.4ms	remaining: 45ms
44:	learn: 0.3615907	total: 36.1ms	remaining: 44.1ms
45:	learn: 0.3604952	total: 36.9ms	remaining: 43.3ms
46:	learn: 0.3599562	total: 37.7ms	remaining: 42.5ms
47:	learn: 0.3586697	total: 38.5ms	remaining: 41.7ms
48:	learn: 0.3575145	total: 39.3ms	remaining: 40.9ms
49:	learn: 0.3571062	total: 40ms	remaining: 40ms
50:	learn: 0.3556333	total: 40.8ms	remaining: 39.2ms
51:	learn: 0.3550356	total: 41.6ms	remaining: 38.4ms
52:	learn: 0.3545033	total: 42.4ms	remaining: 37.6ms
53:	learn: 0.3533526	total: 43.1ms	remaining: 36.7ms
54:	learn: 0.3521534	total: 43.8ms	remaining: 35.9ms
55:	learn: 0.3513438	total: 44.6ms	remaining: 35.1ms
56:	learn: 0.3509386	total: 45.5ms	remaining: 34.4ms

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())



Random Forest - Accuracy: 0.8259, F1-Score: 0.7897
Gradient Boosting - Accuracy: 0.8139, F1-Score: 0.7678
XGBoost - Accuracy: 0.8121, F1-Score: 0.7667
LightGBM - Accuracy: 0.8145, F1-Score: 0.7711
CatBoost - Accuracy: 0.8139, F1-Score: 0.7742
SVM - Accuracy: 0.7708, F1-Score: 0.7357
Dummy - Accuracy: 0.4747, F1-Score: 0.4475


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


## Label: LONG_INTERACTION

In [42]:
# Label distribution before balancing
print(encoded_data['LONG_INTERACTION_availability'].value_counts())

LONG_INTERACTION_availability
False    1468
True     1433
Name: count, dtype: int64


### Model building and LOSO CV

In [43]:
# Prepare features (X) and target variable (y) for predicting LONG_INTERACTION
X = encoded_data.drop(columns=['LONG_INTERACTION_availability', 'SHORT_INTERACTION_availability','uid'], axis=1)
y = encoded_data['LONG_INTERACTION_availability']
groups = encoded_data['uid']

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models2 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results2_logo = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2_logo[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1425, number of negative: 1425
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2850, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1444, number of negative: 1444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2888, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1365, number of negative: 1365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2730, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1443, number of negative: 1443
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2886, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1458, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2916, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1452, number of negative: 1452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2904, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1402, number of negative: 1402
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2804, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1406, number of negative: 1406
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2812, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1441, number of negative: 1441
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2882, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1417, number of negative: 1417
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2834, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1423, number of negative: 1423
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2846, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1439, number of negative: 1439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2878, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1398, number of negative: 1398
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2796, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1414, number of negative: 1414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2828, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1424, number of negative: 1424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 2848, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1445, number of negative: 1445
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2890, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6728474	total: 1.17ms	remaining: 116ms
1:	learn: 0.6569232	total: 2.25ms	remaining: 110ms
2:	learn: 0.6434571	total: 3.19ms	remaining: 103ms
3:	learn: 0.6337466	total: 4.06ms	remaining: 97.5ms
4:	learn: 0.6217621	total: 5.03ms	remaining: 95.5ms
5:	learn: 0.6149952	total: 5.94ms	remaining: 93.1ms
6:	learn: 0.6061168	total: 6.97ms	remaining: 92.7ms
7:	learn: 0.5991345	total: 7.92ms	remaining: 91.1ms
8:	learn: 0.5924261	total: 8.94ms	remaining: 90.4ms
9:	learn: 0.5863084	total: 9.97ms	remaining: 89.7ms
10:	learn: 0.5798708	total: 11ms	remaining: 88.9ms
11:	learn: 0.5764239	total: 11.8ms	remaining: 86.4ms
12:	learn: 0.5734171	total: 12.5ms	remaining: 84ms
13:	learn: 0.5686116	total: 14ms	remaining: 85.7ms
14:	learn: 0.5656370	total: 15ms	remaining: 85.2ms
15:	learn: 0.5615597	total: 16ms	remaining: 83.9ms
16:	learn: 0.5580848	total: 16.8ms	remaining: 82ms
17:	learn: 0.5550347	total: 17.9ms	remaining: 81.6ms
18:	learn: 0.5529056	total: 18.8ms	remaining: 80.3ms
19:	learn: 0.55031

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


65:	learn: 0.5088966	total: 50.1ms	remaining: 25.8ms
66:	learn: 0.5079816	total: 51ms	remaining: 25.1ms
67:	learn: 0.5075545	total: 52ms	remaining: 24.5ms
68:	learn: 0.5071037	total: 52.9ms	remaining: 23.8ms
69:	learn: 0.5065997	total: 53.9ms	remaining: 23.1ms
70:	learn: 0.5059539	total: 54.7ms	remaining: 22.4ms
71:	learn: 0.5054788	total: 55.6ms	remaining: 21.6ms
72:	learn: 0.5050664	total: 56.4ms	remaining: 20.9ms
73:	learn: 0.5043205	total: 57.1ms	remaining: 20.1ms
74:	learn: 0.5037865	total: 57.9ms	remaining: 19.3ms
75:	learn: 0.5034465	total: 58.6ms	remaining: 18.5ms
76:	learn: 0.5028113	total: 59.4ms	remaining: 17.7ms
77:	learn: 0.5023460	total: 60.1ms	remaining: 17ms
78:	learn: 0.5019032	total: 60.9ms	remaining: 16.2ms
79:	learn: 0.5015965	total: 61.7ms	remaining: 15.4ms
80:	learn: 0.5012766	total: 62.4ms	remaining: 14.6ms
81:	learn: 0.5009171	total: 63.2ms	remaining: 13.9ms
82:	learn: 0.5006772	total: 63.9ms	remaining: 13.1ms
83:	learn: 0.5001551	total: 64.7ms	remaining: 12.3ms

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6726471	total: 830us	remaining: 82.2ms
1:	learn: 0.6549100	total: 1.6ms	remaining: 78.4ms
2:	learn: 0.6403111	total: 2.3ms	remaining: 74.4ms
3:	learn: 0.6281896	total: 2.99ms	remaining: 71.8ms
4:	learn: 0.6177638	total: 3.67ms	remaining: 69.7ms
5:	learn: 0.6084371	total: 4.37ms	remaining: 68.5ms
6:	learn: 0.6010205	total: 5.04ms	remaining: 67ms
7:	learn: 0.5946789	total: 5.78ms	remaining: 66.5ms
8:	learn: 0.5866462	total: 6.49ms	remaining: 65.6ms
9:	learn: 0.5813165	total: 7.23ms	remaining: 65.1ms
10:	learn: 0.5761641	total: 7.98ms	remaining: 64.6ms
11:	learn: 0.5722310	total: 8.69ms	remaining: 63.7ms
12:	learn: 0.5680694	total: 9.43ms	remaining: 63.1ms
13:	learn: 0.5629218	total: 10.2ms	remaining: 62.4ms
14:	learn: 0.5595088	total: 10.9ms	remaining: 61.6ms
15:	learn: 0.5555085	total: 11.6ms	remaining: 61ms
16:	learn: 0.5526894	total: 12.4ms	remaining: 60.4ms
17:	learn: 0.5499873	total: 13.1ms	remaining: 59.7ms
18:	learn: 0.5477593	total: 13.8ms	remaining: 58.9ms
19:	learn:

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6737706	total: 877us	remaining: 86.8ms
1:	learn: 0.6603303	total: 1.38ms	remaining: 67.8ms
2:	learn: 0.6467684	total: 2.05ms	remaining: 66.4ms
3:	learn: 0.6356922	total: 2.73ms	remaining: 65.7ms
4:	learn: 0.6244514	total: 3.42ms	remaining: 64.9ms
5:	learn: 0.6156525	total: 4.07ms	remaining: 63.8ms
6:	learn: 0.6077385	total: 4.78ms	remaining: 63.6ms
7:	learn: 0.6004900	total: 5.47ms	remaining: 63ms
8:	learn: 0.5942929	total: 6.17ms	remaining: 62.4ms
9:	learn: 0.5892715	total: 6.9ms	remaining: 62.1ms
10:	learn: 0.5821713	total: 7.58ms	remaining: 61.4ms
11:	learn: 0.5784751	total: 8.3ms	remaining: 60.8ms
12:	learn: 0.5744053	total: 9.01ms	remaining: 60.3ms
13:	learn: 0.5698985	total: 9.69ms	remaining: 59.5ms
14:	learn: 0.5668630	total: 10.5ms	remaining: 59.3ms
15:	learn: 0.5636829	total: 11.3ms	remaining: 59.1ms
16:	learn: 0.5602172	total: 12ms	remaining: 58.6ms
17:	learn: 0.5569798	total: 12.8ms	remaining: 58.2ms
18:	learn: 0.5547667	total: 13.5ms	remaining: 57.7ms
19:	learn:

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


58:	learn: 0.5110509	total: 48.5ms	remaining: 33.7ms
59:	learn: 0.5107386	total: 49.4ms	remaining: 33ms
60:	learn: 0.5102016	total: 50.2ms	remaining: 32.1ms
61:	learn: 0.5098758	total: 51.3ms	remaining: 31.4ms
62:	learn: 0.5094655	total: 52.2ms	remaining: 30.7ms
63:	learn: 0.5089188	total: 53.2ms	remaining: 29.9ms
64:	learn: 0.5084796	total: 54.1ms	remaining: 29.1ms
65:	learn: 0.5080978	total: 54.9ms	remaining: 28.3ms
66:	learn: 0.5077378	total: 55.7ms	remaining: 27.4ms
67:	learn: 0.5067774	total: 56.5ms	remaining: 26.6ms
68:	learn: 0.5064569	total: 57.3ms	remaining: 25.7ms
69:	learn: 0.5055375	total: 58.1ms	remaining: 24.9ms
70:	learn: 0.5047198	total: 58.9ms	remaining: 24ms
71:	learn: 0.5043448	total: 59.6ms	remaining: 23.2ms
72:	learn: 0.5038259	total: 60.4ms	remaining: 22.3ms
73:	learn: 0.5032607	total: 61.1ms	remaining: 21.5ms
74:	learn: 0.5029134	total: 61.9ms	remaining: 20.6ms
75:	learn: 0.5023789	total: 62.7ms	remaining: 19.8ms
76:	learn: 0.5019397	total: 63.6ms	remaining: 19ms

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6742273	total: 892us	remaining: 88.3ms
1:	learn: 0.6537273	total: 1.61ms	remaining: 79.1ms
2:	learn: 0.6371123	total: 2.35ms	remaining: 76.1ms
3:	learn: 0.6255034	total: 3.06ms	remaining: 73.3ms
4:	learn: 0.6138807	total: 3.73ms	remaining: 70.8ms
5:	learn: 0.6049889	total: 4.44ms	remaining: 69.5ms
6:	learn: 0.5942565	total: 5.12ms	remaining: 68ms
7:	learn: 0.5868694	total: 5.84ms	remaining: 67.2ms
8:	learn: 0.5803664	total: 6.56ms	remaining: 66.3ms
9:	learn: 0.5751180	total: 7.28ms	remaining: 65.5ms
10:	learn: 0.5685418	total: 8.01ms	remaining: 64.8ms
11:	learn: 0.5626521	total: 8.7ms	remaining: 63.8ms
12:	learn: 0.5596744	total: 9.38ms	remaining: 62.8ms
13:	learn: 0.5559016	total: 10.1ms	remaining: 62.2ms
14:	learn: 0.5518676	total: 10.8ms	remaining: 61.4ms
15:	learn: 0.5491761	total: 11.6ms	remaining: 60.8ms
16:	learn: 0.5455019	total: 12.4ms	remaining: 60.4ms
17:	learn: 0.5431766	total: 13.1ms	remaining: 59.6ms
18:	learn: 0.5404428	total: 13.8ms	remaining: 58.9ms
19:	lea

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6748476	total: 844us	remaining: 83.6ms
1:	learn: 0.6572455	total: 1.59ms	remaining: 78ms
2:	learn: 0.6435172	total: 2.29ms	remaining: 74.1ms
3:	learn: 0.6301061	total: 2.98ms	remaining: 71.6ms
4:	learn: 0.6173365	total: 3.71ms	remaining: 70.5ms
5:	learn: 0.6070298	total: 4.47ms	remaining: 70ms
6:	learn: 0.5987754	total: 5.16ms	remaining: 68.6ms
7:	learn: 0.5907567	total: 5.84ms	remaining: 67.2ms
8:	learn: 0.5844378	total: 6.58ms	remaining: 66.5ms
9:	learn: 0.5786278	total: 7.25ms	remaining: 65.3ms
10:	learn: 0.5724043	total: 7.96ms	remaining: 64.4ms
11:	learn: 0.5698764	total: 8.48ms	remaining: 62.2ms
12:	learn: 0.5648238	total: 9.15ms	remaining: 61.3ms
13:	learn: 0.5613392	total: 9.88ms	remaining: 60.7ms
14:	learn: 0.5584593	total: 10.5ms	remaining: 59.8ms
15:	learn: 0.5546959	total: 11.3ms	remaining: 59.1ms
16:	learn: 0.5522554	total: 12ms	remaining: 58.6ms
17:	learn: 0.5493932	total: 12.8ms	remaining: 58.2ms
18:	learn: 0.5464846	total: 13.5ms	remaining: 57.4ms
19:	learn:

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


79:	learn: 0.5029757	total: 61.6ms	remaining: 15.4ms
80:	learn: 0.5027051	total: 62.5ms	remaining: 14.7ms
81:	learn: 0.5024371	total: 63.6ms	remaining: 14ms
82:	learn: 0.5018052	total: 64.4ms	remaining: 13.2ms
83:	learn: 0.5015350	total: 65.1ms	remaining: 12.4ms
84:	learn: 0.5011118	total: 65.8ms	remaining: 11.6ms
85:	learn: 0.5005603	total: 66.6ms	remaining: 10.8ms
86:	learn: 0.5001469	total: 67.2ms	remaining: 10ms
87:	learn: 0.4999287	total: 67.9ms	remaining: 9.26ms
88:	learn: 0.4996711	total: 68.6ms	remaining: 8.48ms
89:	learn: 0.4992855	total: 69.3ms	remaining: 7.71ms
90:	learn: 0.4986632	total: 70.1ms	remaining: 6.93ms
91:	learn: 0.4982009	total: 70.8ms	remaining: 6.15ms
92:	learn: 0.4979062	total: 71.4ms	remaining: 5.38ms
93:	learn: 0.4975310	total: 72.2ms	remaining: 4.61ms
94:	learn: 0.4970791	total: 72.9ms	remaining: 3.84ms
95:	learn: 0.4965722	total: 73.6ms	remaining: 3.07ms
96:	learn: 0.4962271	total: 74.3ms	remaining: 2.3ms
97:	learn: 0.4957352	total: 75ms	remaining: 1.53ms


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6738419	total: 901us	remaining: 89.2ms
1:	learn: 0.6574743	total: 1.72ms	remaining: 84.5ms
2:	learn: 0.6441921	total: 2.55ms	remaining: 82.3ms
3:	learn: 0.6321186	total: 3.31ms	remaining: 79.6ms
4:	learn: 0.6206724	total: 4.07ms	remaining: 77.4ms
5:	learn: 0.6110484	total: 4.83ms	remaining: 75.6ms
6:	learn: 0.6028525	total: 5.56ms	remaining: 73.8ms
7:	learn: 0.5963835	total: 6.32ms	remaining: 72.7ms
8:	learn: 0.5898165	total: 7.06ms	remaining: 71.4ms
9:	learn: 0.5842625	total: 7.83ms	remaining: 70.4ms
10:	learn: 0.5778239	total: 8.52ms	remaining: 68.9ms
11:	learn: 0.5738061	total: 9.23ms	remaining: 67.7ms
12:	learn: 0.5681406	total: 9.97ms	remaining: 66.7ms
13:	learn: 0.5644253	total: 10.7ms	remaining: 65.6ms
14:	learn: 0.5611076	total: 11.4ms	remaining: 64.8ms
15:	learn: 0.5589316	total: 12.1ms	remaining: 63.7ms
16:	learn: 0.5565212	total: 13ms	remaining: 63.7ms
17:	learn: 0.5534225	total: 13.8ms	remaining: 62.8ms
18:	learn: 0.5512853	total: 14.5ms	remaining: 61.8ms
19:	le

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6710035	total: 897us	remaining: 88.8ms
1:	learn: 0.6545732	total: 1.64ms	remaining: 80.5ms
2:	learn: 0.6422235	total: 2.32ms	remaining: 74.9ms
3:	learn: 0.6315317	total: 3.01ms	remaining: 72.2ms
4:	learn: 0.6228412	total: 3.68ms	remaining: 69.9ms
5:	learn: 0.6153912	total: 4.41ms	remaining: 69.1ms
6:	learn: 0.6070677	total: 5.11ms	remaining: 67.8ms
7:	learn: 0.6003154	total: 5.83ms	remaining: 67ms
8:	learn: 0.5946984	total: 6.5ms	remaining: 65.8ms
9:	learn: 0.5875919	total: 7.17ms	remaining: 64.5ms
10:	learn: 0.5821841	total: 7.86ms	remaining: 63.6ms
11:	learn: 0.5789460	total: 8.53ms	remaining: 62.5ms
12:	learn: 0.5746493	total: 9.23ms	remaining: 61.7ms
13:	learn: 0.5699093	total: 9.96ms	remaining: 61.2ms
14:	learn: 0.5657457	total: 10.7ms	remaining: 60.5ms
15:	learn: 0.5629685	total: 11.4ms	remaining: 59.6ms
16:	learn: 0.5602990	total: 12.1ms	remaining: 58.9ms
17:	learn: 0.5573862	total: 12.8ms	remaining: 58.2ms
18:	learn: 0.5550977	total: 13.5ms	remaining: 57.5ms
19:	lea

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


81:	learn: 0.4972753	total: 62.7ms	remaining: 13.8ms
82:	learn: 0.4967452	total: 63.7ms	remaining: 13ms
83:	learn: 0.4964510	total: 64.5ms	remaining: 12.3ms
84:	learn: 0.4961055	total: 65.3ms	remaining: 11.5ms
85:	learn: 0.4953676	total: 66.1ms	remaining: 10.8ms
86:	learn: 0.4950969	total: 66.8ms	remaining: 9.99ms
87:	learn: 0.4948062	total: 67.6ms	remaining: 9.22ms
88:	learn: 0.4944112	total: 68.4ms	remaining: 8.45ms
89:	learn: 0.4939387	total: 69.2ms	remaining: 7.68ms
90:	learn: 0.4935929	total: 69.9ms	remaining: 6.92ms
91:	learn: 0.4927927	total: 70.7ms	remaining: 6.15ms
92:	learn: 0.4923469	total: 71.6ms	remaining: 5.39ms
93:	learn: 0.4920596	total: 72.3ms	remaining: 4.62ms
94:	learn: 0.4915482	total: 73.1ms	remaining: 3.85ms
95:	learn: 0.4909074	total: 73.8ms	remaining: 3.08ms
96:	learn: 0.4904872	total: 74.6ms	remaining: 2.31ms
97:	learn: 0.4902059	total: 75.4ms	remaining: 1.54ms
98:	learn: 0.4899581	total: 76.2ms	remaining: 769us
99:	learn: 0.4897047	total: 76.9ms	remaining: 0us

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6724731	total: 846us	remaining: 83.8ms
1:	learn: 0.6572044	total: 1.71ms	remaining: 83.7ms
2:	learn: 0.6441163	total: 2.44ms	remaining: 79ms
3:	learn: 0.6329796	total: 3.22ms	remaining: 77.3ms
4:	learn: 0.6228248	total: 3.94ms	remaining: 74.8ms
5:	learn: 0.6164994	total: 4.54ms	remaining: 71.1ms
6:	learn: 0.6085593	total: 5.24ms	remaining: 69.6ms
7:	learn: 0.5995769	total: 5.99ms	remaining: 68.9ms
8:	learn: 0.5929013	total: 6.76ms	remaining: 68.3ms
9:	learn: 0.5867412	total: 7.5ms	remaining: 67.5ms
10:	learn: 0.5811787	total: 8.19ms	remaining: 66.3ms
11:	learn: 0.5770062	total: 8.99ms	remaining: 66ms
12:	learn: 0.5722840	total: 9.74ms	remaining: 65.2ms
13:	learn: 0.5688525	total: 10.8ms	remaining: 66.1ms
14:	learn: 0.5655146	total: 11.8ms	remaining: 66.7ms
15:	learn: 0.5616025	total: 12.9ms	remaining: 67.5ms
16:	learn: 0.5581219	total: 13.8ms	remaining: 67.3ms
17:	learn: 0.5557634	total: 14.4ms	remaining: 65.7ms
18:	learn: 0.5538586	total: 15ms	remaining: 63.9ms
19:	learn: 

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6733803	total: 838us	remaining: 83ms
1:	learn: 0.6561263	total: 1.66ms	remaining: 81.2ms
2:	learn: 0.6434857	total: 2.36ms	remaining: 76.4ms
3:	learn: 0.6338190	total: 3.12ms	remaining: 74.9ms
4:	learn: 0.6228912	total: 3.81ms	remaining: 72.4ms
5:	learn: 0.6144509	total: 4.46ms	remaining: 69.9ms
6:	learn: 0.6059800	total: 5.17ms	remaining: 68.6ms
7:	learn: 0.5980244	total: 5.85ms	remaining: 67.3ms
8:	learn: 0.5922831	total: 6.53ms	remaining: 66.1ms
9:	learn: 0.5868879	total: 7.21ms	remaining: 64.9ms
10:	learn: 0.5807013	total: 7.9ms	remaining: 63.9ms
11:	learn: 0.5769272	total: 8.56ms	remaining: 62.8ms
12:	learn: 0.5726039	total: 9.29ms	remaining: 62.1ms
13:	learn: 0.5694566	total: 10.1ms	remaining: 62.1ms
14:	learn: 0.5666418	total: 10.9ms	remaining: 61.6ms
15:	learn: 0.5630791	total: 11.7ms	remaining: 61.3ms
16:	learn: 0.5605786	total: 12.4ms	remaining: 60.6ms
17:	learn: 0.5579614	total: 13.1ms	remaining: 59.9ms
18:	learn: 0.5559744	total: 13.8ms	remaining: 59ms
19:	learn

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


93:	learn: 0.4957575	total: 76.2ms	remaining: 4.86ms
94:	learn: 0.4950598	total: 77.1ms	remaining: 4.06ms
95:	learn: 0.4947518	total: 78ms	remaining: 3.25ms
96:	learn: 0.4942765	total: 78.9ms	remaining: 2.44ms
97:	learn: 0.4938628	total: 79.6ms	remaining: 1.63ms
98:	learn: 0.4932895	total: 80.4ms	remaining: 811us
99:	learn: 0.4930548	total: 81ms	remaining: 0us
0:	learn: 0.6731870	total: 879us	remaining: 87.1ms
1:	learn: 0.6599435	total: 1.62ms	remaining: 79.3ms
2:	learn: 0.6486434	total: 2.32ms	remaining: 75ms
3:	learn: 0.6348114	total: 3.02ms	remaining: 72.5ms
4:	learn: 0.6230028	total: 3.75ms	remaining: 71.2ms
5:	learn: 0.6146193	total: 4.42ms	remaining: 69.2ms
6:	learn: 0.6057363	total: 5.15ms	remaining: 68.4ms
7:	learn: 0.5975556	total: 5.87ms	remaining: 67.5ms
8:	learn: 0.5953176	total: 6.27ms	remaining: 63.4ms
9:	learn: 0.5894972	total: 6.97ms	remaining: 62.7ms
10:	learn: 0.5842878	total: 7.67ms	remaining: 62.1ms
11:	learn: 0.5789721	total: 8.37ms	remaining: 61.4ms
12:	learn: 0.5

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r


Random Forest - Accuracy: 0.6841, F1-Score: 0.6307
Gradient Boosting - Accuracy: 0.6674, F1-Score: 0.6008
XGBoost - Accuracy: 0.6735, F1-Score: 0.6083
LightGBM - Accuracy: 0.6775, F1-Score: 0.6090
CatBoost - Accuracy: 0.6915, F1-Score: 0.6166
SVM - Accuracy: 0.6502, F1-Score: 0.6012
Dummy - Accuracy: 0.4879, F1-Score: 0.4555


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


### K-Fold CV

In [44]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results2_kfold = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2_kfold[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1188, number of negative: 1188
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2376, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1173, number of negative: 1173
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2346, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1201, number of negative: 1201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2402, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6715702	total: 1.07ms	remaining: 106ms
1:	learn: 0.6538396	total: 2.01ms	remaining: 98.6ms
2:	learn: 0.6392834	total: 2.9ms	remaining: 93.7ms
3:	learn: 0.6272745	total: 3.77ms	remaining: 90.5ms
4:	learn: 0.6170082	total: 4.56ms	remaining: 86.7ms
5:	learn: 0.6092339	total: 5.28ms	remaining: 82.7ms
6:	learn: 0.6027320	total: 6.13ms	remaining: 81.5ms
7:	learn: 0.5971089	total: 6.8ms	remaining: 78.2ms
8:	learn: 0.5889696	total: 7.48ms	remaining: 75.6ms
9:	learn: 0.5837884	total: 8.37ms	rem

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


19:	learn: 0.5400372	total: 16.7ms	remaining: 66.8ms
20:	learn: 0.5378518	total: 17.6ms	remaining: 66ms
21:	learn: 0.5359611	total: 18.4ms	remaining: 65.2ms
22:	learn: 0.5342366	total: 19.3ms	remaining: 64.8ms
23:	learn: 0.5328386	total: 20.1ms	remaining: 63.6ms
24:	learn: 0.5321067	total: 20.8ms	remaining: 62.4ms
25:	learn: 0.5312819	total: 21.5ms	remaining: 61.2ms
26:	learn: 0.5296831	total: 22.2ms	remaining: 60.1ms
27:	learn: 0.5283529	total: 22.9ms	remaining: 58.9ms
28:	learn: 0.5264427	total: 23.6ms	remaining: 57.7ms
29:	learn: 0.5254465	total: 24.4ms	remaining: 57ms
30:	learn: 0.5243280	total: 25.3ms	remaining: 56.3ms
31:	learn: 0.5229724	total: 26ms	remaining: 55.2ms
32:	learn: 0.5217923	total: 26.7ms	remaining: 54.1ms
33:	learn: 0.5211452	total: 27.4ms	remaining: 53.1ms
34:	learn: 0.5199951	total: 28.1ms	remaining: 52.2ms
35:	learn: 0.5187001	total: 28.9ms	remaining: 51.3ms
36:	learn: 0.5184780	total: 29.2ms	remaining: 49.7ms
37:	learn: 0.5175379	total: 29.9ms	remaining: 48.8ms

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6741379	total: 1ms	remaining: 99.2ms
1:	learn: 0.6571544	total: 1.77ms	remaining: 86.5ms
2:	learn: 0.6448336	total: 2.64ms	remaining: 85.4ms
3:	learn: 0.6332969	total: 3.44ms	remaining: 82.5ms
4:	learn: 0.6251638	total: 4.16ms	remaining: 79.1ms
5:	learn: 0.6178365	total: 4.99ms	remaining: 78.2ms
6:	learn: 0.6104293	total: 5.72ms	remaining: 75.9ms
7:	learn: 0.6044367	total: 6.43ms	remaining: 73.9ms
8:	learn: 0.5991271	total: 7.14ms	remaining: 72.2ms
9:	learn: 0.5935131	total: 7.83ms	remaining: 70.4ms
10:	learn: 0.5887796	total: 8.58ms	remaining: 69.4ms
11:	learn: 0.5843930	total: 9.27ms	remaining: 68ms
12:	learn: 0.5785885	total: 10ms	remaining: 66.9ms
13:	learn: 0.5752997	total: 10.8ms	remaining: 66.3ms
14:	learn: 0.5715743	total: 11.5ms	remaining: 65.3ms
15:	learn: 0.5676404	total: 12.2ms	remaining: 64ms
16:	learn: 0.5643894	total: 12.9ms	remaining: 63ms
17:	learn: 0.5626147	total: 13.7ms	remaining: 62.5ms
18:	learn: 0.5593796	total: 14.4ms	remaining: 61.6ms
19:	learn: 0.5

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())



Random Forest - Accuracy: 0.6649, F1-Score: 0.6609
Gradient Boosting - Accuracy: 0.6612, F1-Score: 0.6568
XGBoost - Accuracy: 0.6687, F1-Score: 0.6641
LightGBM - Accuracy: 0.6601, F1-Score: 0.6553
CatBoost - Accuracy: 0.6680, F1-Score: 0.6637
SVM - Accuracy: 0.6060, F1-Score: 0.6042
Dummy - Accuracy: 0.4854, F1-Score: 0.4835


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


In [49]:
from tabulate import tabulate

# Models list
models = [
    'Random Forest', 'Gradient Boosting', 'XGBoost',
    'LightGBM', 'CatBoost', 'SVM', 'Dummy'
]

# Prepare table data
table_data = []
for model in models:
    row = [model]
    
    # Shell 1: SHORT_INTERACTION, Leave-One-Group-Out
    row.append(results1_logo.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results1_logo.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    # Shell 3: LONG_INTERACTION, Leave-One-Group-Out
    row.append(results2_logo.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results2_logo.get(model, {}).get('Average F1-Score (macro)', 0.0))

    # Shell 2: SHORT_INTERACTION, 5-fold
    row.append(results1_kfold.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results1_kfold.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    # Shell 4: LONG_INTERACTION, 5-fold
    row.append(results2_kfold.get(model, {}).get('Average Accuracy', 0.0))
    row.append(results2_kfold.get(model, {}).get('Average F1-Score (macro)', 0.0))
    
    table_data.append(row)

# Define headers
headers = [
    'Model',
    'LOSO CV\nShort\nInteraction\nAccuracy', 'LOSO CV\nShort\nInteraction\nF1',
    'LOSO CV\nLong\nInteraction\nAccuracy', 'LOSO CV\nLong\nInteraction\nF1',
    '5-fold CV\nShort\nInteraction\nAccuracy', '5-fold CV\nShort\nInteraction\nF1',
    '5-fold CV\nLong\nInteraction\nAccuracy', '5-fold CV\nLong\nInteraction\nF1'
]

# Print the table
print("Machine learning model performance")
print(tabulate(table_data, headers=headers, tablefmt='fancy_grid', floatfmt='.3f', numalign="decimal"))


Machine learning model performance
╒═══════════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╕
│ Model             │       LOSO CV │       LOSO CV │       LOSO CV │       LOSO CV │     5-fold CV │     5-fold CV │     5-fold CV │     5-fold CV │
│                   │         Short │         Short │          Long │          Long │         Short │         Short │          Long │          Long │
│                   │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │
│                   │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │
╞═══════════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Random Forest     │         0.834 │         0.738 │         0.6