# Settings

## Constants

In [1]:
import os

PATH_DATA = './Dataset'
sensor_data = ['UserInfo.csv', 'Service.csv', 'ContextualFactor.csv', 'Availability.csv']

# Utility Functions

In [2]:
# Install required packages
!pip install -q scikit-learn catboost
!pip install -q scikit-learn catboost xgboost lightgbm imbalanced-learn

# === Core Libraries ===
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

# Set timezone to UTC+9 for consistent time handling
tz = timezone(+timedelta(hours=9))

# === Preprocessing and Utilities ===
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# === Model Selection and Evaluation ===
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score

# === Machine Learning Models ===
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

# === Handling Imbalanced Data ===
from imblearn.over_sampling import SMOTE


# Load the Dataset into Dataframe

In [3]:
dataframes = {
    filename: pd.read_csv(os.path.join(PATH_DATA, filename)).reset_index(drop=True)
    for filename in sensor_data
}
dfService = dataframes['Service.csv']
dfContextualFactor = dataframes['ContextualFactor.csv']
dfUserInfo = dataframes['UserInfo.csv']
dfAvailability = dataframes['Availability.csv']

# Preprocessing

In [5]:
# Select specific columns from dfContextualFactor, dfService, and dfUserInfo DataFrames
# Combine relevant columns to create a unified dataset
dfContextualFactor_selected_columns=dfContextualFactor[['uid','sid','activity1','activity2','activity3','userRoom','userPosition',]]
dfService_selected_columns=dfService[['weekOfExperiment','dayOfWeek','startTime', 'activityInquiry','availabilityInquiry','speechShadowing','continue-to-nextInquiry_1','continue-to-nextInquiry_2','continue-to-nextInquiry_3','continue-to-nextInquiry_4','endTime','endType']]
dfAvailability_selected_columns=dfAvailability[['SHORT_INTERACTION_availability', 'LONG_INTERACTION_availability']]

## Create binary columns for interaction types
## SHORT_INTERACTION: True if availabilityInquiry is not NaN
# dfCombinedAll['SHORT_INTERACTION_availability'] = dfCombinedAll['availabilityInquiry'].notna()
## LONG_INTERACTION: True if continue-to-nextInquiry_1 is not NaN
# dfCombinedAll['LONG_INTERACTION_availability'] = dfCombinedAll['continue-to-nextInquiry_1'].notna()

# ! For those who want to redefine LONG_INTERACTION with thresholds longer than 3 minutes,
# you can use the following columns:
# 5 minutes  => use continue-to-nextInquiry_2
# 7 minutes  => use continue-to-nextInquiry_3
# 9 minutes  => use continue-to-nextInquiry_4


dfCombinedAll=pd.concat([dfContextualFactor_selected_columns, dfService_selected_columns,dfAvailability_selected_columns], axis=1)

In [7]:
# Concatenate and merge the all columns from dfUserInfo (dfUserInfo.csv)
dfUserInfo_selected_columns = dfUserInfo[['uid', 'homeType', 'speakerRoom', 'speakerPosition']]
dfCombinedAll = pd.merge(dfCombinedAll, dfUserInfo_selected_columns, on='uid', how='left')

## Position Processing

In [10]:
# Define function to calculate proximity between user and speaker
def calculate_proximity(row):
    # Return 0 if user and speaker are in different rooms
    if row['userRoom'] != row['speakerRoom']:
        return 0
    # If in the same room
    elif row['userPosition'] == row['speakerPosition']:
        return 2  # Same position
    else:
        return 1  # Different positions (including missing position)

# Apply proximity calculation to create a new 'proximity' column
dfCombinedAll['proximity'] = dfCombinedAll.apply(calculate_proximity, axis=1)

print(dfCombinedAll[['userRoom', 'userPosition', 'speakerRoom', 'speakerPosition', 'proximity']].head(10))


      userRoom userPosition speakerRoom speakerPosition  proximity
0     Bed Room          Bed    Bed Room            Desk          1
1    Rest Room          NaN    Bed Room            Desk          0
2  Living Room          NaN    Bed Room            Desk          0
3  Living Room          NaN    Bed Room            Desk          0
4     Bed Room          Bed    Bed Room            Desk          1
5     Bed Room          Bed    Bed Room            Desk          1
6     Bed Room          Bed    Bed Room            Desk          1
7     Bed Room         Desk    Bed Room            Desk          2
8     Bed Room          Bed    Bed Room            Desk          1
9     Bed Room          Bed    Bed Room            Desk          1


## Activity and Time Processing

In [11]:
# Process activity columns for one-hot encoding
activity_cols = ['activity1', 'activity2', 'activity3']
df_activity = dfContextualFactor[activity_cols].copy()

# Get unique activities across all activity columns, excluding NaN
all_unique_activities = pd.unique(df_activity.values.ravel())
all_unique_activities = [x for x in all_unique_activities if pd.notna(x)]

# Create a DataFrame for one-hot encoding of activities
dfActivity_one_hot_encoding = pd.DataFrame(0, index=df_activity.index, columns=['act_' + str(val) for val in all_unique_activities])

# Perform one-hot encoding for each activity column
for col in activity_cols:
    for val in all_unique_activities:
        dfActivity_one_hot_encoding['act_' + str(val)] |= (df_activity[col] == val).astype(int)

# Concatenate one-hot encoded activity columns to dfCombinedAll
dfActivity_one_hot_encoding
dfCombinedAll = pd.concat([dfCombinedAll, dfActivity_one_hot_encoding], axis=1)

In [12]:
# Convert startTime to datetime and extract total minutes since midnight
dfCombinedAll['startTime'] = pd.to_datetime(dfCombinedAll['startTime'], format='%H:%M:%S', errors='coerce')
dfCombinedAll['minute'] = dfCombinedAll['startTime'].dt.hour * 60 + dfCombinedAll['startTime'].dt.minute

# Map days of the week to numerical values (MON=0, TUE=1, ..., SUN=6)
day_map = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6}
dfCombinedAll['dayOfWeek'] = dfCombinedAll['dayOfWeek'].map(day_map)

# Bin minutes into 30-minute intervals for temporal analysis
dfCombinedAll['minute_bin'] = (dfCombinedAll['minute'] // 30).astype(int)
dfCombinedAll.head()

Unnamed: 0,uid,sid,activity1,activity2,activity3,userRoom,userPosition,weekOfExperiment,dayOfWeek,startTime,...,act_Social Interaction,act_Returning from Outside / Other Rooms,act_Studying / Working,act_Others,act_House Chores,act_Self Caring,act_Visiting Outside / Other Rooms,act_Resting,minute,minute_bin
0,1,1,Taking a Nap / Sleeping,,,Bed Room,Bed,1,0,1900-01-01 09:00:26,...,0,0,0,0,0,0,0,0,540,18
1,1,2,Hygiene,,,Rest Room,,1,0,1900-01-01 10:34:26,...,0,0,0,0,0,0,0,0,634,21
2,1,3,Eating,,,Living Room,,1,1,1900-01-01 09:00:46,...,0,0,0,0,0,0,0,0,540,18
3,1,4,Eating,,,Living Room,,1,1,1900-01-01 09:44:46,...,0,0,0,0,0,0,0,0,584,19
4,1,5,Taking a Nap / Sleeping,,,Bed Room,Bed,1,1,1900-01-01 12:04:48,...,0,0,0,0,0,0,0,0,724,24


In [13]:
dfCombinedAll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2905 entries, 0 to 2904
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   uid                                       2905 non-null   int64         
 1   sid                                       2905 non-null   int64         
 2   activity1                                 2905 non-null   object        
 3   activity2                                 112 non-null    object        
 4   activity3                                 3 non-null      object        
 5   userRoom                                  2905 non-null   object        
 6   userPosition                              2414 non-null   object        
 7   weekOfExperiment                          2905 non-null   int64         
 8   dayOfWeek                                 2905 non-null   int64         
 9   startTime                     

# Feature Extraction

In [18]:
# Select features for response prediction
dfFeatresForResponse = dfCombinedAll[['uid',
    'act_Taking a Nap / Sleeping','act_Hygiene','act_Eating','act_Using Media','act_Social Interaction',
    'act_Returning from Outside / Other Rooms','act_Studying / Working','act_Others','act_House Chores',
    'act_Self Caring','act_Visiting Outside / Other Rooms','act_Resting',
    'homeType','userRoom','userPosition','speakerRoom','speakerPosition',
    'minute_bin','dayOfWeek','SHORT_INTERACTION_availability','LONG_INTERACTION_availability']].copy()

# Encode categorical columns using LabelEncoder
categorical_columns = ['homeType', 'userRoom', 'userPosition', 'speakerRoom', 'speakerPosition', 'minute_bin']
label_encoders = defaultdict(LabelEncoder)

# Apply label encoding to each categorical column
for col in categorical_columns:
    dfFeatresForResponse[col] = label_encoders[col].fit_transform(dfFeatresForResponse[col])

# Create a copy of the encoded data for further processing
encoded_data = dfFeatresForResponse.copy()


encoded_data.info()
encoded_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2905 entries, 0 to 2904
Data columns (total 22 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   uid                                       2905 non-null   int64
 1   act_Taking a Nap / Sleeping               2905 non-null   int64
 2   act_Hygiene                               2905 non-null   int64
 3   act_Eating                                2905 non-null   int64
 4   act_Using Media                           2905 non-null   int64
 5   act_Social Interaction                    2905 non-null   int64
 6   act_Returning from Outside / Other Rooms  2905 non-null   int64
 7   act_Studying / Working                    2905 non-null   int64
 8   act_Others                                2905 non-null   int64
 9   act_House Chores                          2905 non-null   int64
 10  act_Self Caring                           2905 non-null   in

Unnamed: 0,uid,act_Taking a Nap / Sleeping,act_Hygiene,act_Eating,act_Using Media,act_Social Interaction,act_Returning from Outside / Other Rooms,act_Studying / Working,act_Others,act_House Chores,...,act_Resting,homeType,userRoom,userPosition,speakerRoom,speakerPosition,minute_bin,dayOfWeek,SHORT_INTERACTION_availability,LONG_INTERACTION_availability
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,12,0,True,False
1,1,0,1,0,0,0,0,0,0,0,...,0,0,7,3,0,1,15,0,False,False
2,1,0,0,1,0,0,0,0,0,0,...,0,0,5,3,0,1,12,1,False,False
3,1,0,0,1,0,0,0,0,0,0,...,0,0,5,3,0,1,13,1,False,False
4,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,18,1,True,False


## Label: SHORT_INTERACTION

In [19]:
print(encoded_data['SHORT_INTERACTION_availability'].value_counts())

SHORT_INTERACTION_availability
True     2159
False     746
Name: count, dtype: int64


### Model Building and LOSO CV

In [20]:
# Prepare features (X) and target variables (y) for SHORT_INTERACTION
X = encoded_data.drop(columns=['SHORT_INTERACTION_availability', 'LONG_INTERACTION_availability', 'uid'], axis=1) # Drop target and unrelated columns
y = encoded_data['SHORT_INTERACTION_availability'] # Target variable
groups = encoded_data['uid'] # Group by user ID for Leave-One-Group-Out CV

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models1 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results1 = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)
    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

for model_name, metrics in results1.items():
    print(f'{model_name} - Accuracy: {metrics["Average Accuracy"]:.4f}, F1-Score: {metrics["Average F1-Score (macro)"]:.4f}')

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 2105, number of negative: 2105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4210, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2121, number of negative: 2121
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2089, number of negative: 2089
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4178, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2129, number of negative: 2129
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 4258, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2082, number of negative: 2082
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4164, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2103, number of negative: 2103
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4206, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2059, number of negative: 2059
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4118, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2057, number of negative: 2057
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4114, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1999, number of negative: 1999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 3998, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2121, number of negative: 2121
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2117, number of negative: 2117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 4234, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2112, number of negative: 2112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 4224, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 2137, number of negative: 2137
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 4274, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2016, number of negative: 2016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 4032, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6539332	total: 137ms	remaining: 13.5s
1:	learn: 0.6227200	total: 138ms	remaining: 6.76s
2:	learn: 0.5932275	total: 139ms	remaining: 4.5s
3:	learn: 0.5732934	total: 140ms	remaining: 3.37s
4:	learn: 0.5552400	total: 141ms	remaining: 2.69s
5:	learn: 0.5371145	total: 142ms	remaining: 2.23s
6:	learn: 0.5182639	total: 143ms	remaining: 1.91s
7:	learn: 0.5066205	total: 145ms	remaining: 1.66s
8:	learn: 0.4959752	total: 146ms	remaining: 1.47s
9:	learn: 0.4818779	total: 147ms	remaining: 1.32s
10:	learn: 0.4696720	total: 148ms	remaining: 1.2s
11:	learn: 0.4592730	total: 149ms	remaining: 1.09s
12:	learn: 0.4508690	total: 151ms	remaining: 1.01s
13:	learn: 0.4425535	total: 152ms	remaining: 934ms
14:	learn: 0.4380117	total: 153ms	remaining: 868ms
15:	learn: 0.4337342	total: 154ms	remaining: 810ms
16:	learn: 0.4280162	total: 156ms	remaining: 759ms
17:	learn: 0.4234805	total: 157ms	remaining: 713ms
18:	learn: 0.4192242	total: 158ms	remaining: 672ms
19:	learn: 0.4154809	total: 159ms	remaining

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


42:	learn: 0.3807496	total: 42.4ms	remaining: 56.1ms
43:	learn: 0.3800016	total: 43.2ms	remaining: 55ms
44:	learn: 0.3792734	total: 44.4ms	remaining: 54.2ms
45:	learn: 0.3783244	total: 45.6ms	remaining: 53.5ms
46:	learn: 0.3777711	total: 46.7ms	remaining: 52.6ms
47:	learn: 0.3770053	total: 47.6ms	remaining: 51.6ms
48:	learn: 0.3762343	total: 48.7ms	remaining: 50.7ms
49:	learn: 0.3757251	total: 49.8ms	remaining: 49.8ms
50:	learn: 0.3751133	total: 50.8ms	remaining: 48.8ms
51:	learn: 0.3744783	total: 51.8ms	remaining: 47.8ms
52:	learn: 0.3735704	total: 52.9ms	remaining: 46.9ms
53:	learn: 0.3730471	total: 53.8ms	remaining: 45.8ms
54:	learn: 0.3725274	total: 54.7ms	remaining: 44.7ms
55:	learn: 0.3713431	total: 55.7ms	remaining: 43.8ms
56:	learn: 0.3704797	total: 56.7ms	remaining: 42.8ms
57:	learn: 0.3697025	total: 57.8ms	remaining: 41.8ms
58:	learn: 0.3688608	total: 58.9ms	remaining: 40.9ms
59:	learn: 0.3683333	total: 59.9ms	remaining: 39.9ms
60:	learn: 0.3674575	total: 60.9ms	remaining: 38

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6582664	total: 1.27ms	remaining: 126ms
1:	learn: 0.6295567	total: 2ms	remaining: 97.9ms
2:	learn: 0.6063531	total: 3ms	remaining: 97ms
3:	learn: 0.5847677	total: 4.03ms	remaining: 96.8ms
4:	learn: 0.5685851	total: 4.75ms	remaining: 90.2ms
5:	learn: 0.5463750	total: 5.73ms	remaining: 89.8ms
6:	learn: 0.5296545	total: 6.73ms	remaining: 89.5ms
7:	learn: 0.5167777	total: 7.7ms	remaining: 88.5ms
8:	learn: 0.5089139	total: 8.25ms	remaining: 83.4ms
9:	learn: 0.4960397	total: 9.27ms	remaining: 83.4ms
10:	learn: 0.4836464	total: 10.3ms	remaining: 83.5ms
11:	learn: 0.4733291	total: 11.3ms	remaining: 83.2ms
12:	learn: 0.4665921	total: 12.3ms	remaining: 82.2ms
13:	learn: 0.4585363	total: 13.2ms	remaining: 81.1ms
14:	learn: 0.4525431	total: 14.1ms	remaining: 79.7ms
15:	learn: 0.4478714	total: 15ms	remaining: 78.8ms
16:	learn: 0.4433666	total: 16ms	remaining: 78.3ms
17:	learn: 0.4382152	total: 17ms	remaining: 77.4ms
18:	learn: 0.4345378	total: 18.1ms	remaining: 77ms
19:	learn: 0.4316387	

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


41:	learn: 0.3741211	total: 43.1ms	remaining: 59.6ms
42:	learn: 0.3732597	total: 44.2ms	remaining: 58.6ms
43:	learn: 0.3722983	total: 45.3ms	remaining: 57.7ms
44:	learn: 0.3716665	total: 46.4ms	remaining: 56.7ms
45:	learn: 0.3708101	total: 47.5ms	remaining: 55.8ms
46:	learn: 0.3701093	total: 48.6ms	remaining: 54.8ms
47:	learn: 0.3684754	total: 49.6ms	remaining: 53.7ms
48:	learn: 0.3680453	total: 50.7ms	remaining: 52.8ms
49:	learn: 0.3674522	total: 51.7ms	remaining: 51.7ms
50:	learn: 0.3668628	total: 53ms	remaining: 50.9ms
51:	learn: 0.3657349	total: 54.4ms	remaining: 50.2ms
52:	learn: 0.3654457	total: 55.5ms	remaining: 49.3ms
53:	learn: 0.3646304	total: 56.9ms	remaining: 48.5ms
54:	learn: 0.3638149	total: 58ms	remaining: 47.4ms
55:	learn: 0.3632565	total: 59.1ms	remaining: 46.4ms
56:	learn: 0.3627679	total: 60.1ms	remaining: 45.4ms
57:	learn: 0.3618387	total: 61.3ms	remaining: 44.4ms
58:	learn: 0.3611888	total: 62.5ms	remaining: 43.4ms
59:	learn: 0.3607790	total: 63.5ms	remaining: 42.3

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6530848	total: 1.43ms	remaining: 141ms
1:	learn: 0.6203906	total: 2.69ms	remaining: 132ms
2:	learn: 0.5933682	total: 3.75ms	remaining: 121ms
3:	learn: 0.5719741	total: 4.78ms	remaining: 115ms
4:	learn: 0.5537489	total: 5.75ms	remaining: 109ms
5:	learn: 0.5374459	total: 6.79ms	remaining: 106ms
6:	learn: 0.5178319	total: 7.78ms	remaining: 103ms
7:	learn: 0.5016669	total: 8.77ms	remaining: 101ms
8:	learn: 0.4903379	total: 9.73ms	remaining: 98.4ms
9:	learn: 0.4803597	total: 10.9ms	remaining: 97.9ms
10:	learn: 0.4690697	total: 12.1ms	remaining: 97.8ms
11:	learn: 0.4587176	total: 13.6ms	remaining: 99.4ms
12:	learn: 0.4499876	total: 14.7ms	remaining: 98.5ms
13:	learn: 0.4426515	total: 15.9ms	remaining: 97.8ms
14:	learn: 0.4376302	total: 17.2ms	remaining: 97.3ms
15:	learn: 0.4332608	total: 18.2ms	remaining: 95.7ms
16:	learn: 0.4272659	total: 19.2ms	remaining: 93.9ms
17:	learn: 0.4225786	total: 20.4ms	remaining: 92.8ms
18:	learn: 0.4185935	total: 21.4ms	remaining: 91.4ms
19:	learn: 

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


35:	learn: 0.3734989	total: 38ms	remaining: 67.5ms
36:	learn: 0.3727051	total: 38.9ms	remaining: 66.2ms
37:	learn: 0.3711941	total: 39.8ms	remaining: 65ms
38:	learn: 0.3699197	total: 41ms	remaining: 64.1ms
39:	learn: 0.3687872	total: 42ms	remaining: 63ms
40:	learn: 0.3667388	total: 43.2ms	remaining: 62.2ms
41:	learn: 0.3657005	total: 44.4ms	remaining: 61.3ms
42:	learn: 0.3648858	total: 45.4ms	remaining: 60.2ms
43:	learn: 0.3639370	total: 46.7ms	remaining: 59.5ms
44:	learn: 0.3630769	total: 47.9ms	remaining: 58.5ms
45:	learn: 0.3620344	total: 49.1ms	remaining: 57.6ms
46:	learn: 0.3609263	total: 50.3ms	remaining: 56.7ms
47:	learn: 0.3596704	total: 51.4ms	remaining: 55.7ms
48:	learn: 0.3584667	total: 52.6ms	remaining: 54.7ms
49:	learn: 0.3578252	total: 53.6ms	remaining: 53.6ms
50:	learn: 0.3560551	total: 54.9ms	remaining: 52.8ms
51:	learn: 0.3551079	total: 55.9ms	remaining: 51.6ms
52:	learn: 0.3541776	total: 57.2ms	remaining: 50.7ms
53:	learn: 0.3534132	total: 58.3ms	remaining: 49.7ms
54:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6550505	total: 1.28ms	remaining: 126ms
1:	learn: 0.6236391	total: 2.4ms	remaining: 118ms
2:	learn: 0.5910439	total: 3.56ms	remaining: 115ms
3:	learn: 0.5710563	total: 4.26ms	remaining: 102ms
4:	learn: 0.5472948	total: 5.28ms	remaining: 100ms
5:	learn: 0.5315300	total: 6.32ms	remaining: 98.9ms
6:	learn: 0.5170435	total: 7.37ms	remaining: 97.9ms
7:	learn: 0.5050407	total: 8.49ms	remaining: 97.6ms
8:	learn: 0.4910776	total: 9.6ms	remaining: 97ms
9:	learn: 0.4776611	total: 10.6ms	remaining: 95.6ms
10:	learn: 0.4667459	total: 11.7ms	remaining: 94.7ms
11:	learn: 0.4566441	total: 12.7ms	remaining: 93.4ms
12:	learn: 0.4483451	total: 13.7ms	remaining: 92ms
13:	learn: 0.4432039	total: 14.9ms	remaining: 91.6ms
14:	learn: 0.4372644	total: 16ms	remaining: 90.7ms
15:	learn: 0.4323569	total: 17ms	remaining: 89.3ms
16:	learn: 0.4279778	total: 18.1ms	remaining: 88.3ms
17:	learn: 0.4225745	total: 19.1ms	remaining: 87ms
18:	learn: 0.4181627	total: 20.3ms	remaining: 86.5ms
19:	learn: 0.4145580

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


47:	learn: 0.3786332	total: 49.3ms	remaining: 53.4ms
48:	learn: 0.3779620	total: 50.5ms	remaining: 52.6ms
49:	learn: 0.3773405	total: 51.7ms	remaining: 51.7ms
50:	learn: 0.3763927	total: 52.7ms	remaining: 50.6ms
51:	learn: 0.3755933	total: 53.7ms	remaining: 49.6ms
52:	learn: 0.3743874	total: 54.8ms	remaining: 48.6ms
53:	learn: 0.3738749	total: 55.8ms	remaining: 47.6ms
54:	learn: 0.3727551	total: 56.9ms	remaining: 46.6ms
55:	learn: 0.3713705	total: 58ms	remaining: 45.6ms
56:	learn: 0.3703946	total: 59ms	remaining: 44.5ms
57:	learn: 0.3694684	total: 60ms	remaining: 43.5ms
58:	learn: 0.3687105	total: 61ms	remaining: 42.4ms
59:	learn: 0.3681059	total: 61.9ms	remaining: 41.3ms
60:	learn: 0.3674364	total: 63.1ms	remaining: 40.4ms
61:	learn: 0.3659265	total: 64.2ms	remaining: 39.4ms
62:	learn: 0.3653904	total: 65.3ms	remaining: 38.3ms
63:	learn: 0.3649149	total: 66.4ms	remaining: 37.3ms
64:	learn: 0.3642418	total: 67.4ms	remaining: 36.3ms
65:	learn: 0.3633166	total: 68.4ms	remaining: 35.3ms
6

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


2:	learn: 0.5913446	total: 2.83ms	remaining: 91.5ms
3:	learn: 0.5704587	total: 4.05ms	remaining: 97.3ms
4:	learn: 0.5509793	total: 5.19ms	remaining: 98.6ms
5:	learn: 0.5361366	total: 5.95ms	remaining: 93.2ms
6:	learn: 0.5233895	total: 6.96ms	remaining: 92.5ms
7:	learn: 0.5066301	total: 7.83ms	remaining: 90ms
8:	learn: 0.4936793	total: 8.78ms	remaining: 88.7ms
9:	learn: 0.4844891	total: 9.84ms	remaining: 88.5ms
10:	learn: 0.4760570	total: 10.9ms	remaining: 88ms
11:	learn: 0.4655230	total: 11.9ms	remaining: 87.2ms
12:	learn: 0.4588233	total: 12.9ms	remaining: 86.3ms
13:	learn: 0.4543369	total: 13.9ms	remaining: 85.3ms
14:	learn: 0.4470219	total: 15.1ms	remaining: 85.5ms
15:	learn: 0.4419483	total: 16.2ms	remaining: 85.2ms
16:	learn: 0.4364817	total: 17.3ms	remaining: 84.5ms
17:	learn: 0.4315985	total: 18.5ms	remaining: 84.3ms
18:	learn: 0.4278695	total: 19.4ms	remaining: 82.9ms
19:	learn: 0.4226505	total: 20.4ms	remaining: 81.8ms
20:	learn: 0.4189230	total: 21.4ms	remaining: 80.5ms
21:	l

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


53:	learn: 0.3646451	total: 55.5ms	remaining: 47.2ms
54:	learn: 0.3638519	total: 56.6ms	remaining: 46.3ms
55:	learn: 0.3625947	total: 57.7ms	remaining: 45.3ms
56:	learn: 0.3620639	total: 59ms	remaining: 44.5ms
57:	learn: 0.3613018	total: 60.2ms	remaining: 43.6ms
58:	learn: 0.3606304	total: 62ms	remaining: 43.1ms
59:	learn: 0.3601951	total: 63.4ms	remaining: 42.3ms
60:	learn: 0.3592124	total: 64.9ms	remaining: 41.5ms
61:	learn: 0.3581733	total: 66.1ms	remaining: 40.5ms
62:	learn: 0.3577875	total: 67.4ms	remaining: 39.6ms
63:	learn: 0.3569442	total: 68.7ms	remaining: 38.6ms
64:	learn: 0.3560876	total: 70ms	remaining: 37.7ms
65:	learn: 0.3555449	total: 71.5ms	remaining: 36.8ms
66:	learn: 0.3549912	total: 73ms	remaining: 35.9ms
67:	learn: 0.3544469	total: 74.2ms	remaining: 34.9ms
68:	learn: 0.3535836	total: 75.7ms	remaining: 34ms
69:	learn: 0.3527239	total: 77.3ms	remaining: 33.1ms
70:	learn: 0.3521517	total: 78.7ms	remaining: 32.1ms
71:	learn: 0.3516345	total: 80.1ms	remaining: 31.1ms
72:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6518829	total: 1.27ms	remaining: 126ms
1:	learn: 0.6228948	total: 2.41ms	remaining: 118ms
2:	learn: 0.5991231	total: 3.3ms	remaining: 107ms
3:	learn: 0.5776204	total: 4.33ms	remaining: 104ms
4:	learn: 0.5531437	total: 5.31ms	remaining: 101ms
5:	learn: 0.5310815	total: 6.49ms	remaining: 102ms
6:	learn: 0.5160833	total: 7.58ms	remaining: 101ms
7:	learn: 0.5062167	total: 8.87ms	remaining: 102ms
8:	learn: 0.4936796	total: 9.84ms	remaining: 99.5ms
9:	learn: 0.4837135	total: 11.1ms	remaining: 99.6ms
10:	learn: 0.4724978	total: 12.3ms	remaining: 99.4ms
11:	learn: 0.4620004	total: 13.9ms	remaining: 102ms
12:	learn: 0.4534828	total: 15ms	remaining: 100ms
13:	learn: 0.4452337	total: 16.1ms	remaining: 98.8ms
14:	learn: 0.4367254	total: 17.2ms	remaining: 97.5ms
15:	learn: 0.4301043	total: 18.2ms	remaining: 95.7ms
16:	learn: 0.4252417	total: 19.2ms	remaining: 93.9ms
17:	learn: 0.4203634	total: 20.3ms	remaining: 92.5ms
18:	learn: 0.4171953	total: 21.3ms	remaining: 90.9ms
19:	learn: 0.414

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


39:	learn: 0.3750932	total: 42.8ms	remaining: 64.3ms
40:	learn: 0.3737153	total: 44ms	remaining: 63.4ms
41:	learn: 0.3722787	total: 45.1ms	remaining: 62.3ms
42:	learn: 0.3713993	total: 46ms	remaining: 61ms
43:	learn: 0.3702737	total: 47.1ms	remaining: 59.9ms
44:	learn: 0.3684240	total: 48.1ms	remaining: 58.8ms
45:	learn: 0.3676897	total: 49.2ms	remaining: 57.8ms
46:	learn: 0.3663752	total: 50.4ms	remaining: 56.9ms
47:	learn: 0.3655363	total: 51.5ms	remaining: 55.8ms
48:	learn: 0.3648715	total: 52.5ms	remaining: 54.7ms
49:	learn: 0.3641157	total: 53.7ms	remaining: 53.7ms
50:	learn: 0.3635624	total: 54.8ms	remaining: 52.6ms
51:	learn: 0.3627848	total: 56ms	remaining: 51.7ms
52:	learn: 0.3620592	total: 57.2ms	remaining: 50.8ms
53:	learn: 0.3615718	total: 58.2ms	remaining: 49.6ms
54:	learn: 0.3608698	total: 59.2ms	remaining: 48.5ms
55:	learn: 0.3603348	total: 60.2ms	remaining: 47.3ms
56:	learn: 0.3596905	total: 61.3ms	remaining: 46.2ms
57:	learn: 0.3593098	total: 62.2ms	remaining: 45ms
58:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


87:	learn: 0.3413306	total: 98.7ms	remaining: 13.5ms
88:	learn: 0.3409580	total: 99.9ms	remaining: 12.3ms
89:	learn: 0.3406490	total: 101ms	remaining: 11.2ms
90:	learn: 0.3395463	total: 102ms	remaining: 10.1ms
91:	learn: 0.3391242	total: 103ms	remaining: 8.97ms
92:	learn: 0.3386774	total: 104ms	remaining: 7.86ms
93:	learn: 0.3383173	total: 105ms	remaining: 6.73ms
94:	learn: 0.3379814	total: 106ms	remaining: 5.6ms
95:	learn: 0.3374867	total: 108ms	remaining: 4.48ms
96:	learn: 0.3369480	total: 109ms	remaining: 3.36ms
97:	learn: 0.3360569	total: 110ms	remaining: 2.24ms
98:	learn: 0.3349627	total: 111ms	remaining: 1.12ms
99:	learn: 0.3340554	total: 112ms	remaining: 0us
0:	learn: 0.6540322	total: 1.08ms	remaining: 107ms
1:	learn: 0.6231599	total: 2.07ms	remaining: 101ms
2:	learn: 0.5959686	total: 3.5ms	remaining: 113ms
3:	learn: 0.5739461	total: 4.57ms	remaining: 110ms
4:	learn: 0.5521638	total: 5.74ms	remaining: 109ms
5:	learn: 0.5363149	total: 6.83ms	remaining: 107ms
6:	learn: 0.5207553	t

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6542658	total: 1.33ms	remaining: 132ms
1:	learn: 0.6198890	total: 2.45ms	remaining: 120ms
2:	learn: 0.5933189	total: 3.49ms	remaining: 113ms
3:	learn: 0.5698573	total: 4.54ms	remaining: 109ms
4:	learn: 0.5504963	total: 5.46ms	remaining: 104ms
5:	learn: 0.5331454	total: 6.41ms	remaining: 100ms
6:	learn: 0.5172612	total: 7.28ms	remaining: 96.7ms
7:	learn: 0.5019410	total: 8.37ms	remaining: 96.3ms
8:	learn: 0.4888367	total: 9.33ms	remaining: 94.3ms
9:	learn: 0.4751901	total: 10.2ms	remaining: 91.8ms
10:	learn: 0.4667036	total: 11.2ms	remaining: 90.4ms
11:	learn: 0.4554429	total: 12.2ms	remaining: 89.7ms
12:	learn: 0.4471140	total: 13.1ms	remaining: 87.7ms
13:	learn: 0.4396868	total: 14.2ms	remaining: 87.1ms
14:	learn: 0.4320716	total: 15.5ms	remaining: 88ms
15:	learn: 0.4273553	total: 16.9ms	remaining: 88.6ms
16:	learn: 0.4210620	total: 17.8ms	remaining: 87.1ms
17:	learn: 0.4178642	total: 19ms	remaining: 86.6ms
18:	learn: 0.4125859	total: 20ms	remaining: 85.1ms
19:	learn: 0.40

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

Random Forest - Accuracy: 0.8321, F1-Score: 0.7373
Gradient Boosting - Accuracy: 0.8128, F1-Score: 0.7082
XGBoost - Accuracy: 0.8209, F1-Score: 0.7183
LightGBM - Accuracy: 0.8229, F1-Score: 0.7148
CatBoost - Accuracy: 0.8250, F1-Score: 0.7276
SVM - Accuracy: 0.7653, F1-Score: 0.6824
Dummy - Accuracy: 0.5064, F1-Score: 0.4413


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


### K-Fold CV

In [21]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results1 = {}

# Loop over each model
for model_name, model1 in models1.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model1.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model1.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results1[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

print()
for model_name, metrics in results1.items():
    print(f'{model_name} - Accuracy: {metrics["Average Accuracy"]:.4f}, F1-Score: {metrics["Average F1-Score (macro)"]:.4f}')

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1717, number of negative: 1717
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 3434, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1779, number of negative: 1779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 3558, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


7:	learn: 0.5047565	total: 7.97ms	remaining: 91.7ms
8:	learn: 0.4926700	total: 9.14ms	remaining: 92.4ms
9:	learn: 0.4837468	total: 10.3ms	remaining: 92.4ms
10:	learn: 0.4752106	total: 11.4ms	remaining: 92.5ms
11:	learn: 0.4658290	total: 12.6ms	remaining: 92.2ms
12:	learn: 0.4574750	total: 13.6ms	remaining: 91ms
13:	learn: 0.4475692	total: 14.6ms	remaining: 89.4ms
14:	learn: 0.4387164	total: 15.6ms	remaining: 88.5ms
15:	learn: 0.4316335	total: 16.6ms	remaining: 86.9ms
16:	learn: 0.4249580	total: 17.6ms	remaining: 85.8ms
17:	learn: 0.4205795	total: 18.7ms	remaining: 85.2ms
18:	learn: 0.4162052	total: 19.8ms	remaining: 84.3ms
19:	learn: 0.4121890	total: 20.9ms	remaining: 83.4ms
20:	learn: 0.4085819	total: 21.9ms	remaining: 82.5ms
21:	learn: 0.4048234	total: 23.1ms	remaining: 82ms
22:	learn: 0.4017888	total: 24.4ms	remaining: 81.8ms
23:	learn: 0.3984529	total: 26ms	remaining: 82.3ms
24:	learn: 0.3962894	total: 27.1ms	remaining: 81.4ms
25:	learn: 0.3941453	total: 28.4ms	remaining: 80.9ms
26

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


58:	learn: 0.3434172	total: 52.6ms	remaining: 36.6ms
59:	learn: 0.3426292	total: 53.7ms	remaining: 35.8ms
60:	learn: 0.3417498	total: 54.7ms	remaining: 35ms
61:	learn: 0.3412897	total: 55.5ms	remaining: 34ms
62:	learn: 0.3405638	total: 56.4ms	remaining: 33.1ms
63:	learn: 0.3401523	total: 57.3ms	remaining: 32.3ms
64:	learn: 0.3397454	total: 58.2ms	remaining: 31.3ms
65:	learn: 0.3388610	total: 59.1ms	remaining: 30.4ms
66:	learn: 0.3383522	total: 59.9ms	remaining: 29.5ms
67:	learn: 0.3373024	total: 61ms	remaining: 28.7ms
68:	learn: 0.3368619	total: 62.1ms	remaining: 27.9ms
69:	learn: 0.3360874	total: 63.1ms	remaining: 27ms
70:	learn: 0.3353999	total: 63.9ms	remaining: 26.1ms
71:	learn: 0.3351321	total: 64.5ms	remaining: 25.1ms
72:	learn: 0.3347625	total: 65.3ms	remaining: 24.2ms
73:	learn: 0.3341967	total: 66.2ms	remaining: 23.3ms
74:	learn: 0.3335832	total: 67.2ms	remaining: 22.4ms
75:	learn: 0.3328408	total: 68.2ms	remaining: 21.5ms
76:	learn: 0.3320705	total: 69ms	remaining: 20.6ms
77:

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6579363	total: 1.18ms	remaining: 117ms
1:	learn: 0.6290010	total: 1.98ms	remaining: 96.9ms
2:	learn: 0.6004149	total: 2.85ms	remaining: 92.3ms
3:	learn: 0.5779761	total: 3.8ms	remaining: 91.3ms
4:	learn: 0.5578175	total: 4.69ms	remaining: 89.2ms
5:	learn: 0.5423065	total: 5.54ms	remaining: 86.7ms
6:	learn: 0.5227655	total: 6.38ms	remaining: 84.8ms
7:	learn: 0.5101880	total: 7.3ms	remaining: 83.9ms
8:	learn: 0.4995132	total: 8.21ms	remaining: 83ms
9:	learn: 0.4858153	total: 9.23ms	remaining: 83.1ms
10:	learn: 0.4755228	total: 10.1ms	remaining: 81.8ms
11:	learn: 0.4652177	total: 11ms	remaining: 80.9ms
12:	learn: 0.4562008	total: 12ms	remaining: 80.1ms
13:	learn: 0.4507826	total: 13ms	remaining: 79.7ms
14:	learn: 0.4438381	total: 13.7ms	remaining: 77.7ms
15:	learn: 0.4386896	total: 14.6ms	remaining: 76.6ms
16:	learn: 0.4329875	total: 15.4ms	remaining: 75.3ms
17:	learn: 0.4289006	total: 16.4ms	remaining: 74.8ms
18:	learn: 0.4249671	total: 17.4ms	remaining: 74.1ms
19:	learn: 0.4

  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())



Random Forest - Accuracy: 0.8251, F1-Score: 0.7897
Gradient Boosting - Accuracy: 0.8131, F1-Score: 0.7681
XGBoost - Accuracy: 0.8134, F1-Score: 0.7702
LightGBM - Accuracy: 0.8138, F1-Score: 0.7715
CatBoost - Accuracy: 0.8127, F1-Score: 0.7732
SVM - Accuracy: 0.7711, F1-Score: 0.7345
Dummy - Accuracy: 0.4833, F1-Score: 0.4565


  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())
  model1.fit(X_train_oversampled, y_train_oversampled.ravel())


## Label: LONG_INTERACTION

In [22]:
print(encoded_data['LONG_INTERACTION_availability'].value_counts())

LONG_INTERACTION_availability
False    1471
True     1434
Name: count, dtype: int64


### Model building and LOSO CV

In [23]:
# Prepare features (X) and target variable (y) for predicting LONG_INTERACTION
X = encoded_data.drop(columns=['LONG_INTERACTION_availability', 'SHORT_INTERACTION_availability','uid'], axis=1)
y = encoded_data['LONG_INTERACTION_availability']
groups = encoded_data['uid']

# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Initialize SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42)

# Define models to evaluate
models2 = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', cat_features=[0], random_seed=42),
    'SVM': SVC(random_state=42),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42)
}

results2 = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in Leave-One-Group-Out cross-validation
    for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

print()
for model_name, metrics in results2.items():
    print(f'{model_name} - Accuracy: {metrics["Average Accuracy"]:.4f}, F1-Score: {metrics["Average F1-Score (macro)"]:.4f}')

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1428, number of negative: 1428
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 2856, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1447, number of negative: 1447
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 2894, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1367, number of negative: 1367
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2734, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1446, number of negative: 1446
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 2892, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1449, number of negative: 1449
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2898, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1461, number of negative: 1461
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2922, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1402, number of negative: 1402
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 2804, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1403, number of negative: 1403
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2806, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1409, number of negative: 1409
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2818, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1408, number of negative: 1408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 2816, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1420, number of negative: 1420
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1448, number of negative: 1448
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2896, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1426, number of negative: 1426
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2852, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1442, number of negative: 1442
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 2884, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1426, number of negative: 1426
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2852, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1401, number of negative: 1401
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2802, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1427, number of negative: 1427
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 2854, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1447, number of negative: 1447
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 2894, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6732736	total: 1.06ms	remaining: 105ms
1:	learn: 0.6600556	total: 2.12ms	remaining: 104ms
2:	learn: 0.6460058	total: 3.07ms	remaining: 99.2ms
3:	learn: 0.6334347	total: 4.04ms	remaining: 97ms
4:	learn: 0.6221658	total: 5.08ms	remaining: 96.4ms
5:	learn: 0.6149543	total: 6.12ms	remaining: 95.9ms
6:	learn: 0.6058536	total: 7.19ms	remaining: 95.5ms
7:	learn: 0.5982074	total: 8.13ms	remaining: 93.5ms
8:	learn: 0.5921289	total: 9.05ms	remaining: 91.5ms
9:	learn: 0.5864431	total: 9.87ms	remaining: 88.8ms
10:	learn: 0.5802393	total: 10.7ms	remaining: 86.6ms
11:	learn: 0.5768071	total: 11.3ms	remaining: 83ms
12:	learn: 0.5731137	total: 12.1ms	remaining: 81.1ms
13:	learn: 0.5682223	total: 13ms	remaining: 79.8ms
14:	learn: 0.5649563	total: 14ms	remaining: 79.1ms
15:	learn: 0.5620625	total: 14.8ms	remaining: 77.5ms
16:	learn: 0.5590504	total: 15.7ms	remaining: 76.5ms
17:	learn: 0.5560078	total: 16.6ms	remaining: 75.5ms
18:	learn: 0.5534920	total: 17.4ms	remaining: 74ms
19:	learn: 0.55

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


58:	learn: 0.5140736	total: 49.2ms	remaining: 34.2ms
59:	learn: 0.5135503	total: 50.1ms	remaining: 33.4ms
60:	learn: 0.5124588	total: 51.1ms	remaining: 32.6ms
61:	learn: 0.5120246	total: 51.9ms	remaining: 31.8ms
62:	learn: 0.5115202	total: 52.7ms	remaining: 31ms
63:	learn: 0.5112285	total: 53.5ms	remaining: 30.1ms
64:	learn: 0.5102717	total: 54.3ms	remaining: 29.2ms
65:	learn: 0.5096763	total: 55.1ms	remaining: 28.4ms
66:	learn: 0.5093965	total: 55.9ms	remaining: 27.5ms
67:	learn: 0.5090117	total: 56.7ms	remaining: 26.7ms
68:	learn: 0.5086832	total: 57.5ms	remaining: 25.8ms
69:	learn: 0.5081965	total: 58.4ms	remaining: 25ms
70:	learn: 0.5078398	total: 59.2ms	remaining: 24.2ms
71:	learn: 0.5075060	total: 60ms	remaining: 23.3ms
72:	learn: 0.5068997	total: 60.8ms	remaining: 22.5ms
73:	learn: 0.5064369	total: 61.5ms	remaining: 21.6ms
74:	learn: 0.5062015	total: 62.3ms	remaining: 20.8ms
75:	learn: 0.5055216	total: 63.1ms	remaining: 19.9ms
76:	learn: 0.5047771	total: 64ms	remaining: 19.1ms
7

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6752536	total: 906us	remaining: 89.8ms
1:	learn: 0.6564110	total: 1.84ms	remaining: 90.4ms
2:	learn: 0.6413954	total: 2.78ms	remaining: 89.9ms
3:	learn: 0.6280239	total: 3.66ms	remaining: 87.9ms
4:	learn: 0.6175496	total: 4.56ms	remaining: 86.7ms
5:	learn: 0.6086112	total: 5.44ms	remaining: 85.2ms
6:	learn: 0.6013492	total: 6.26ms	remaining: 83.2ms
7:	learn: 0.5945288	total: 7.01ms	remaining: 80.7ms
8:	learn: 0.5865160	total: 7.77ms	remaining: 78.6ms
9:	learn: 0.5806374	total: 8.71ms	remaining: 78.4ms
10:	learn: 0.5763626	total: 9.5ms	remaining: 76.8ms
11:	learn: 0.5707648	total: 10.3ms	remaining: 75.3ms
12:	learn: 0.5666754	total: 11.1ms	remaining: 74.1ms
13:	learn: 0.5615618	total: 11.9ms	remaining: 73.1ms
14:	learn: 0.5582197	total: 12.8ms	remaining: 72.8ms
15:	learn: 0.5546155	total: 13.7ms	remaining: 72ms
16:	learn: 0.5515972	total: 14.5ms	remaining: 70.8ms
17:	learn: 0.5491549	total: 15.3ms	remaining: 69.6ms
18:	learn: 0.5450136	total: 16.3ms	remaining: 69.3ms
19:	lea

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6755830	total: 960us	remaining: 95.1ms
1:	learn: 0.6595141	total: 1.86ms	remaining: 91.1ms
2:	learn: 0.6448277	total: 2.6ms	remaining: 84.2ms
3:	learn: 0.6307711	total: 3.37ms	remaining: 80.9ms
4:	learn: 0.6199887	total: 4.18ms	remaining: 79.3ms
5:	learn: 0.6117225	total: 4.93ms	remaining: 77.2ms
6:	learn: 0.6033415	total: 5.65ms	remaining: 75.1ms
7:	learn: 0.5955739	total: 6.43ms	remaining: 74ms
8:	learn: 0.5894047	total: 7.22ms	remaining: 73ms
9:	learn: 0.5826759	total: 8.05ms	remaining: 72.5ms
10:	learn: 0.5772361	total: 8.93ms	remaining: 72.2ms
11:	learn: 0.5725233	total: 9.73ms	remaining: 71.4ms
12:	learn: 0.5692325	total: 10.5ms	remaining: 70.4ms
13:	learn: 0.5665515	total: 11.5ms	remaining: 70.9ms
14:	learn: 0.5635586	total: 12.4ms	remaining: 70.3ms
15:	learn: 0.5598536	total: 13.2ms	remaining: 69.4ms
16:	learn: 0.5562456	total: 14ms	remaining: 68.5ms
17:	learn: 0.5545188	total: 14.8ms	remaining: 67.3ms
18:	learn: 0.5521596	total: 15.5ms	remaining: 66.2ms
19:	learn: 

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


35:	learn: 0.5245592	total: 31.8ms	remaining: 56.5ms
36:	learn: 0.5241210	total: 32.7ms	remaining: 55.6ms
37:	learn: 0.5224270	total: 34ms	remaining: 55.4ms
38:	learn: 0.5215101	total: 35ms	remaining: 54.8ms
39:	learn: 0.5210796	total: 36.2ms	remaining: 54.4ms
40:	learn: 0.5204936	total: 37.4ms	remaining: 53.9ms
41:	learn: 0.5193116	total: 38.4ms	remaining: 53ms
42:	learn: 0.5189201	total: 39.2ms	remaining: 52ms
43:	learn: 0.5182188	total: 40.1ms	remaining: 51ms
44:	learn: 0.5174598	total: 41ms	remaining: 50.2ms
45:	learn: 0.5164153	total: 41.9ms	remaining: 49.1ms
46:	learn: 0.5154628	total: 42.6ms	remaining: 48.1ms
47:	learn: 0.5146149	total: 43.5ms	remaining: 47.1ms
48:	learn: 0.5144039	total: 44.3ms	remaining: 46.1ms
49:	learn: 0.5138390	total: 45.1ms	remaining: 45.1ms
50:	learn: 0.5132746	total: 46.1ms	remaining: 44.3ms
51:	learn: 0.5129388	total: 46.9ms	remaining: 43.3ms
52:	learn: 0.5120230	total: 47.7ms	remaining: 42.3ms
53:	learn: 0.5115587	total: 48.5ms	remaining: 41.4ms
54:	l

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6736425	total: 1.12ms	remaining: 111ms
1:	learn: 0.6578399	total: 1.87ms	remaining: 91.6ms
2:	learn: 0.6449716	total: 2.7ms	remaining: 87.2ms
3:	learn: 0.6311271	total: 3.44ms	remaining: 82.5ms
4:	learn: 0.6201360	total: 4.24ms	remaining: 80.6ms
5:	learn: 0.6116309	total: 4.98ms	remaining: 78ms
6:	learn: 0.6012942	total: 5.77ms	remaining: 76.7ms
7:	learn: 0.5936300	total: 6.51ms	remaining: 74.8ms
8:	learn: 0.5871513	total: 7.28ms	remaining: 73.7ms
9:	learn: 0.5813983	total: 8.09ms	remaining: 72.8ms
10:	learn: 0.5753365	total: 8.94ms	remaining: 72.3ms
11:	learn: 0.5699506	total: 9.88ms	remaining: 72.5ms
12:	learn: 0.5661557	total: 10.7ms	remaining: 71.4ms
13:	learn: 0.5637447	total: 11.6ms	remaining: 71.1ms
14:	learn: 0.5604563	total: 12.3ms	remaining: 69.9ms
15:	learn: 0.5577090	total: 13.1ms	remaining: 69ms
16:	learn: 0.5539414	total: 14ms	remaining: 68.3ms
17:	learn: 0.5513956	total: 14.8ms	remaining: 67.5ms
18:	learn: 0.5493019	total: 15.7ms	remaining: 66.9ms
19:	learn: 

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


49:	learn: 0.5181774	total: 47.4ms	remaining: 47.4ms
50:	learn: 0.5176844	total: 48.3ms	remaining: 46.4ms
51:	learn: 0.5171259	total: 49.3ms	remaining: 45.5ms
52:	learn: 0.5165083	total: 50.2ms	remaining: 44.6ms
53:	learn: 0.5157659	total: 51.2ms	remaining: 43.6ms
54:	learn: 0.5150454	total: 52.1ms	remaining: 42.6ms
55:	learn: 0.5144979	total: 53.3ms	remaining: 41.9ms
56:	learn: 0.5141801	total: 54.4ms	remaining: 41ms
57:	learn: 0.5137414	total: 55.3ms	remaining: 40ms
58:	learn: 0.5133401	total: 56.3ms	remaining: 39.1ms
59:	learn: 0.5125669	total: 57.7ms	remaining: 38.5ms
60:	learn: 0.5118053	total: 59.2ms	remaining: 37.8ms
61:	learn: 0.5111565	total: 60.5ms	remaining: 37.1ms
62:	learn: 0.5107311	total: 61.5ms	remaining: 36.1ms
63:	learn: 0.5100178	total: 62.4ms	remaining: 35.1ms
64:	learn: 0.5094512	total: 63.3ms	remaining: 34.1ms
65:	learn: 0.5090745	total: 64.3ms	remaining: 33.1ms
66:	learn: 0.5083753	total: 65.3ms	remaining: 32.2ms
67:	learn: 0.5080294	total: 66.7ms	remaining: 31.4

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6729883	total: 957us	remaining: 94.8ms
1:	learn: 0.6547568	total: 1.82ms	remaining: 89ms
2:	learn: 0.6408192	total: 2.65ms	remaining: 85.8ms
3:	learn: 0.6277582	total: 3.78ms	remaining: 90.7ms
4:	learn: 0.6184069	total: 4.63ms	remaining: 88ms
5:	learn: 0.6094812	total: 5.45ms	remaining: 85.4ms
6:	learn: 0.5991737	total: 6.29ms	remaining: 83.6ms
7:	learn: 0.5917821	total: 7.17ms	remaining: 82.4ms
8:	learn: 0.5864350	total: 7.94ms	remaining: 80.3ms
9:	learn: 0.5799255	total: 8.89ms	remaining: 80.1ms
10:	learn: 0.5744835	total: 9.77ms	remaining: 79ms
11:	learn: 0.5707108	total: 10.7ms	remaining: 78.4ms
12:	learn: 0.5668970	total: 11.7ms	remaining: 78.5ms
13:	learn: 0.5639691	total: 12.6ms	remaining: 77.6ms
14:	learn: 0.5611036	total: 13.5ms	remaining: 76.4ms
15:	learn: 0.5586272	total: 14.3ms	remaining: 75.2ms
16:	learn: 0.5546703	total: 15ms	remaining: 73.4ms
17:	learn: 0.5529556	total: 15.8ms	remaining: 72.1ms
18:	learn: 0.5514666	total: 16.6ms	remaining: 70.9ms
19:	learn: 0

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


63:	learn: 0.5117358	total: 59.6ms	remaining: 33.5ms
64:	learn: 0.5113619	total: 60.4ms	remaining: 32.5ms
65:	learn: 0.5109137	total: 61.3ms	remaining: 31.6ms
66:	learn: 0.5103779	total: 62.2ms	remaining: 30.6ms
67:	learn: 0.5101935	total: 63ms	remaining: 29.7ms
68:	learn: 0.5097250	total: 63.9ms	remaining: 28.7ms
69:	learn: 0.5092978	total: 64.7ms	remaining: 27.7ms
70:	learn: 0.5090135	total: 65.5ms	remaining: 26.8ms
71:	learn: 0.5086586	total: 66.4ms	remaining: 25.8ms
72:	learn: 0.5080593	total: 67.1ms	remaining: 24.8ms
73:	learn: 0.5072373	total: 68ms	remaining: 23.9ms
74:	learn: 0.5064737	total: 68.9ms	remaining: 23ms
75:	learn: 0.5060121	total: 69.6ms	remaining: 22ms
76:	learn: 0.5055593	total: 70.5ms	remaining: 21.1ms
77:	learn: 0.5051374	total: 71.4ms	remaining: 20.1ms
78:	learn: 0.5048400	total: 72.3ms	remaining: 19.2ms
79:	learn: 0.5043473	total: 73.2ms	remaining: 18.3ms
80:	learn: 0.5038081	total: 74ms	remaining: 17.4ms
81:	learn: 0.5030898	total: 74.8ms	remaining: 16.4ms
82:

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6742434	total: 1.06ms	remaining: 105ms
1:	learn: 0.6579488	total: 2.14ms	remaining: 105ms
2:	learn: 0.6436572	total: 3.08ms	remaining: 99.4ms
3:	learn: 0.6309117	total: 4.01ms	remaining: 96.2ms
4:	learn: 0.6212527	total: 4.81ms	remaining: 91.5ms
5:	learn: 0.6121543	total: 5.67ms	remaining: 88.9ms
6:	learn: 0.6029182	total: 6.43ms	remaining: 85.5ms
7:	learn: 0.5956557	total: 7.26ms	remaining: 83.5ms
8:	learn: 0.5899261	total: 8.12ms	remaining: 82.1ms
9:	learn: 0.5825455	total: 8.96ms	remaining: 80.7ms
10:	learn: 0.5769312	total: 9.74ms	remaining: 78.8ms
11:	learn: 0.5719078	total: 10.6ms	remaining: 77.6ms
12:	learn: 0.5681077	total: 11.5ms	remaining: 76.7ms
13:	learn: 0.5655139	total: 12.4ms	remaining: 76.4ms
14:	learn: 0.5623192	total: 13.3ms	remaining: 75.2ms
15:	learn: 0.5588682	total: 14.1ms	remaining: 74ms
16:	learn: 0.5562341	total: 15ms	remaining: 73.1ms
17:	learn: 0.5536591	total: 15.9ms	remaining: 72.2ms
18:	learn: 0.5518323	total: 16.7ms	remaining: 71.3ms
19:	learn

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


94:	learn: 0.4913191	total: 81.4ms	remaining: 4.28ms
95:	learn: 0.4909907	total: 82.3ms	remaining: 3.43ms
96:	learn: 0.4906482	total: 83.2ms	remaining: 2.57ms
97:	learn: 0.4901013	total: 84.1ms	remaining: 1.72ms
98:	learn: 0.4894252	total: 84.9ms	remaining: 857us
99:	learn: 0.4891418	total: 85.8ms	remaining: 0us
0:	learn: 0.6719836	total: 925us	remaining: 91.7ms
1:	learn: 0.6568132	total: 1.8ms	remaining: 88.2ms
2:	learn: 0.6427954	total: 2.78ms	remaining: 89.8ms
3:	learn: 0.6300043	total: 3.58ms	remaining: 85.9ms
4:	learn: 0.6179403	total: 4.45ms	remaining: 84.5ms
5:	learn: 0.6106905	total: 5.28ms	remaining: 82.7ms
6:	learn: 0.6031510	total: 6.17ms	remaining: 82ms
7:	learn: 0.5979860	total: 7.22ms	remaining: 83ms
8:	learn: 0.5911677	total: 8.12ms	remaining: 82.1ms
9:	learn: 0.5867709	total: 8.9ms	remaining: 80.1ms
10:	learn: 0.5809553	total: 9.69ms	remaining: 78.4ms
11:	learn: 0.5760519	total: 10.5ms	remaining: 77.1ms
12:	learn: 0.5712759	total: 11.5ms	remaining: 76.7ms
13:	learn: 0.5

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6725729	total: 1.19ms	remaining: 118ms
1:	learn: 0.6579490	total: 1.86ms	remaining: 91.3ms
2:	learn: 0.6441442	total: 2.71ms	remaining: 87.8ms
3:	learn: 0.6337863	total: 3.6ms	remaining: 86.4ms
4:	learn: 0.6221006	total: 4.38ms	remaining: 83.3ms
5:	learn: 0.6139591	total: 5.31ms	remaining: 83.2ms
6:	learn: 0.6053515	total: 6.24ms	remaining: 82.9ms
7:	learn: 0.5977206	total: 7.11ms	remaining: 81.8ms
8:	learn: 0.5913200	total: 7.89ms	remaining: 79.8ms
9:	learn: 0.5854877	total: 8.68ms	remaining: 78.1ms
10:	learn: 0.5790802	total: 9.46ms	remaining: 76.6ms
11:	learn: 0.5743325	total: 10.3ms	remaining: 75.7ms
12:	learn: 0.5698098	total: 11.2ms	remaining: 75ms
13:	learn: 0.5665252	total: 12.1ms	remaining: 74.2ms
14:	learn: 0.5634073	total: 12.9ms	remaining: 73.4ms
15:	learn: 0.5596343	total: 13.8ms	remaining: 72.5ms
16:	learn: 0.5562043	total: 14.7ms	remaining: 71.6ms
17:	learn: 0.5542427	total: 15.5ms	remaining: 70.8ms
18:	learn: 0.5512715	total: 16.4ms	remaining: 69.8ms
19:	lea

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6736733	total: 1.19ms	remaining: 118ms
1:	learn: 0.6595895	total: 1.95ms	remaining: 95.7ms
2:	learn: 0.6446850	total: 2.73ms	remaining: 88.3ms
3:	learn: 0.6354788	total: 3.55ms	remaining: 85.1ms
4:	learn: 0.6240958	total: 4.31ms	remaining: 81.9ms
5:	learn: 0.6157235	total: 5.18ms	remaining: 81.2ms
6:	learn: 0.6079237	total: 6.11ms	remaining: 81.2ms
7:	learn: 0.6009289	total: 6.88ms	remaining: 79.2ms
8:	learn: 0.5943992	total: 7.69ms	remaining: 77.8ms
9:	learn: 0.5890223	total: 8.69ms	remaining: 78.2ms
10:	learn: 0.5829211	total: 9.62ms	remaining: 77.8ms
11:	learn: 0.5781541	total: 10.5ms	remaining: 77.2ms
12:	learn: 0.5749514	total: 11.3ms	remaining: 75.7ms
13:	learn: 0.5723470	total: 12.2ms	remaining: 74.7ms
14:	learn: 0.5699884	total: 13.2ms	remaining: 74.5ms
15:	learn: 0.5678259	total: 13.9ms	remaining: 72.9ms
16:	learn: 0.5651097	total: 14.6ms	remaining: 71.4ms
17:	learn: 0.5615926	total: 15.4ms	remaining: 70.2ms
18:	learn: 0.5593888	total: 16.3ms	remaining: 69.5ms
19:	

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


52:	learn: 0.5148309	total: 45.1ms	remaining: 40ms
53:	learn: 0.5142017	total: 46ms	remaining: 39.1ms
54:	learn: 0.5133744	total: 47ms	remaining: 38.4ms
55:	learn: 0.5126256	total: 47.9ms	remaining: 37.6ms
56:	learn: 0.5120342	total: 48.7ms	remaining: 36.7ms
57:	learn: 0.5115081	total: 49.5ms	remaining: 35.8ms
58:	learn: 0.5111041	total: 50.4ms	remaining: 35ms
59:	learn: 0.5105664	total: 51.2ms	remaining: 34.2ms
60:	learn: 0.5098247	total: 52.2ms	remaining: 33.3ms
61:	learn: 0.5094683	total: 53ms	remaining: 32.5ms
62:	learn: 0.5088660	total: 53.9ms	remaining: 31.6ms
63:	learn: 0.5083890	total: 54.7ms	remaining: 30.8ms
64:	learn: 0.5078701	total: 55.7ms	remaining: 30ms
65:	learn: 0.5075740	total: 56.6ms	remaining: 29.2ms
66:	learn: 0.5068480	total: 57.5ms	remaining: 28.3ms
67:	learn: 0.5063955	total: 58.5ms	remaining: 27.5ms
68:	learn: 0.5061943	total: 59.5ms	remaining: 26.7ms
69:	learn: 0.5058279	total: 60.4ms	remaining: 25.9ms
70:	learn: 0.5055142	total: 61.3ms	remaining: 25ms
71:	lea

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r


Random Forest - Accuracy: 0.6750, F1-Score: 0.6253
Gradient Boosting - Accuracy: 0.6656, F1-Score: 0.5979
XGBoost - Accuracy: 0.6761, F1-Score: 0.6098
LightGBM - Accuracy: 0.6785, F1-Score: 0.6040
CatBoost - Accuracy: 0.6832, F1-Score: 0.6093
SVM - Accuracy: 0.6528, F1-Score: 0.6026
Dummy - Accuracy: 0.4883, F1-Score: 0.4562


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


### K-Fold CV

In [24]:
from sklearn.model_selection import KFold

# Initialize 5-fold cross-validator
kfold = KFold(n_splits=5, shuffle=False)

results2 = {}

# Loop over each model
for model_name, model2 in models2.items():
    accuracies = []
    f1_scores = []

    # Loop over each fold in 5-fold cross-validation
    for train_index, test_index in kfold.split(X, y):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE to the training data to balance class distribution
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

        # Train the model on the oversampled training data
        model2.fit(X_train_oversampled, y_train_oversampled.ravel())

        # Predict the target on the test data
        y_pred = model2.predict(X_test)

        # Evaluate the prediction
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    # Compute average accuracy and F1-score across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    results2[model_name] = {
        'Average Accuracy': avg_accuracy,
        'Average F1-Score (macro)': avg_f1_score
    }

print()
for model_name, metrics in results2.items():
    print(f'{model_name} - Accuracy: {metrics["Average Accuracy"]:.4f}, F1-Score: {metrics["Average F1-Score (macro)"]:.4f}')

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.r

[LightGBM] [Info] Number of positive: 1191, number of negative: 1191
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 2382, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1175, number of negative: 1175
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 2350, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Lig

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


[LightGBM] [Info] Number of positive: 1203, number of negative: 1203
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 2406, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6761370	total: 1.04ms	remaining: 103ms
1:	learn: 0.6611543	total: 1.91ms	remaining: 93.5ms
2:	learn: 0.6444082	total: 2.87ms	remaining: 92.6ms
3:	learn: 0.6295709	total: 3.73ms	remaining: 89.5ms
4:	learn: 0.6194630	total: 4.74ms	remaining: 90ms
5:	learn: 0.6113600	total: 5.59ms	remaining: 87.5ms
6:	learn: 0.6057875	total: 6.18ms	remaining: 82.1ms
7:	learn: 0.6000464	total: 6.97ms	remaining: 80.1ms
8:	learn: 0.5918606	total: 7.74ms	remaining: 78.2ms
9:	learn: 0.5851696	total: 8.53ms	rem

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6690320	total: 1.06ms	remaining: 105ms
1:	learn: 0.6503122	total: 2ms	remaining: 97.9ms
2:	learn: 0.6359477	total: 2.73ms	remaining: 88.3ms
3:	learn: 0.6221520	total: 3.46ms	remaining: 83ms
4:	learn: 0.6121655	total: 4.22ms	remaining: 80.1ms
5:	learn: 0.6023864	total: 4.92ms	remaining: 77.2ms
6:	learn: 0.5939578	total: 5.65ms	remaining: 75.1ms
7:	learn: 0.5883694	total: 6.28ms	remaining: 72.2ms
8:	learn: 0.5805109	total: 7ms	remaining: 70.8ms
9:	learn: 0.5732297	total: 7.75ms	remaining: 69.8ms
10:	learn: 0.5695402	total: 8.54ms	remaining: 69.1ms
11:	learn: 0.5643380	total: 9.26ms	remaining: 67.9ms
12:	learn: 0.5605267	total: 9.99ms	remaining: 66.9ms
13:	learn: 0.5559062	total: 10.7ms	remaining: 66ms
14:	learn: 0.5513940	total: 11.5ms	remaining: 64.9ms
15:	learn: 0.5472348	total: 12.2ms	remaining: 64ms
16:	learn: 0.5430882	total: 12.9ms	remaining: 63.2ms
17:	learn: 0.5404386	total: 13.8ms	remaining: 62.9ms
18:	learn: 0.5371817	total: 14.5ms	remaining: 62ms
19:	learn: 0.53438

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())


0:	learn: 0.6749488	total: 981us	remaining: 97.2ms
1:	learn: 0.6609188	total: 1.92ms	remaining: 94ms
2:	learn: 0.6484952	total: 3.04ms	remaining: 98.4ms
3:	learn: 0.6353003	total: 3.96ms	remaining: 95.1ms
4:	learn: 0.6238569	total: 4.92ms	remaining: 93.5ms
5:	learn: 0.6192477	total: 5.36ms	remaining: 84ms
6:	learn: 0.6114134	total: 6.25ms	remaining: 83ms
7:	learn: 0.6055302	total: 7.27ms	remaining: 83.6ms
8:	learn: 0.5994066	total: 8.2ms	remaining: 82.9ms
9:	learn: 0.5949007	total: 9.13ms	remaining: 82.1ms
10:	learn: 0.5890909	total: 9.98ms	remaining: 80.8ms
11:	learn: 0.5840232	total: 11.1ms	remaining: 81.1ms
12:	learn: 0.5806912	total: 12ms	remaining: 80.6ms
13:	learn: 0.5767384	total: 12.9ms	remaining: 79.3ms
14:	learn: 0.5734428	total: 13.7ms	remaining: 77.9ms
15:	learn: 0.5699424	total: 14.6ms	remaining: 76.4ms
16:	learn: 0.5666100	total: 15.4ms	remaining: 75.3ms
17:	learn: 0.5636922	total: 16.3ms	remaining: 74.2ms
18:	learn: 0.5611353	total: 17.2ms	remaining: 73.3ms
19:	learn: 0.

  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())



Random Forest - Accuracy: 0.6640, F1-Score: 0.6614
Gradient Boosting - Accuracy: 0.6664, F1-Score: 0.6621
XGBoost - Accuracy: 0.6661, F1-Score: 0.6608
LightGBM - Accuracy: 0.6637, F1-Score: 0.6595
CatBoost - Accuracy: 0.6678, F1-Score: 0.6635
SVM - Accuracy: 0.5966, F1-Score: 0.5951
Dummy - Accuracy: 0.4926, F1-Score: 0.4908


  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
  model2.fit(X_train_oversampled, y_train_oversampled.ravel())
