In [13]:
from data_preprocessing import load_data_in_chunks

application_train = load_data_in_chunks('data/application_train.csv')

In [14]:
import pandas as pd

# Initialize an empty dataframe
df_features = pd.DataFrame()

# Adding basic identifiers
df_features['SK_ID_CURR'] = application_train['SK_ID_CURR']

# Add the target feature (TARGET)
df_features['TARGET'] = application_train['TARGET']

# Adding features based on correlation analysis
df_features['AMT_CREDIT'] = application_train['AMT_CREDIT']
df_features['AMT_ANNUITY'] = application_train['AMT_ANNUITY']
df_features['AMT_GOODS_PRICE'] = application_train['AMT_GOODS_PRICE']
df_features['AMT_INCOME_TOTAL'] = application_train['AMT_INCOME_TOTAL']

# Adding external assessments
df_features['EXT_SOURCE_2'] = application_train['EXT_SOURCE_2']
df_features['EXT_SOURCE_3'] = application_train['EXT_SOURCE_3']

# Creating new features based on existing data
df_features['CREDIT_INCOME_RATIO'] = df_features['AMT_CREDIT'] / df_features['AMT_INCOME_TOTAL']
df_features['ANNUITY_INCOME_RATIO'] = df_features['AMT_ANNUITY'] / df_features['AMT_INCOME_TOTAL']
df_features['CREDIT_TERM'] = df_features['AMT_CREDIT'] / df_features['AMT_ANNUITY']

# Feature Engineering
df_features['ANNUITY_CREDIT_RATIO'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']
df_features['AGE_YEARS'] = application_train['DAYS_BIRTH'] / -365
df_features['YEARS_EMPLOYED'] = application_train['DAYS_EMPLOYED'] / -365

df_features.head()

Unnamed: 0,SK_ID_CURR,TARGET,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,EXT_SOURCE_2,EXT_SOURCE_3,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,CREDIT_TERM,ANNUITY_CREDIT_RATIO,AGE_YEARS,YEARS_EMPLOYED
0,100002,1,406597.5,24700.5,351000.0,202500.0,0.262949,0.139376,2.007889,0.121978,16.461103,0.060749,25.920548,1.745205
1,100003,0,1293502.5,35698.5,1129500.0,270000.0,0.622246,,4.79075,0.132217,36.234085,0.027598,45.931507,3.254795
2,100004,0,135000.0,6750.0,135000.0,67500.0,0.555912,0.729567,2.0,0.1,20.0,0.05,52.180822,0.616438
3,100006,0,312682.5,29686.5,297000.0,135000.0,0.650442,,2.316167,0.2199,10.532818,0.094941,52.068493,8.326027
4,100007,0,513000.0,21865.5,513000.0,121500.0,0.322738,,4.222222,0.179963,23.461618,0.042623,54.608219,8.323288


In [15]:
bureau = load_data_in_chunks('data/bureau.csv')

# Aggregation of data from bureau

bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'DAYS_CREDIT': ['mean', 'max', 'min'],
    'CREDIT_DAY_OVERDUE': ['mean', 'max'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean', 'max'],
    'CNT_CREDIT_PROLONG': ['sum'],
    'AMT_CREDIT_SUM': ['mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean', 'sum'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'DAYS_CREDIT_UPDATE': ['mean', 'max', 'min'],
    'DAYS_CREDIT_ENDDATE': ['mean', 'max', 'min'],
}).reset_index()

# Rename columns after aggregation
bureau_agg.columns = ['_'.join(col).strip() for col in bureau_agg.columns.values]
bureau_agg.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'}, inplace=True)

# Merge with base dataframe
df_features = df_features.merge(bureau_agg, on='SK_ID_CURR', how='left')
df_features.head()

Unnamed: 0,SK_ID_CURR,TARGET,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,EXT_SOURCE_2,EXT_SOURCE_3,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,...,AMT_CREDIT_SUM_OVERDUE_mean,AMT_CREDIT_SUM_OVERDUE_sum,AMT_CREDIT_SUM_LIMIT_mean,AMT_CREDIT_SUM_LIMIT_sum,DAYS_CREDIT_UPDATE_mean,DAYS_CREDIT_UPDATE_max,DAYS_CREDIT_UPDATE_min,DAYS_CREDIT_ENDDATE_mean,DAYS_CREDIT_ENDDATE_max,DAYS_CREDIT_ENDDATE_min
0,100002,1,406597.5,24700.5,351000.0,202500.0,0.262949,0.139376,2.007889,0.121978,...,0.0,0.0,7997.141113,31988.564453,-499.875,-7.0,-1185.0,-349.0,780.0,-1072.0
1,100003,0,1293502.5,35698.5,1129500.0,270000.0,0.622246,,4.79075,0.132217,...,0.0,0.0,202500.0,810000.0,-816.0,-43.0,-2131.0,-544.5,1216.0,-2434.0
2,100004,0,135000.0,6750.0,135000.0,67500.0,0.555912,0.729567,2.0,0.1,...,0.0,0.0,0.0,0.0,-532.0,-382.0,-682.0,-488.5,-382.0,-595.0
3,100006,0,312682.5,29686.5,297000.0,135000.0,0.650442,,2.316167,0.2199,...,,,,,,,,,,
4,100007,0,513000.0,21865.5,513000.0,121500.0,0.322738,,4.222222,0.179963,...,0.0,0.0,0.0,0.0,-783.0,-783.0,-783.0,-783.0,-783.0,-783.0


In [16]:
credit_card_balance = load_data_in_chunks('data/credit_card_balance.csv')

# Aggregation of data from credit_card_balance
credit_card_balance_agg = credit_card_balance.groupby('SK_ID_CURR').agg({
    'MONTHS_BALANCE': ['mean', 'max', 'min'],
    'AMT_BALANCE': ['mean', 'max', 'min'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['mean', 'max', 'min'],
    'AMT_DRAWINGS_ATM_CURRENT': ['mean', 'max', 'min'],
    'AMT_DRAWINGS_CURRENT': ['mean', 'max', 'min'],
    'AMT_INST_MIN_REGULARITY': ['mean', 'max', 'min'],
    'AMT_PAYMENT_TOTAL_CURRENT': ['mean', 'max', 'min'],
    'CNT_DRAWINGS_ATM_CURRENT': ['mean', 'max', 'min'],
}).reset_index()

credit_card_balance_agg.columns = ['_'.join(col).strip() for col in credit_card_balance_agg.columns.values]
credit_card_balance_agg.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'}, inplace=True)

df_features = df_features.merge(credit_card_balance_agg, on='SK_ID_CURR', how='left')
df_features.head()

Unnamed: 0,SK_ID_CURR,TARGET,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,EXT_SOURCE_2,EXT_SOURCE_3,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,...,AMT_DRAWINGS_CURRENT_min,AMT_INST_MIN_REGULARITY_mean,AMT_INST_MIN_REGULARITY_max,AMT_INST_MIN_REGULARITY_min,AMT_PAYMENT_TOTAL_CURRENT_mean,AMT_PAYMENT_TOTAL_CURRENT_max,AMT_PAYMENT_TOTAL_CURRENT_min,CNT_DRAWINGS_ATM_CURRENT_mean,CNT_DRAWINGS_ATM_CURRENT_max,CNT_DRAWINGS_ATM_CURRENT_min
0,100002,1,406597.5,24700.5,351000.0,202500.0,0.262949,0.139376,2.007889,0.121978,...,,,,,,,,,,
1,100003,0,1293502.5,35698.5,1129500.0,270000.0,0.622246,,4.79075,0.132217,...,,,,,,,,,,
2,100004,0,135000.0,6750.0,135000.0,67500.0,0.555912,0.729567,2.0,0.1,...,,,,,,,,,,
3,100006,0,312682.5,29686.5,297000.0,135000.0,0.650442,,2.316167,0.2199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,100007,0,513000.0,21865.5,513000.0,121500.0,0.322738,,4.222222,0.179963,...,,,,,,,,,,


In [17]:
pos_cash_balance = load_data_in_chunks('data/POS_CASH_balance.csv')

# Aggregation of data from pos_cash_balance
pos_cash_balance_agg = pos_cash_balance.groupby('SK_ID_CURR').agg({
    'MONTHS_BALANCE': ['mean', 'max', 'min'],
    'CNT_INSTALMENT': ['mean', 'max', 'min'],
    'CNT_INSTALMENT_FUTURE': ['mean', 'max', 'min'],
    'NAME_CONTRACT_STATUS': ['nunique'],  # количество уникальных статусов контрактов
}).reset_index()

pos_cash_balance_agg.columns = ['_'.join(col).strip() for col in pos_cash_balance_agg.columns.values]
pos_cash_balance_agg.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'}, inplace=True)

df_features = df_features.merge(pos_cash_balance_agg, on='SK_ID_CURR', how='left')
df_features.head()

Unnamed: 0,SK_ID_CURR,TARGET,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,EXT_SOURCE_2,EXT_SOURCE_3,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,...,MONTHS_BALANCE_mean_y,MONTHS_BALANCE_max_y,MONTHS_BALANCE_min_y,CNT_INSTALMENT_mean,CNT_INSTALMENT_max,CNT_INSTALMENT_min,CNT_INSTALMENT_FUTURE_mean,CNT_INSTALMENT_FUTURE_max,CNT_INSTALMENT_FUTURE_min,NAME_CONTRACT_STATUS_nunique
0,100002,1,406597.5,24700.5,351000.0,202500.0,0.262949,0.139376,2.007889,0.121978,...,-10.0,-1.0,-19.0,24.0,24.0,24.0,15.0,24.0,6.0,1.0
1,100003,0,1293502.5,35698.5,1129500.0,270000.0,0.622246,,4.79075,0.132217,...,-43.785714,-18.0,-77.0,10.107142,12.0,6.0,5.785714,12.0,0.0,2.0
2,100004,0,135000.0,6750.0,135000.0,67500.0,0.555912,0.729567,2.0,0.1,...,-25.5,-24.0,-27.0,3.75,4.0,3.0,2.25,4.0,0.0,2.0
3,100006,0,312682.5,29686.5,297000.0,135000.0,0.650442,,2.316167,0.2199,...,-9.619048,-1.0,-20.0,12.0,48.0,1.0,8.65,48.0,0.0,3.0
4,100007,0,513000.0,21865.5,513000.0,121500.0,0.322738,,4.222222,0.179963,...,-33.636364,-1.0,-77.0,15.333333,24.0,10.0,8.969697,24.0,0.0,3.0


In [18]:
installments_payments = load_data_in_chunks('data/installments_payments.csv')

# Aggregation of data from installments payments
installments_payments_agg = installments_payments.groupby('SK_ID_CURR').agg({
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'NUM_INSTALMENT_NUMBER': ['mean', 'max', 'min'],
    'DAYS_INSTALMENT': ['mean', 'max', 'min'],
    'DAYS_ENTRY_PAYMENT': ['mean', 'max', 'min'],
    'AMT_INSTALMENT': ['mean', 'max', 'min'],
    'AMT_PAYMENT': ['mean', 'max', 'min'],
}).reset_index()

installments_payments_agg.columns = ['_'.join(col).strip() for col in installments_payments_agg.columns.values]
installments_payments_agg.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'}, inplace=True)

df_features = df_features.merge(installments_payments_agg, on='SK_ID_CURR', how='left')
df_features.head()

Unnamed: 0,SK_ID_CURR,TARGET,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,EXT_SOURCE_2,EXT_SOURCE_3,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,...,DAYS_INSTALMENT_min,DAYS_ENTRY_PAYMENT_mean,DAYS_ENTRY_PAYMENT_max,DAYS_ENTRY_PAYMENT_min,AMT_INSTALMENT_mean,AMT_INSTALMENT_max,AMT_INSTALMENT_min,AMT_PAYMENT_mean,AMT_PAYMENT_max,AMT_PAYMENT_min
0,100002,1,406597.5,24700.5,351000.0,202500.0,0.262949,0.139376,2.007889,0.121978,...,-565.0,-315.421051,-49.0,-587.0,11559.24707,53093.746094,9251.775391,11559.24707,53093.746094,9251.775391
1,100003,0,1293502.5,35698.5,1129500.0,270000.0,0.622246,,4.79075,0.132217,...,-2310.0,-1385.319946,-544.0,-2324.0,64754.585938,560835.375,6662.970215,64754.585938,560835.375,6662.970215
2,100004,0,135000.0,6750.0,135000.0,67500.0,0.555912,0.729567,2.0,0.1,...,-784.0,-761.666687,-727.0,-795.0,7096.154785,10573.964844,5357.25,7096.154785,10573.964844,5357.25
3,100006,0,312682.5,29686.5,297000.0,135000.0,0.650442,,2.316167,0.2199,...,-545.0,-271.625,-12.0,-575.0,62947.089844,691786.875,2482.919922,62947.089844,691786.875,2482.919922
4,100007,0,513000.0,21865.5,513000.0,121500.0,0.322738,,4.222222,0.179963,...,-2326.0,-1032.242432,-14.0,-2318.0,12666.444336,22678.785156,1821.780029,12214.05957,22678.785156,0.18


In [19]:
previous_application = load_data_in_chunks('data/previous_application.csv')

# Aggregate data from previous_application
previous_application_agg = previous_application.groupby('SK_ID_CURR').agg({
    'AMT_ANNUITY': ['mean', 'max', 'min'],
    'AMT_APPLICATION': ['mean', 'max', 'min'],
    'AMT_CREDIT': ['mean', 'max', 'min'],
    'AMT_DOWN_PAYMENT': ['mean', 'max', 'min'],
    'AMT_GOODS_PRICE': ['mean', 'max', 'min'],
    'HOUR_APPR_PROCESS_START': ['mean', 'max', 'min'],
    'RATE_DOWN_PAYMENT': ['mean', 'max', 'min'],
    'CNT_PAYMENT': ['mean', 'max', 'min'],
}).reset_index()

previous_application_agg.columns = ['_'.join(col).strip() for col in previous_application_agg.columns.values]
previous_application_agg.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'}, inplace=True)

df_features = df_features.merge(previous_application_agg, on='SK_ID_CURR', how='left')
df_features.head()

Unnamed: 0,SK_ID_CURR,TARGET,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,EXT_SOURCE_2,EXT_SOURCE_3,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,...,AMT_GOODS_PRICE_min,HOUR_APPR_PROCESS_START_mean,HOUR_APPR_PROCESS_START_max,HOUR_APPR_PROCESS_START_min,RATE_DOWN_PAYMENT_mean,RATE_DOWN_PAYMENT_max,RATE_DOWN_PAYMENT_min,CNT_PAYMENT_mean,CNT_PAYMENT_max,CNT_PAYMENT_min
0,100002,1,406597.5,24700.5,351000.0,202500.0,0.262949,0.139376,2.007889,0.121978,...,179055.0,9.0,9.0,9.0,0.0,0.0,0.0,24.0,24.0,24.0
1,100003,0,1293502.5,35698.5,1129500.0,270000.0,0.622246,,4.79075,0.132217,...,68809.5,14.666667,17.0,12.0,0.05003,0.100061,0.0,10.0,12.0,6.0
2,100004,0,135000.0,6750.0,135000.0,67500.0,0.555912,0.729567,2.0,0.1,...,24282.0,5.0,5.0,5.0,0.212008,0.212008,0.212008,4.0,4.0,4.0
3,100006,0,312682.5,29686.5,297000.0,135000.0,0.650442,,2.316167,0.2199,...,26912.339844,14.666667,15.0,12.0,0.163412,0.21783,0.108994,23.0,48.0,0.0
4,100007,0,513000.0,21865.5,513000.0,121500.0,0.322738,,4.222222,0.179963,...,17176.5,12.333333,15.0,8.0,0.159516,0.21889,0.100143,20.666666,48.0,10.0


In [20]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 110 entries, SK_ID_CURR to CNT_PAYMENT_min
dtypes: float32(80), float64(28), int32(1), int8(1)
memory usage: 161.0 MB


In [21]:
# Remove columns with more than 50% missing values
threshold = 0.5
df_features = df_features.loc[:, df_features.isnull().mean() < threshold]
# Remove rows with more than 50% missing values
df_features = df_features.loc[df_features.isnull().mean(axis=1) < threshold]
# Check for remaining missing values
missing_values = df_features.isnull().sum()
missing_values_percentage = (missing_values / df_features.shape[0]) * 100

# Create a dataframe with the number of missing values
missing_values_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_values_percentage
})

# Filtering columns with missing values
missing_values_df = missing_values_df[missing_values_df['Missing Values'] > 0]

# Sort by percentage of missing values
missing_values_df = missing_values_df.sort_values(by='Percentage', ascending=False)

# Displaying a dataframe with missing values
missing_values_df

Unnamed: 0,Missing Values,Percentage
AMT_CREDIT_MAX_OVERDUE_mean,115918,39.653810
AMT_CREDIT_MAX_OVERDUE_max,115918,39.653810
AMT_CREDIT_SUM_LIMIT_mean,60981,20.860686
EXT_SOURCE_3,57177,19.559395
AMT_CREDIT_SUM_DEBT_mean,48206,16.490550
...,...,...
AMT_GOODS_PRICE,267,0.091337
ANNUITY_CREDIT_RATIO,12,0.004105
CREDIT_TERM,12,0.004105
ANNUITY_INCOME_RATIO,12,0.004105


In [22]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Приведение типов данных перед манипуляциями
df_features['SK_ID_CURR'] = df_features['SK_ID_CURR'].astype('int32')
df_features['TARGET'] = df_features['TARGET'].astype('int8')
float32_columns = df_features.columns.drop(['SK_ID_CURR', 'TARGET'])
df_features[float32_columns] = df_features[float32_columns].astype('float32')

# Separate features and target
X = df_features.drop(columns=['TARGET'])
y = df_features['TARGET']

# Apply RandomUnderSampler to balance the data
rus = RandomUnderSampler(sampling_strategy=0.3, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Combine the resampled data back into a single DataFrame
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

# Increase the number of iterations and set tolerance for early stopping
imputer = IterativeImputer(max_iter=10, tol=1e-3, random_state=0)

# Fit and transform the data
df_features_imputed = pd.DataFrame(imputer.fit_transform(df_resampled), columns=df_resampled.columns)

# Convert TARGET back to int8
df_features_imputed['TARGET'] = df_features_imputed['TARGET'].astype('int8')

# Convert other features back to float32
df_features_imputed[float32_columns] = df_features_imputed[float32_columns].astype('float32')

# Check for remaining missing values
missing_values = df_features_imputed.isnull().sum().sum()
print(f'Total missing values after imputation: {missing_values}')

# Save the imputed data
df_features_imputed.to_csv('data/df_features_imputed.csv', index=False)



Total missing values after imputation: 0


In [23]:
df_features_imputed['TARGET'].value_counts()

TARGET
0    79803
1    23941
Name: count, dtype: int64

In [24]:
import logging
from pycaret.classification import setup, compare_models, tune_model, finalize_model, save_model

# df_features_imputed = pd.read_csv('data/df_features_imputed.csv')

# Configure logging to write to a file
logging.basicConfig(
    filename='model_training.log',  # specify the log file name
    filemode='a',  # append to the file instead of overwriting
    format='%(asctime)s - %(levelname)s - %(message)s',  # log format
    level=logging.INFO  # log level
)

# Create and configure the model
logging.info("Setting up the model...")
clf_setup = setup(data=df_features_imputed, target='TARGET', session_id=42, n_jobs=-1, use_gpu=True, fold=3,
                  data_split_stratify=True, fix_imbalance=True, verbose=True, log_experiment=True,
                  experiment_name='classification_experiment')

# Define the list of models to compare
models = ['lightgbm', 'rf', 'et']

# Compare models and select the best model
logging.info("Comparing models...")
try:
    best_model = compare_models(include=models, sort='F1')
    logging.info(f'Best model: {best_model}')
except Exception as e:
    logging.error(f'Error in compare_models: {e}')
    best_model = None

# Check if best_model is not None and proceed with tuning
if best_model:
    # Tune the best model
    logging.info(f"Tuning model: {best_model}")
    try:
        tuned_model = tune_model(best_model, n_iter=10, optimize='F1')  # Increase n_iter for more thorough tuning
        logging.info(f'Tuned model: {tuned_model}')
    except Exception as e:
        logging.error(f'Error in tune_model: {e}')
        tuned_model = best_model  # Fall back to the best model without tuning

    # Finalize the tuned model
    logging.info(f"Finalizing model: {tuned_model}")
    try:
        final_model = finalize_model(tuned_model)
        logging.info(f'Final model: {final_model}')
    except Exception as e:
        logging.error(f'Error in finalize_model: {e}')
        final_model = tuned_model  # Fall back to the tuned model without finalization

    # Save the final model
    logging.info("Saving the final model...")
    try:
        model_path = save_model(final_model, 'best_model_pycaret')
        logging.info(f'Model saved at: {model_path}')
    except Exception as e:
        logging.error(f'Error in save_model: {e}')
else:
    logging.error("No best model found to tune and finalize.")

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000

Unnamed: 0,Description,Value
0,Session id,42
1,Target,TARGET
2,Target type,Binary
3,Original data shape,"(103744, 86)"
4,Transformed data shape,"(142846, 86)"
5,Transformed train set shape,"(111722, 86)"
6,Transformed test set shape,"(31124, 86)"
7,Numeric features,85
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8124,0.8041,0.3858,0.6601,0.4869,0.3816,0.4023,6.88
rf,Random Forest Classifier,0.7938,0.7765,0.4009,0.5767,0.4729,0.3499,0.3588,37.2167
et,Extra Trees Classifier,0.7809,0.7487,0.3788,0.5357,0.4438,0.3123,0.3196,13.78


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8128,0.8081,0.3938,0.6577,0.4927,0.3866,0.4058
1,0.8126,0.8023,0.4088,0.6494,0.5018,0.3936,0.4096
2,0.816,0.8072,0.4157,0.6614,0.5105,0.4044,0.4209
Mean,0.8138,0.8059,0.4061,0.6562,0.5016,0.3949,0.4121
Std,0.0016,0.0026,0.0091,0.005,0.0073,0.0073,0.0064


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 37240, number of negative: 37240
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 21619
[LightGBM] [Info] Number of data points in the train set: 74480, number of used features: 85
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 77 dense feature groups (5.68 MB) transferred to GPU in 0.010033 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 77 dense feature groups (3.42 MB) transferred to GPU in 0.005804 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 77 dense feature groups (3.4