In [7]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, roc_curve, auc
import matplotlib.pyplot as pltx
import seaborn as sns
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures

In [13]:
# Load the train and test CSV files that sit next to the notebook.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15779985.0,Nwankwo,678.0,France,Male,29.0,4.0,0.00,3.0,1.0,0.0,180626.36,0.0
1,1,15650086.0,Ch'in,687.0,France,Female,34.0,1.0,0.00,2.0,0.0,1.0,63736.17,0.0
2,2,15733602.0,Thompson,682.0,France,Female,52.0,6.0,0.00,3.0,0.0,0.0,179655.87,1.0
3,3,15645794.0,Macleod,753.0,Germany,Male,44.0,6.0,83347.25,2.0,1.0,0.0,161407.48,0.0
4,4,15633840.0,Hsia,544.0,Germany,Female,55.0,0.0,107747.57,1.0,1.0,0.0,176580.86,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,15807989.0,Ch'iu,585.0,France,Male,33.0,3.0,0.00,1.0,1.0,0.0,54191.38,0.0
14996,14996,15731781.0,Ch'ang,678.0,France,Male,34.0,6.0,0.00,2.0,1.0,1.0,53437.10,0.0
14997,14997,15667093.0,Goliwe,678.0,France,Female,54.0,4.0,0.00,1.0,1.0,0.0,147720.29,1.0
14998,14998,15732644.0,Fanucci,705.0,Spain,Female,40.0,7.0,0.00,2.0,1.0,0.0,149550.95,0.0


In [11]:
def split_data(df, target="Exited", test_size=0.2, random_state=42):
    """Split a labelled frame into stratified train and validation frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default "Exited"
        Name of the label column; it is also used for stratification.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as the validation share.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` as the shuffling seed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data_df, val_data_df)``; each holds the feature columns
        followed by the ``target`` column.
    """
    features = df.drop(columns=target)
    labels = df[target]

    # Stratify on the labels so both parts keep the same class balance.
    train_x, val_x, train_y, val_y = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    # Re-attach the labels; features and labels of each part share the same
    # index, so the column-wise concat lines the rows up.
    train_data_df = pd.concat([train_x, train_y], axis=1)
    val_data_df = pd.concat([val_x, val_y], axis=1)

    return train_data_df, val_data_df

In [12]:
def delete_columns(dfs, columns_to_delete):
    """Drop the given columns from every frame in ``dfs``.

    ``DataFrame.drop`` returns a new frame, so the results have to be
    collected; rebinding the loop variable alone would leave the frames in
    ``dfs`` untouched and return them with all columns still present.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to process. They are not modified.
    columns_to_delete : label or list of labels
        Column(s) to remove from each frame.

    Returns
    -------
    list of pandas.DataFrame
        New frames, in the same order as ``dfs``, without the listed columns.
    """
    return [df.drop(columns=columns_to_delete) for df in dfs]

In [14]:
def separate_dfs(df, target):
    """Return ``(features, labels)`` for ``df``.

    ``labels`` is the ``target`` column and ``features`` is ``df`` with that
    column dropped. ``df`` itself is not modified.
    """
    labels = df[target]
    features = df.drop(columns=[target])
    return features, labels

In [15]:
def get_cloumns_by_types(df):
    """List the column names of ``df`` grouped by dtype family.

    Returns a pair ``(numeric_cols, categorical_cols)``: the names of the
    columns whose dtype is a NumPy number, and the names of the columns with
    ``object`` dtype, each in the order they appear in ``df``.
    """
    numeric_part = df.select_dtypes(include=np.number)
    object_part = df.select_dtypes(include='object')

    return numeric_part.columns.tolist(), object_part.columns.tolist()

In [16]:
def get_preprocessor(numeric_cols,categorical_cols):
    """Build an unfitted ``ColumnTransformer`` for the two column groups.

    ``numeric_cols`` are routed through a one-step pipeline holding a
    ``MinMaxScaler`` (step name ``'scaler'``, branch name ``'num'``);
    ``categorical_cols`` are routed through a one-step pipeline holding a
    ``OneHotEncoder(handle_unknown='ignore')`` (step name ``'onehot'``,
    branch name ``'cat'``).
    """
    # One single-step pipeline per column group.
    scale_numeric = Pipeline(steps=[('scaler', MinMaxScaler())])
    encode_categorical = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
    )

    # Combine both branches; the branch names are looked up later through
    # ``named_transformers_``.
    branches = [
        ('num', scale_numeric, numeric_cols),
        ('cat', encode_categorical, categorical_cols),
    ]
    return ColumnTransformer(transformers=branches)

In [17]:
def preprocess_into_df(preprocessor, df,numeric_cols,categorical_cols, fit=False):
    """Run ``preprocessor`` on ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    preprocessor : object
        Transformer exposing ``fit_transform``/``transform`` and a
        ``named_transformers_['cat']['onehot']`` encoder, such as the one
        built by ``get_preprocessor``.
    df : pandas.DataFrame
        Raw feature frame.
    numeric_cols : sequence of str
        Names of the numeric columns, in the order the preprocessor emits them.
    categorical_cols : sequence of str
        Names of the categorical columns given to the one-hot encoder.
    fit : bool, default False
        If True the preprocessor is fitted on ``df`` before transforming;
        otherwise it must already be fitted.

    Returns
    -------
    pandas.DataFrame
        The transformed values, with the numeric column names followed by the
        one-hot feature names. The frame gets a fresh default index; the
        index of ``df`` is not carried over.
    """
    if fit:
        processed = preprocessor.fit_transform(df)
    else:
        processed = preprocessor.transform(df)

    # The transformer may hand back a scipy sparse matrix when the one-hot
    # block dominates the output; pandas cannot lay that out as a regular
    # table, so convert it to a dense array first.
    if hasattr(processed, "toarray"):
        processed = processed.toarray()

    one_hot_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
    # list(...) on both sides so any sequence type works for numeric_cols.
    all_names = list(numeric_cols) + list(one_hot_names)

    return pd.DataFrame(processed, columns=all_names)

In [None]:
def create_and_train_model(model, x_df, y_df):
    