# **Libraries**

In [39]:
import importlib
import time
import warnings

import numpy as np
import pandas as pd
from feature_engine.imputation import RandomSampleImputer
from feature_engine.selection import DropCorrelatedFeatures
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

import functions

importlib.reload(functions)

# **Display**

In [22]:
%matplotlib inline

# Notebook-wide pandas display limits.
# max_rows is set exactly once: a second, conflicting value would be dead code
# because whichever assignment runs last silently wins.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 500)

# Silence every warning category for the whole session; the FutureWarning
# filter is kept explicitly even though the blanket filter already covers it.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

# NOTE(review): not referenced by any visible cell — presumably a plot/font
# size used elsewhere; confirm before removing.
size = 20

# **Data**

## **Load Data**

In [89]:
# Read the raw application table from local disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run elsewhere without editing it.
train_csv_path = r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv"

# index_col=False tells pandas not to use the first CSV column as the index.
app_train = pd.read_csv(train_csv_path, index_col=False)

# **Variables**

In [90]:
# Shared settings used by the cells below.
target = 'TARGET'          # name of the label column
random_state = 101         # single seed reused wherever randomness is involved

# Work on a copy so the loaded frame (app_train) is left untouched.
data = app_train.copy()

# **Pre-Processing**

## **Drop Collinear Features**

In [91]:
# Remove redundant numeric predictors whose correlation with another
# predictor is above the 0.7 threshold.
# The label column is deliberately kept out of the search: otherwise the
# selector could discard the label itself, or discard a predictor only because
# it correlates with the label.
# NOTE(review): the selector is fitted on the full dataset, i.e. before any
# train/test split — consider fitting it on training rows only.
candidate_cols = [
    col
    for col in data.select_dtypes(include='number').columns
    if col != target
]
dcf = DropCorrelatedFeatures(variables=candidate_cols, threshold=0.7)
data = dcf.fit_transform(data)

## **Drop Unneeded Features**

In [93]:
# Remove the SK_ID_CURR column (presumably a row identifier with no predictive
# meaning — confirm) by reassignment rather than in-place mutation.
data = data.drop(columns=['SK_ID_CURR'])

## **Data Types**

## **Reduce Memory Usage**

In [95]:
# Pass the frame through the helper from the local `functions` module and keep
# its return value as the new working frame.
# NOTE(review): presumably downcasts column dtypes to shrink memory — confirm
# against functions.reduce_memory_usage, since later dtype-based column
# selection depends on what it produces.
data = functions.reduce_memory_usage(data)

Memory usage of dataframe is 185.34 MB
Memory usage after optimization is: 66.57 MB
Decreased by 64.1%


## **Data Types**

## **Column Separation**

In [96]:
# Split predictor columns by dtype for the preprocessing pipeline.
# The label is removed first: it must not appear in either list, because the
# feature matrix handed to the ColumnTransformer does not contain it.
feature_frame = data.drop(columns=[target])

# select_dtypes('number') covers every integer/float width, so no dtype can be
# missed through a typo in a hand-written list of dtype names.
numerical_cols = feature_frame.select_dtypes(include='number').columns.tolist()

# String-like columns to be one-hot encoded; 'category' is included in case an
# upstream step converted object columns to categoricals.
categorical_cols = feature_frame.select_dtypes(include=['object', 'category']).columns.tolist()

In [97]:
# Per-dtype preprocessing: standardise numeric columns, one-hot encode
# categorical columns.
numerical_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    steps=[
        # handle_unknown='ignore': categories unseen during fit do not raise at
        # transform time; sparse_output=False yields a dense array.
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

column_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]

# Columns that appear in neither list are forwarded unchanged.
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# **LightGBM**

## **Train Test Split**

In [98]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Shuffle rows reproducibly, then hold out a stratified 20% test set so both
# splits keep the same class proportions.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
    stratify=y,
)

### **LGBM**

In [None]:
# Train and evaluate a LightGBM classifier behind the shared preprocessor,
# timing the whole cell.
start_time = time.perf_counter()

# One metric name drives both cross-validation and the held-out evaluation so
# the two reported numbers are directly comparable.
scoring = 'precision_macro'

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        random_state=random_state,
        n_estimators=150,
        max_depth=4,
        learning_rate=0.1,
        verbose=-1,
    )),
])

# Mean of the 5 per-fold scores on the training split. The variable keeps its
# historical name for any later cell that reads it, but the value is the
# `scoring` metric (macro precision), not accuracy.
cross_val_accuracy = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=scoring).mean()
print("Mean cross-validation {}: {:.2f}".format(scoring, cross_val_accuracy))

# Refit on the full training split, then score the held-out test split.
pipeline.fit(X_train, y_train)

# Same metric as cross-validation, evaluated on the test set.
test_metric = get_scorer(scoring)(pipeline, X_test, y_test)
print("Test set {}: {:.2f}".format(scoring, test_metric))

# Pipeline.score delegates to the classifier's default scorer (accuracy).
test_score = pipeline.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

end_time = time.perf_counter()
elapsed_time = (end_time - start_time) / 60
print(f"Elapsed Time: {elapsed_time:.2f} minutes")