# **Libraries**

In [17]:
import pandas as pd
import numpy as np

from feature_engine.selection import DropCorrelatedFeatures
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder

import functions
import importlib
importlib.reload(functions)

import time

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [39]:
app_train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)

previous_apps = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\previous_application.csv",
    index_col=False
)

# **Variables**

In [40]:
random_state = 101
target = 'TARGET'

## **Drop Collinear Features**

In [42]:
dcf = DropCorrelatedFeatures(threshold=0.7)
previous_apps = dcf.fit_transform(previous_apps)
app_train = dcf.fit_transform(app_train)

## **Reduce Memory Usage**

In [43]:
previous_apps = functions.reduce_memory_usage(previous_apps)

app_train = functions.reduce_memory_usage(app_train)

Memory usage of dataframe is 407.77 MB
Memory usage after optimization is: 277.15 MB
Decreased by 32.0%
Memory usage of dataframe is 187.69 MB
Memory usage after optimization is: 67.74 MB
Decreased by 63.9%


## **Missing Values**

In [34]:
functions.MissingValues(previous_apps)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
RATE_INTEREST_PRIMARY,1664263,99.64,float16
RATE_INTEREST_PRIVILEGED,1664263,99.64,float16
RATE_DOWN_PAYMENT,895844,53.64,float16
AMT_DOWN_PAYMENT,895844,53.64,float32
NAME_TYPE_SUITE,820405,49.12,object
DAYS_LAST_DUE,673065,40.3,float32
DAYS_FIRST_DUE,673065,40.3,float32
DAYS_FIRST_DRAWING,673065,40.3,float32
NFLAG_INSURED_ON_APPROVAL,673065,40.3,float16
AMT_ANNUITY,372235,22.29,float32


## **Drop Features (More than 50% Missing)**

In [44]:
columns = ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'RATE_DOWN_PAYMENT', 'AMT_DOWN_PAYMENT']
previous_apps = previous_apps.drop(columns, axis=1)

In [47]:
num_value = -99999

for col in previous_apps.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    previous_apps[col].fillna(num_value, inplace=True)

In [45]:
num_value = -99999

for col in app_train.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    app_train[col].fillna(num_value, inplace=True)

In [46]:
cat_value = 'UNKNOWN'

for col in previous_apps.select_dtypes(include=['object']).columns:
    previous_apps[col].fillna(cat_value, inplace=True)

In [48]:
cat_value = 'UNKNOWN'

for col in app_train.select_dtypes(include=['object']).columns:
    app_train[col].fillna(cat_value, inplace=True)

## **Aggregation**

In [71]:
aggregated_previous_app = previous_apps.groupby('SK_ID_CURR').agg({
    'SK_ID_PREV': 'count',  
    'AMT_ANNUITY': 'mean',    
    'DAYS_DECISION': ['mean', 'max', 'min'],  
    'CNT_PAYMENT': ['sum'],
    'DAYS_FIRST_DUE': lambda x: x.max() - x.min(),
    'DAYS_LAST_DUE': lambda x: x.max() - x.min()
}).reset_index()

# Flatten multi-level columns generated by the aggregations
aggregated_previous_app.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in aggregated_previous_app.columns]

aggregated_previous_app = aggregated_previous_app.rename(columns={
    'SK_ID_CURR_': 'SK_ID_CURR', 
    'SK_ID_PREV_count': 'NUM_PREVIOUS_APPLICATIONS',
    'AMT_ANNUITY_mean': 'AVG_ANNUITY_AMOUNT',
    'DAYS_DECISION_mean': 'AVG_DAYS_DECISION',
    'DAYS_DECISION_max': 'MAX_DAYS_DECISION',
    'DAYS_DECISION_min': 'MIN_DAYS_DECISION',
    'CNT_PAYMENT_sum': 'SUM_CNT_PAYMENT',
    'DAYS_FIRST_DUE_<lambda>': 'RANGE_DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_<lambda>': 'RANGE_DAYS_LAST_DUE',
})

aggregated_previous_app.head()

Unnamed: 0,SK_ID_CURR,NUM_PREVIOUS_APPLICATIONS,AVG_ANNUITY_AMOUNT,AVG_DAYS_DECISION,MAX_DAYS_DECISION,MIN_DAYS_DECISION,SUM_CNT_PAYMENT,RANGE_DAYS_FIRST_DUE,RANGE_DAYS_LAST_DUE
0,100001,1,3951.0,-1740.0,-1740,-1740,8.0,0.0,0.0
1,100002,1,9251.775391,-606.0,-606,-606,24.0,0.0,0.0
2,100003,3,56553.988281,-1305.0,-746,-2341,30.0,1594.0,1444.0
3,100004,1,5357.25,-815.0,-815,-815,4.0,0.0,0.0
4,100005,2,-47592.898438,-536.0,-315,-757,-inf,99293.0,99533.0


## **Merge Application Train and Previous Application Data**

In [72]:
data = aggregated_previous_app.merge(app_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='inner')

## **Train Test Split**

In [73]:
X = data.drop('TARGET', axis=1)
y = data['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [74]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.61


## **Feature Importance**

In [75]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
2,AVG_ANNUITY_AMOUNT,475
0,SK_ID_CURR,430
4,MAX_DAYS_DECISION,387
5,MIN_DAYS_DECISION,373
3,AVG_DAYS_DECISION,328
7,RANGE_DAYS_FIRST_DUE,294
6,SUM_CNT_PAYMENT,259
8,RANGE_DAYS_LAST_DUE,250
1,NUM_PREVIOUS_APPLICATIONS,204
