In [23]:
#Import the packages we need
import os
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,roc_auc_score,classification_report
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV,KFold
from sklearn import linear_model,tree,linear_model
from lightgbm import LGBMClassifier
import re
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

# Getting the data

In [3]:
data["TARGET"].value_counts()
#U nbalanced data, we may need to use undeersampling or oversampling to improve our performance

0    282686
1     24825
Name: TARGET, dtype: int64

In [4]:
x_train = data.drop(columns="TARGET")
y_train = data["TARGET"]

# Nested CV

In [6]:
dt = tree.DecisionTreeClassifier()
dt_grid ={
 "max_depth" : [3,6,9],
 "max_leaf_nodes": [5,10,15],
 "min_samples_leaf": [5,10,15]
}

In [7]:
lr = linear_model.LinearRegression()

In [8]:
rf = RandomForestClassifier()
rf_grid = {
    "n_estimators":[50,100,300],
    "max_depth":[4,8,16]
}

In [9]:
lgbm = LGBMClassifier()
lgbm_grid = {  
    "num_leaves":[5,10,15],
    "n_estimators":[300,500,1000],
    "learning_rate":[0.3,0.5,0.7]}

In [10]:
inner_cv = KFold(n_splits=3,shuffle = True,random_state = 1)
outer_cv = KFold(n_splits=3,shuffle = True,random_state = 1)
scoring = "roc_auc"

In [11]:
dt_clf = GridSearchCV(estimator = dt,param_grid = dt_grid, scoring = scoring, cv = inner_cv)
lr_reg = lr
rf_clf = GridSearchCV(estimator = rf,param_grid = rf_grid, scoring = scoring, cv = inner_cv)
lgbm_clf = GridSearchCV(estimator = lgbm,param_grid = lgbm_grid, scoring = scoring, cv = inner_cv)

In [14]:
dt_score = cross_val_score(estimator = tree.DecisionTreeClassifier(),X=x_train,y=y_train, scoring = scoring, cv = outer_cv)

In [15]:
lr_score = cross_val_score(estimator = lr,X=x_train,y=y_train, scoring = scoring, cv = outer_cv)

In [16]:
rf_score = cross_val_score(estimator = rf_clf,X=x_train,y=y_train,scoring = scoring, cv = outer_cv)

In [17]:
lgbm_score = cross_val_score(estimator = lgbm_clf,X=x_train,y=y_train,scoring = scoring, cv = outer_cv)

0.644
0.692
0.763
0.776


In [43]:
print(round(dt_score.mean()),3)
print(round(lr_score.mean()),3)
print(round(rf_score.mean()),3)
print(round(lgbm_score.mean()),3)

0.644
0.692
0.763
0.776


In [48]:
for i in [0.644,0.692,0.763,0.776]:
    print(i)

0.644
0.692
0.763
0.776


# Oversampling and Undersampling

In [None]:
#Because the target variable is unbalanced, we want to see if undersampling or oversampling will improve the performance,
#and then tune the hyparameters for the lgbm model

In [None]:
#Undersampling

In [20]:
for i in [25000,50000,75000,100000,120000,180000,282686]:
    df1 = data[data["TARGET"]==0].sample(n=i,replace=False,random_state=123)
    df2 = data[data["TARGET"]==1].sample(n=24825,replace=False,random_state=123)
    df = pd.concat([df1,df2],axis=0)
    X = df.drop(columns="TARGET")
    Y = df["TARGET"]
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=123)
    lgbm = LGBMClassifier()
    lgbm.fit(x_train,y_train)
    score = roc_auc_score(y_test.values,lgbm.predict_proba(x_test)[:,1])
    print(round(score,3))

0.783
0.773
0.785
0.781
0.781
0.772
0.776


In [5]:
#Oversampling

In [21]:
for i in [25000,50000,75000,100000,120000]:
    df1 = data[data["TARGET"]==0]
    df2 = data[data["TARGET"]==1].sample(n=24825,replace=True,random_state=123)
    df = pd.concat([df1,df2],axis=0)
    X = df.drop(columns="TARGET")
    Y = df["TARGET"]
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=123)
    lgbm = LGBMClassifier()
    lgbm.fit(x_train,y_train)
    score = roc_auc_score(y_test.values,lgbm.predict_proba(x_test)[:,1])
    print(score)

0.784
0.783
0.786
0.784
0.779
0.774
0.776


# Hyperparameter Tuning

In [None]:
#Then we run the model to tune the hyperparameter on the basis of oversampling

In [22]:
lgbm_clf = GridSearchCV(estimator = lgbm,param_grid = lgbm_grid, scoring = scoring, cv = inner_cv)
lgbm_clf.fit(x_train,y_train)

GridSearchCV(cv=KFold(n_splits=3, random_state=1, shuffle=True),
             estimator=LGBMClassifier(),
             param_grid={'learning_rate': [0.3, 0.5, 0.7],
                         'n_estimators': [300, 500, 1000],
                         'num_leaves': [5, 10, 15]},
             scoring='roc_auc')

In [23]:
lgbm_clf.best_params_

{'learning_rate': 0.3, 'n_estimators': 1000, 'num_leaves': 15}

In [36]:
lgbm_clf.best_score_

0.795

# Output the file

In [38]:
sub = pd.read_csv("/kaggle/input/data-feature-imp/test_feature_imp.csv")
sub["TARGET"] = lgbm_clf.predict_proba(sub)[:,1]
sub = sub[["SK_ID_CURR","TARGET"]]
sub.to_csv(f"newmodels47.csv",index=0)




In [54]:
x = 123
print(x)

123


In [55]:
print(x)

123
