In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

In [None]:
# Load the labelled training data from train.csv (path is relative to the
# notebook's working directory).
train_df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows to check the column names and value formats.
train_df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7.0,no,no,cellular,25.0,aug,117.0,3.0,-1.0,0.0,unknown,0.0
1,1,38,blue-collar,married,secondary,no,514.0,no,no,unknown,18.0,jun,185.0,1.0,-1.0,0.0,unknown,0.0
2,2,36,blue-collar,married,secondary,no,602.0,yes,no,unknown,14.0,may,111.0,2.0,-1.0,0.0,unknown,0.0
3,3,27,student,single,secondary,no,34.0,yes,no,unknown,28.0,may,10.0,2.0,-1.0,0.0,unknown,0.0
4,4,26,technician,married,secondary,no,889.0,yes,no,cellular,3.0,feb,902.0,1.0,-1.0,0.0,unknown,1.0


In [None]:
# (rows, columns) of the raw training frame.
train_df.shape

(371708, 18)

In [None]:
# Load the test data from test.csv; its 'id' column is reused later when
# building the submission.
test_df = pd.read_csv('test.csv')

In [None]:
# Count missing values per column of the training frame.
train_df.isna().sum()

Unnamed: 0,0
id,0
age,0
job,0
marital,1
education,1
default,1
balance,1
housing,1
loan,1
contact,1


In [None]:
# Print per-column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371708 entries, 0 to 371707
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         371708 non-null  int64  
 1   age        371708 non-null  int64  
 2   job        371708 non-null  object 
 3   marital    371707 non-null  object 
 4   education  371707 non-null  object 
 5   default    371707 non-null  object 
 6   balance    371707 non-null  float64
 7   housing    371707 non-null  object 
 8   loan       371707 non-null  object 
 9   contact    371707 non-null  object 
 10  day        371707 non-null  float64
 11  month      371707 non-null  object 
 12  duration   371707 non-null  float64
 13  campaign   371707 non-null  float64
 14  pdays      371707 non-null  float64
 15  previous   371707 non-null  float64
 16  poutcome   371707 non-null  object 
 17  y          371707 non-null  float64
dtypes: float64(7), int64(2), object(9)
memory usage: 51.0+ MB


In [None]:
# Class balance of the target, expressed as a percentage of all labelled rows.
train_df['y'].value_counts(normalize=True).mul(100)

Unnamed: 0_level_0,proportion
y,Unnamed: 1_level_1
0.0,87.988927
1.0,12.011073


In [None]:
# Keep only the rows that have a label, then separate features from target.
train_df = train_df.loc[train_df['y'].notna()]
X = train_df.drop(columns=['y'])
y = train_df['y']

In [None]:
# X_test is another name for test_df (no copy is made), so it still contains
# the 'id' column.
X_test = test_df

In [None]:
# Hold out 20% of the labelled rows for validation.
# The target is imbalanced (roughly 88% / 12%), so stratify on y to keep the
# same class proportions in the training and validation splits.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Split the feature columns into numeric and categorical groups for the
# ColumnTransformer.
#
# The lists are derived once, from the training features only. Looping over
# X_test as well and appending to the same lists would add every column a
# second time, so each feature would be passed to the model twice.
# 'id' is a row identifier rather than a predictor, so it is excluded.
non_feature_cols = {'id'}
feature_cols = [col for col in X_train.columns if col not in non_feature_cols]

# Object / category dtypes are one-hot encoded; everything else is numeric.
obj_cols = [
    col for col in feature_cols
    if X_train[col].dtype == 'object' or X_train[col].dtype.name == 'category'
]
num_cols = [col for col in feature_cols if col not in obj_cols]

# The fitted preprocessor selects columns by name, so the test frame must
# provide every feature column used for training.
missing_in_test = [col for col in feature_cols if col not in X_test.columns]
assert not missing_in_test, f"test data is missing feature columns: {missing_in_test}"

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'one_hot_encoder',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    ),
])

In [None]:
# Route the numeric and categorical column groups through their own pipelines.
# Columns not listed in either group are not passed on to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipleine', cat_pipeline, obj_cols),
    ],
)

In [None]:
# End-to-end estimator: column preprocessing followed by a LightGBM classifier.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LGBMClassifier(
            # Many boosting rounds with a small step size.
            n_estimators=2500,
            learning_rate=0.025,

            # Tree complexity controls.
            num_leaves=48,
            min_child_samples=150,

            # Row / column subsampling. LightGBM only applies `subsample`
            # when `subsample_freq` is > 0, so bagging is enabled on every
            # iteration; without it the 0.75 row fraction is silently ignored.
            subsample=0.75,
            subsample_freq=1,
            colsample_bytree=0.75,

            # Up-weight the positive class; 7.34 is roughly the
            # negative:positive ratio of the labels (about 88% vs 12%).
            scale_pos_weight=7.34,

            # L1 / L2 regularisation.
            reg_alpha=0.5,
            reg_lambda=2.0,

            force_row_wise=True,
            n_jobs=-1,
            random_state=42,
        )),
    ]
)

In [None]:
# Fit the preprocessing steps and the LightGBM classifier on the training split.
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 35661, number of negative: 261704
[LightGBM] [Info] Total Bins 2596
[LightGBM] [Info] Number of data points in the train set: 297365, number of used features: 104
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119923 -> initscore=-1.993156
[LightGBM] [Info] Start training from score -1.993156


In [None]:
# Second column of predict_proba for the validation split.
# NOTE(review): this is the same expression that is assigned to y_val_prob in
# a later cell, and y_pred is not used again in the visible cells.
y_pred = model.predict_proba(X_val)[:,1]



In [None]:
# Second predict_proba column for the training and validation splits, used
# below to compare train vs. validation AUC.
y_train_prob, y_val_prob = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_val)
)



In [None]:
# ROC AUC on each split; the gap between the two indicates overfitting.
train_auc = roc_auc_score(y_true=y_train, y_score=y_train_prob)
val_auc = roc_auc_score(y_true=y_val, y_score=y_val_prob)


In [None]:
# Report both scores, one per line.
print(
    f"Train AUC: {train_auc}",
    f"Validation AUC: {val_auc}",
    sep="\n",
)

Train AUC: 0.98181363179625
Validation AUC: 0.9670149553759907


In [None]:
# Second column of predict_proba for every test row; these values are written
# to the submission as column 'y'.
y_test_prob = model.predict_proba(X_test)[:, 1]



In [None]:
# Submission frame: the test row identifier alongside the predicted
# probability, in columns 'id' and 'y'.
submission = test_df[['id']].assign(y=y_test_prob)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv("submission_6.csv", index=False)