In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb

### Solving encoding issues

In [2]:
# First let's solve weird encoding in files
! mv train.csv train_weird_encoding.csv
! mv test.csv test_weird_encoding.csv

In [3]:
# Solving
import csv

with open("train_weird_encoding.csv", "r", encoding='ISO-8859-1') as in_file:
    with open("train.csv", "w", newline='') as out_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        for idx, row in enumerate(reader):
            try:
                writer.writerow(row)
            except Exception as e:
                print("Error on row: ", idx+1)
                print(e)
                print("\n")

In [4]:
! wc -l train_weird_encoding.csv

87018 train_weird_encoding.csv


In [5]:
! wc -l train.csv

87018 train.csv


In [6]:
# Solving
import csv

with open("test_weird_encoding.csv", "r", encoding='ISO-8859-1') as in_file:
    with open("test.csv", "w", newline='') as out_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        for idx, row in enumerate(reader):
            try:
                writer.writerow(row)
            except Exception as e:
                print("Error on row: ", idx+1)
                print(e)
                print("\n")

In [7]:
! wc -l test_weird_encoding.csv

37718 test_weird_encoding.csv


In [8]:
! wc -l test.csv

37718 test.csv


### Get the data

In [2]:
def get_data():
    train = pd.read_csv("./train.csv")
    test = pd.read_csv("./test.csv")
    return train, test

In [3]:
train_df, test_df = get_data()
train_df.shape, test_df.shape

((87017, 26), (37717, 24))

In [4]:
set(train_df.columns) - set(test_df.columns)

{'Disbursed', 'LoggedIn'}

In [5]:
print("Percentage of nulls in train dataset")
100 * train_df.loc[:,train_df.isnull().sum() > 0].isnull().sum().sort_values(ascending=False) / train_df.shape[0]

Percentage of nulls in train dataset


Processing_Fee           68.488916
Interest_Rate            68.137261
EMI_Loan_Submitted       68.137261
Loan_Amount_Submitted    39.773837
Loan_Tenure_Submitted    39.773837
Salary_Account           13.519197
City                      1.152648
Loan_Amount_Applied       0.081593
Loan_Tenure_Applied       0.081593
Existing_EMI              0.081593
Employer_Name             0.081593
dtype: float64

In [6]:
print("Percentage of nulls in test dataset")
100 * test_df.loc[:,test_df.isnull().sum() > 0].isnull().sum().sort_values(ascending=False) / test_df.shape[0]

Percentage of nulls in test dataset


Processing_Fee           68.260996
Interest_Rate            67.892462
EMI_Loan_Submitted       67.892462
Loan_Amount_Submitted    39.563062
Loan_Tenure_Submitted    39.563062
Salary_Account           13.354721
City                      1.055227
Employer_Name             0.111356
Loan_Amount_Applied       0.106053
Loan_Tenure_Applied       0.106053
Existing_EMI              0.106053
dtype: float64

### Nulls exist, let's keep it that way to test missing values property of xgboost and lightgbm

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87017 entries, 0 to 87016
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     87017 non-null  object 
 1   Gender                 87017 non-null  object 
 2   City                   86014 non-null  object 
 3   Monthly_Income         87017 non-null  int64  
 4   DOB                    87017 non-null  object 
 5   Lead_Creation_Date     87017 non-null  object 
 6   Loan_Amount_Applied    86946 non-null  float64
 7   Loan_Tenure_Applied    86946 non-null  float64
 8   Existing_EMI           86946 non-null  float64
 9   Employer_Name          86946 non-null  object 
 10  Salary_Account         75253 non-null  object 
 11  Mobile_Verified        87017 non-null  object 
 12  Var5                   87017 non-null  int64  
 13  Var1                   87017 non-null  object 
 14  Loan_Amount_Submitted  52407 non-null  float64
 15  Lo

In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37717 entries, 0 to 37716
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     37717 non-null  object 
 1   Gender                 37717 non-null  object 
 2   City                   37319 non-null  object 
 3   Monthly_Income         37717 non-null  int64  
 4   DOB                    37717 non-null  object 
 5   Lead_Creation_Date     37717 non-null  object 
 6   Loan_Amount_Applied    37677 non-null  float64
 7   Loan_Tenure_Applied    37677 non-null  float64
 8   Existing_EMI           37677 non-null  float64
 9   Employer_Name          37675 non-null  object 
 10  Salary_Account         32680 non-null  object 
 11  Mobile_Verified        37717 non-null  object 
 12  Var5                   37717 non-null  int64  
 13  Var1                   37717 non-null  object 
 14  Loan_Amount_Submitted  22795 non-null  float64
 15  Lo

We  will only drop the features that are not present in test dataset

In [9]:
train_df = train_df.drop(["Disbursed","LoggedIn","ID"], axis=1)
test_df = test_df.drop(["ID"], axis=1)

We will also create our predicted variable and a suppor feature `"Annual_Income"`, `Earns_500k_plus`

In [10]:
train_df["Annual_Income"] = train_df["Monthly_Income"] * 12

In [11]:
train_df["Earns_500k_plus"] = (train_df["Annual_Income"] > 500000).astype(bool)

### Then we proceed to drop features that are proxies of the predicted variable in order to prevent data leakege

In [13]:
train_df = train_df.drop(["Monthly_Income","Annual_Income"], axis=1)
test_df = test_df.drop(["Monthly_Income"], axis=1)

train_df.shape, test_df.shape

((87017, 23), (37717, 22))

### THere is some data transformation required
- LGBM accept 4 types of variables: int, float, bool, category
- Convert datetime features to year, month, day integer variables
- Convert "object" columns to 'category'

In [14]:
# Handle datetime vlaues
train_df["DOB"] = pd.to_datetime(train_df["DOB"],format="%d-%b-%y")
test_df["DOB"] = pd.to_datetime(test_df["DOB"],format="%d-%b-%y")

train_df["Lead_Creation_Date"] = pd.to_datetime(train_df["Lead_Creation_Date"],format="%d-%b-%y")
test_df["Lead_Creation_Date"] = pd.to_datetime(test_df["Lead_Creation_Date"],format="%d-%b-%y")

In [15]:
def create_year_month_day_columns(dataframe, features):
    df = dataframe.copy()
    for feature in features:
        df[feature + "_day"] = df[feature].dt.day
        df[feature + "_month"] = df[feature].dt.month
        df[feature + "_year"] = df[feature].dt.year
    
    return df

In [16]:
train_df = create_year_month_day_columns(train_df,["DOB","Lead_Creation_Date"])
test_df = create_year_month_day_columns(test_df,["DOB","Lead_Creation_Date"])

train_df = train_df.drop(["DOB","Lead_Creation_Date"],axis=1)
test_df = test_df.drop(["DOB","Lead_Creation_Date"],axis=1)

In [17]:
# Convert categorical variables to "category" for LightGBM run properly
numerics = ["float64","int64","int32","bool"]
categorical_features = train_df.select_dtypes(exclude=numerics).columns.tolist()

train_df[categorical_features] = train_df[categorical_features].astype('category')
test_df[categorical_features] = test_df[categorical_features].astype('category')

In [18]:
# Let's start simple with a train_test_split of 30% validation data
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df.drop("Earns_500k_plus", axis=1),
                                                      train_df["Earns_500k_plus"], 
                                                      test_size=0.2)

In [20]:
import logging
import time

logger = logging.getLogger(__name__)

def log_execution_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.time() 
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"{func.__name__} took {execution_time:.6f} seconds to execute.")
        return result
    return wrapper

In [72]:
# Let's build our LightGBM model using default parameters

train_data = lgb.Dataset(X_train,
                         label=y_train,
                         feature_name=X_train.columns.tolist(), 
                         categorical_feature= categorical_features)
validation_data = lgb.Dataset(X_valid,
                              label= y_valid,
                              feature_name=X_valid.columns.tolist(), 
                              categorical_feature= categorical_features)

@log_execution_time
def train_lgb(train_data, validation_data):
    
    lgb_params = {
        "objective" : "binary",
        "metric": "auc"
    }

    num_rounds = 100
    
    return lgb.train(params = lgb_params, 
                     train_set = train_data, 
                     num_boost_round = num_rounds,
                     valid_sets = [validation_data],categorical_feature)

SyntaxError: positional argument follows keyword argument (783944461.py, line 25)

In [77]:
# train_lgb(train_data, validation_data)

In [25]:
X_train.select_dtypes(include="object")

Unnamed: 0,Gender,City,DOB,Lead_Creation_Date,Employer_Name,Salary_Account,Mobile_Verified,Var1,Filled_Form,Device_Type,Var2,Source
71199,Female,,02-Jan-70,18-Jul-15,COGNIZANT TECHNOLOGIES SERVICES PVT LTD,,N,HBXX,N,Web-browser,G,S122
27737,Female,Mumbai,17-Sep-85,01-Jun-15,CAPITA INDIA PVT LTD,Citibank,N,HBXX,N,Web-browser,B,S133
56429,Male,Ghaziabad,08-Jul-84,03-Jul-15,DELHI DISTRICT COURT,Syndicate Bank,Y,HBXA,N,Web-browser,G,S122
21472,Male,Delhi,28-Jul-82,25-May-15,AMERICAN EXPRESS INDIA LTD,ICICI Bank,Y,HBXX,N,Web-browser,B,S143
61034,Male,Baramati,16-Dec-86,07-Jul-15,KAAS FOOTWEAR PVT LTD.,ICICI Bank,Y,HBXX,N,Web-browser,G,S122
...,...,...,...,...,...,...,...,...,...,...,...,...
73283,Female,Bengaluru,23-Jun-87,20-Jul-15,MERIDIUM SERVICES AND LABS PVT LTD,Axis Bank,N,HBXX,N,Web-browser,G,S122
36970,Male,Patna,19-Dec-78,12-Jun-15,RELIANCE POWER LTD,HDFC Bank,Y,HBXD,Y,Mobile,C,S122
1444,Male,Hyderabad,03-Jun-83,03-May-15,FIITJEE HYDERABAD CLASSES LTD,Kotak Bank,Y,HBXH,N,Web-browser,B,S133
83544,Male,Bengaluru,19-Sep-94,29-Jul-15,DHL EXPRESS INDIA PVT LTD,ICICI Bank,Y,HAXA,Y,Mobile,G,S122


In [68]:
model = lgb.LGBMClassifier(objective="binary",eval_metric='logloss')

In [70]:
model.fit(X_train, y_train,
          feature_name = list(X_train.columns),
          categorical_feature = list(X_train.select_dtypes(include="category").columns))





In [27]:
model.score(X_valid, y_valid)

0.9470811307745346

In [66]:
pd.Series(model.feature_importances_,index= X_train.columns).sort_values(ascending=False)

Employer_Name               599
City                        442
Var5                        396
Existing_EMI                237
Loan_Amount_Submitted       234
DOB_year                    162
Loan_Amount_Applied         105
Var1                         96
Lead_Creation_Date_day       94
Interest_Rate                75
Salary_Account               74
Var4                         71
Lead_Creation_Date_month     66
EMI_Loan_Submitted           54
Source                       47
Loan_Tenure_Applied          47
Loan_Tenure_Submitted        43
Device_Type                  41
Processing_Fee               33
Var2                         32
DOB_day                      19
Mobile_Verified              16
DOB_month                    14
Gender                        2
Filled_Form                   1
Lead_Creation_Date_year       0
dtype: int32

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [46]:
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
categ_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [47]:
categorical_features = X_train.select_dtypes(include="category").columns.tolist()
non_categorical_features = list(set(X_train.columns) - set(categorical_features))

In [51]:
numeric_imputer.fit(X_train[non_categorical_features])
categ_imputer.fit(X_train[categorical_features])

X_rf_num_train = pd.DataFrame(numeric_imputer.transform(X_train[non_categorical_features]), index= X_train[non_categorical_features].index, columns=X_train[non_categorical_features].columns)
X_rf_ctg_train = pd.DataFrame(categ_imputer.transform(X_train[categorical_features]), index= X_train[categorical_features].index, columns=X_train[categorical_features].columns)

X_rf_num_valid = pd.DataFrame(numeric_imputer.transform(X_valid[non_categorical_features]), index= X_valid[non_categorical_features].index, columns=X_valid[non_categorical_features].columns)
X_rf_ctg_valid = pd.DataFrame(categ_imputer.transform(X_valid[categorical_features]), index= X_valid[categorical_features].index, columns=X_valid[categorical_features].columns)

X_rf_train = pd.concat([X_rf_num_train, X_rf_ctg_train],axis=1)
X_rf_valid = pd.concat([X_rf_num_valid, X_rf_ctg_valid],axis=1)

X_rf_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69613 entries, 41925 to 67759
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   EMI_Loan_Submitted        69613 non-null  float64
 1   DOB_month                 69613 non-null  float64
 2   Var4                      69613 non-null  float64
 3   Var5                      69613 non-null  float64
 4   Loan_Tenure_Submitted     69613 non-null  float64
 5   Existing_EMI              69613 non-null  float64
 6   Lead_Creation_Date_month  69613 non-null  float64
 7   Lead_Creation_Date_day    69613 non-null  float64
 8   Loan_Tenure_Applied       69613 non-null  float64
 9   Processing_Fee            69613 non-null  float64
 10  Lead_Creation_Date_year   69613 non-null  float64
 11  DOB_year                  69613 non-null  float64
 12  Loan_Amount_Submitted     69613 non-null  float64
 13  Loan_Amount_Applied       69613 non-null  float64
 14  DO

In [52]:
rf_categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=np.nan)

rf_categorical_encoder.fit(X_rf_train[categorical_features])

X_rf_train[categorical_features] = rf_categorical_encoder.transform(X_rf_train[categorical_features]).astype(int)
X_rf_valid[categorical_features] = rf_categorical_encoder.transform(X_rf_valid[categorical_features]).astype(int)

In [53]:
X_rf_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69613 entries, 41925 to 67759
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   EMI_Loan_Submitted        69613 non-null  float64
 1   DOB_month                 69613 non-null  float64
 2   Var4                      69613 non-null  float64
 3   Var5                      69613 non-null  float64
 4   Loan_Tenure_Submitted     69613 non-null  float64
 5   Existing_EMI              69613 non-null  float64
 6   Lead_Creation_Date_month  69613 non-null  float64
 7   Lead_Creation_Date_day    69613 non-null  float64
 8   Loan_Tenure_Applied       69613 non-null  float64
 9   Processing_Fee            69613 non-null  float64
 10  Lead_Creation_Date_year   69613 non-null  float64
 11  DOB_year                  69613 non-null  float64
 12  Loan_Amount_Submitted     69613 non-null  float64
 13  Loan_Amount_Applied       69613 non-null  float64
 14  DO

In [61]:
random_forest = RandomForestClassifier(criterion="log_loss", n_jobs=3)

In [62]:
random_forest.fit(X_rf_train, y_train)

In [60]:
random_forest.score(X_rf_valid, y_valid)

0.9433463571592737

In [65]:
pd.Series(random_forest.feature_importances_,index= X_rf_train.columns).sort_values(ascending=False)

Var5                        0.254189
Existing_EMI                0.127427
Loan_Amount_Submitted       0.115413
DOB_year                    0.047795
Loan_Amount_Applied         0.043365
Lead_Creation_Date_day      0.041984
Employer_Name               0.039184
EMI_Loan_Submitted          0.037501
City                        0.036247
Interest_Rate               0.033072
DOB_day                     0.032888
Var4                        0.027889
DOB_month                   0.025425
Salary_Account              0.018811
Var1                        0.017999
Device_Type                 0.015664
Loan_Tenure_Applied         0.015312
Processing_Fee              0.012819
Source                      0.011779
Lead_Creation_Date_month    0.009999
Loan_Tenure_Submitted       0.009906
Var2                        0.009230
Mobile_Verified             0.007865
Gender                      0.006073
Filled_Form                 0.002162
Lead_Creation_Date_year     0.000000
dtype: float64

### XGBoost

In [73]:
import xgboost as xgb
from sklearn.metrics import log_loss

In [81]:
xgb_model = xgb.XGBClassifier(n_estimators=100, 
                              objective='binary:logistic',
                              eval_metric=log_loss,
                              enable_categorical=True)

In [82]:
xgb_model.fit(X_train,y_train)

ValueError: Experimental support for categorical data is not implemented for current tree method yet.