# Code 5
- Numerical/Categorical EDA
- Advanced Numerical Imputation and Transformation
- Logistic Model

## Import Libraries

In [1]:
import pandas as pd

## Import Data

In [2]:
# Read the competition train and test sets in a single pass over both paths.
train, test = map(pd.read_csv, ('input/train.csv', 'input/test.csv'))

## Combine both datasets to check for missing values before imputation

In [3]:
# Keep both frames in one list. The list holds references to train and test,
# not a concatenated copy.
# NOTE(review): all_data is not referenced by any later cell visible here.
all_data = [train, test]

In [4]:
# Stack train on top of test and count the missing values in every column.
pd.concat(objs=(train, test), axis='index').isna().sum()

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        2
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

In [5]:
# Show column dtypes and non-null counts for the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
# Encode the target: 'Y' -> 1, everything else -> 0.
# An existing 1 is accepted as well so the cell is idempotent: re-running it on
# the already-encoded column keeps the labels instead of silently turning
# every row into 0 (which the plain `x == 'Y'` comparison would do).
train['Loan_Status'] = train['Loan_Status'].isin(['Y', 1]).astype('int64')

## 1/ Impute Numerical Variables
- Instead of preprocessing one feature at a time, we will do them in bulk

In [7]:
# Columns handled by the numeric pipeline (KNN imputation, then scaling).
numerical_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [8]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler 

### 1.1 Impute

In [9]:
# KNN-based imputer using 2 neighbours with uniform weighting.
# NOTE(review): it is fitted on the unscaled columns (scaling only happens in
# section 1.2), so neighbour distances are presumably dominated by the
# large-magnitude income columns - consider scaling before imputing.
num_impute = KNNImputer(n_neighbors=2, weights="uniform")

In [10]:
# Fit the imputer on the training rows only, so the test set does not
# influence the values used for filling.
num_impute.fit(train[numerical_features])

KNNImputer(n_neighbors=2)

In [11]:
# Fill missing numeric values in train; the numeric columns are overwritten
# with the imputer's output.
train[numerical_features]  = num_impute.transform(train[numerical_features])

In [12]:
# Fill missing numeric values in test with the imputer that was fitted on train.
test[numerical_features]  = num_impute.transform(test[numerical_features])

### 1.2 Feature Transformation

In [13]:
# Scaler for the numeric columns. MaxAbsScaler rescales each feature by its
# maximum absolute value (see the scikit-learn documentation).
num_scaler = MaxAbsScaler()

In [14]:
# Fit the scaler on train only. This learns per-column scale factors; it
# rescales the values but does not make their distribution normal.
num_scaler.fit(train[numerical_features])

MaxAbsScaler()

In [15]:
# Scale the numeric columns of train, overwriting them with the scaled values.
train[numerical_features]  = num_scaler.transform(train[numerical_features])

In [16]:
# Transform Test: apply the scaler fitted on train to the test set.
test[numerical_features]  = num_scaler.transform(test[numerical_features])

## 2/ Impute Categorical Variables
- Let's do the same for the categorical variables

In [17]:
# Columns handled by the categorical pipeline (mode imputation, then encoding).
categorical_features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

In [18]:
# Now lets use the proper methods of preprocessing
from sklearn.impute import SimpleImputer

### 2.1 Impute

In [19]:
# Imputer for the categorical columns: missing entries are replaced by each
# column's most frequent value.
cat_impute = SimpleImputer(strategy='most_frequent')

In [20]:
# Learn the fill value of every categorical column from the training rows only.
cat_impute.fit(train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [21]:
# Fill missing categorical values in train, overwriting the columns.
train[categorical_features]  = cat_impute.transform(train[categorical_features])

In [22]:
# Fill missing categorical values in test using the values learned from train.
test[categorical_features]  = cat_impute.transform(test[categorical_features])

### 2.2 Categorical Encoders

In [23]:
import category_encoders as ce

- encoder = ce.BackwardDifferenceEncoder(cols=[...])
- encoder = ce.BaseNEncoder(cols=[...])
- encoder = ce.BinaryEncoder(cols=[...])
- encoder = ce.CatBoostEncoder(cols=[...])
- encoder = ce.CountEncoder(cols=[...])
- encoder = ce.GLMMEncoder(cols=[...])
- encoder = ce.HashingEncoder(cols=[...])
- encoder = ce.HelmertEncoder(cols=[...])
- encoder = ce.JamesSteinEncoder(cols=[...])
- encoder = ce.LeaveOneOutEncoder(cols=[...])
- encoder = ce.MEstimateEncoder(cols=[...])
- encoder = ce.OneHotEncoder(cols=[...])
- encoder = ce.OrdinalEncoder(cols=[...])
- encoder = ce.SumEncoder(cols=[...])
- encoder = ce.PolynomialEncoder(cols=[...])
- encoder = ce.TargetEncoder(cols=[...])
- encoder = ce.WOEEncoder(cols=[...])

- encoder.fit(X, y)
- X_cleaned = encoder.transform(X_dirty)

In [24]:
# Target encoder from category_encoders. No `cols` argument is given; the
# fitted repr shows it selected all six categorical columns.
cat_encoder = ce.TargetEncoder()

In [25]:
# Fit on train only; the encoder is supervised, so Loan_Status is passed as y.
# NOTE(review): the encoding is learned from the same rows that are later used
# to report train accuracy, so that accuracy is likely optimistic.
cat_encoder.fit(train[categorical_features], train['Loan_Status'])

TargetEncoder(cols=['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area'])

https://www.kaggle.com/matleonard/categorical-encodings

In [27]:
# Encode the training categoricals into a separate frame first, so the result
# can be inspected before train is overwritten.
cat_encoded = cat_encoder.transform(train[categorical_features])

In [28]:
# Preview the first encoded rows.
cat_encoded.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,0.691235,0.629108,0.686111,0.708333,0.68797,0.658416
1,0.691235,0.718204,0.647059,0.708333,0.68797,0.614525
2,0.691235,0.718204,0.686111,0.708333,0.682927,0.658416
3,0.691235,0.718204,0.686111,0.61194,0.68797,0.658416
4,0.691235,0.629108,0.686111,0.708333,0.68797,0.658416


In [29]:
# Replace the categorical columns of train with their encoded values.
train[categorical_features] = cat_encoded

In [30]:
# Re-check dtypes and non-null counts of train after imputation and encoding.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    float64
 2   Married            614 non-null    float64
 3   Dependents         614 non-null    float64
 4   Education          614 non-null    float64
 5   Self_Employed      614 non-null    float64
 6   ApplicantIncome    614 non-null    float64
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    float64
 12  Loan_Status        614 non-null    int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 62.5+ KB


In [31]:
#train = train.drop(categorical_features, axis = 1)

In [32]:
# Preview the fully prepared training frame.
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0.691235,0.629108,0.686111,0.708333,0.68797,0.07221,0.0,0.201429,0.75,1.0,0.658416,1
1,LP001003,0.691235,0.718204,0.647059,0.708333,0.68797,0.05658,0.036192,0.182857,0.75,1.0,0.614525,0
2,LP001005,0.691235,0.718204,0.686111,0.708333,0.682927,0.037037,0.0,0.094286,0.75,1.0,0.658416,1
3,LP001006,0.691235,0.718204,0.686111,0.61194,0.68797,0.031889,0.056592,0.171429,0.75,1.0,0.658416,1
4,LP001008,0.691235,0.629108,0.686111,0.708333,0.68797,0.074074,0.0,0.201429,0.75,1.0,0.658416,1


### Now let's do the same for test

In [33]:
# Apply the encoder fitted on train to the test categoricals (transform only,
# no refit).
test_cat_encoded = cat_encoder.transform(test[categorical_features])

In [34]:
# Replace the categorical columns of test with their encoded values.
test[categorical_features] = test_cat_encoded

In [35]:
#test = test.drop(categorical_features, axis = 1)

## Now Re Check all of them

In [36]:
# Row-wise stack of both frames, then per-column count of remaining missing values.
pd.concat(objs=(train, test), axis='index').isna().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term       0
Credit_History         0
Property_Area          0
Loan_Status          367
dtype: int64

## Prepare Data for the Model

In [37]:
# Preview the prepared test frame to check its columns before modelling.
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,0.691235,0.718204,0.686111,0.708333,0.68797,0.070617,0.0,0.157143,0.75,1.0,0.658416
1,LP001022,0.691235,0.718204,0.647059,0.708333,0.68797,0.037975,0.036,0.18,0.75,1.0,0.658416
2,LP001031,0.691235,0.718204,0.752475,0.708333,0.68797,0.061728,0.0432,0.297143,0.75,1.0,0.658416
3,LP001035,0.691235,0.718204,0.752475,0.708333,0.68797,0.028889,0.061104,0.142857,0.75,1.0,0.658416
4,LP001051,0.691235,0.629108,0.686111,0.61194,0.68797,0.040444,0.0,0.111429,0.75,1.0,0.658416


In [38]:
# Target vector, then the feature matrix: every column except the target and
# the row identifier.
y = train['Loan_Status']
X = train.drop(columns=['Loan_Status', 'Loan_ID'])

### Create Dummy Variables for  Categorical Variables

In [39]:
# No need for this step
#X = pd.get_dummies(X, drop_first = True)

## Build Model

In [40]:
#Start by importing library
from sklearn.linear_model import LogisticRegression

# Initialize the classifier; max_iter sets the solver's iteration cap to 1000.
clf = LogisticRegression(max_iter = 1000)

In [41]:
# Fit on the full training set (no hold-out split is made in this notebook).
model = clf.fit(X, y)

## Check Accuracy of Model on Train Data

In [42]:
# Predict on the same rows the model was fitted on; this measures fit quality,
# not how well the model generalises.
y_pred = model.predict(X)

### We can use the accuracy function from sklearn.metrics

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
# Ground-truth labels for the training rows (the same column that was used as y).
y_true = train['Loan_Status']

In [45]:
# Accuracy on the training data itself, formatted as a percentage with one
# decimal. NOTE(review): no validation split is used, so treat this as an
# optimistic estimate.
print(f" Train Accuracy : {accuracy_score(y_true, y_pred):0.1%}")

 Train Accuracy : 79.6%


## Predict using Test X variables for Submitting to Competition

### Prepare X variables From test Data

In [46]:
# Test feature matrix: every column except the row identifier.
X_test = test.drop(columns=['Loan_ID'])

In [47]:
# Save the prepared test features to disk without the index column.
X_test.to_csv('X_test.csv', index = False)

In [48]:
# Convert to dummies
#X_test = pd.get_dummies(X_test)

In [49]:
# Predict labels for the test set. The model was trained on the 0/1 encoded
# Loan_Status, so these predictions are 0/1; they are converted to the Y/N
# format of the competition submission in a later cell.
# NOTE: despite its name, X_test_prep holds predictions, not features.
X_test_prep = model.predict(X_test)

## Create Submission File

In [52]:
# Load the sample submission to see the expected columns and label format.
sample_submission = pd.read_csv('input/sample_submission.csv')
sample_submission.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,N
1,LP001022,N
2,LP001031,N
3,LP001035,N
4,LP001051,N


In [54]:
# Pair every test Loan_ID with the label predicted for it.
submission = pd.DataFrame(data={'Loan_ID': test['Loan_ID'], 'Loan_Status': X_test_prep})

In [55]:
# Convert the 0/1 predictions to the 'Y'/'N' labels shown in sample_submission.
# The negative class must be the string 'N' (not the integer 0); otherwise the
# exported column mixes 'Y' strings with 0s. Accepting 'Y' as input as well
# keeps already-converted rows intact if the cell is run a second time.
submission['Loan_Status'] = submission['Loan_Status'].apply(lambda x: 'Y' if x in (1, 'Y') else 'N')

## Export Submission File

In [56]:
# Write the submission file without the index column.
submission.to_csv('O5_LR_Advance3_Impute.csv', index = False)

In [None]:
# LB Accuracy : TBD