# __Problem Statement__
Identify the customers eligible for loan amounts.
# __Hypothesis__


# __Getting the system ready and loading the data (Prepare Data)__

## Import libraries

In [1]:
import pandas as pd
import skimpy as sk #for data profiling

### Collect data

In [16]:
df = pd.read_csv('./data-for-project-1/raw_data.csv') #read the raw data from the raw_data.csv file
df.head() #show all features with first few rows of data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Preprocessing and cleaning the data

### Removing irrelevant features

The Loan_ID feature will be irrelevant to our models, thus we can remove it.

In [17]:
irrelevant_features = 'Loan_ID' #feature to be removed
#removing of feature
df.drop(
    columns=irrelevant_features,
    inplace=True
)
#display data without Loan_ID
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Changing numeric features to categorical features

The Credit_History and Loan_Amount_Term features need to be converted to categorical features.

In [21]:
df['Credit_History'] = df['Credit_History'].astype(str) #converting credit history to categorical feature
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype(str) #converting loan term to categorical feature
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   614 non-null    object 
 9   Credit_History     614 non-null    object 
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(2), int64(1), object(9)
memory usage: 57.7+ KB


### Checking the cardinality of the features

In [22]:
# checking the cardinality of features
feature_cardinality = df.select_dtypes("object").nunique()
feature_cardinality

Gender               2
Married              2
Dependents           4
Education            2
Self_Employed        2
Loan_Amount_Term    11
Credit_History       3
Property_Area        3
Loan_Status          2
dtype: int64

There is no need to handle the cardinality of the features as no features have very low cardinality or very high cardinality.

### Understanding and profiling the data
Use the skimpy library to profile the data

In [23]:
sk.skim(df) #profile the dataframe of raw data

According to the skimpy summary above, we notice that there are 3 number features and 9 categorical features (which we will be encoding for better data understanding). The features LoanAmount, Loan-Amount_Term, Credit_History, Gender, Married, Dependents, and Self_Employed have missing values. Credit_History must also be changed to a string feature as it is categorical.

### Handling missing values

In [24]:
missing_num = ['LoanAmount']
for col in df.columns:
    if col in missing_num :
        df[col].fillna(
            df[col]
            .dropna()
            .mean(),
            inplace=True
        )
    else:
        df[col].fillna(
            df[col]
            .mode()[0],
            inplace=True
        )

sk.skim(df)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A

### Encoding categorical variables

In [25]:
from category_encoders import OneHotEncoder

ohe = OneHotEncoder(
    use_cat_names=True,
    cols=['Gender','Married','Dependents','Self_Employed','Education','Property_Area','Loan_Status','Credit_History','Loan_Amount_Term']
)

encoded_df = ohe.fit_transform(df)
encoded_df.head()

Unnamed: 0,Gender_Male,Gender_Female,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,...,Loan_Amount_Term_84.0,Loan_Amount_Term_12.0,Credit_History_1.0,Credit_History_0.0,Credit_History_<NA>,Property_Area_Urban,Property_Area_Rural,Property_Area_Semiurban,Loan_Status_Y,Loan_Status_N
0,1,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
1,1,0,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
2,1,0,0,1,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
3,1,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0
4,1,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0


In [26]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Male              614 non-null    int64  
 1   Gender_Female            614 non-null    int64  
 2   Married_No               614 non-null    int64  
 3   Married_Yes              614 non-null    int64  
 4   Dependents_0             614 non-null    int64  
 5   Dependents_1             614 non-null    int64  
 6   Dependents_2             614 non-null    int64  
 7   Dependents_3+            614 non-null    int64  
 8   Education_Graduate       614 non-null    int64  
 9   Education_Not Graduate   614 non-null    int64  
 10  Self_Employed_No         614 non-null    int64  
 11  Self_Employed_Yes        614 non-null    int64  
 12  ApplicantIncome          614 non-null    int64  
 13  CoapplicantIncome        614 non-null    float64
 14  LoanAmount               6

## Data exploration

In [27]:
# For Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sn

### Univariate analysis

In [28]:
# Prepare data to display
labels = (
    df['Loan_Status']
    .astype('str')
    .str.replace('0','No', regex=True)
    .str.replace('1','Yes', regex=True)
    .value_counts()
)

# Create figure using Plotly
fig = px.bar(
    data_frame=labels, 
    x=labels.index, 
    y=labels.values, 
    title=f'Class Imbalance', 
    color=labels.index
)

# Add titles & Display figure
fig.update_layout(xaxis_title='Label', yaxis_title='Number of Customers')
fig.show()

The BC Finance company would like to decrease the number of customers who are not eligible for a loan, which is depicted by the orange bar labeled N in the graph above.

### Bivariate/Multivariate analysis

#### Analysis of numeric features 

In [29]:
#Gaining insights on which features to use as numeric and categorical
df.select_dtypes('number').nunique()

ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           204
dtype: int64

In [30]:
# Select features to plot
plot_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

# Plot numeric features against target
plt.Figure(figsize=(3,4))
for col in plot_cols:
    fig = px.box(data_frame=df[plot_cols], x=col, color=df['Loan_Status'], title=f'BoxPlot for {col} Feature against the Target')
    fig.update_layout(xaxis_title=f'{col} Feature')
    fig.show()

##### Visualization of numeric features without outliers

In [102]:
mask_appincome = df['ApplicantIncome'] < 6000 #mask for filtering the ApplicantIncome feature
df_mask1 = df[mask_appincome] #filtered dataframe from the applicant income mask 
mask_coincome = df['CoapplicantIncome'] < 2300 #mask for filtering the CoapplicantIncome feature
df_mask2 = df[mask_coincome]
mask_loanamount = df['LoanAmount'] < 175 #mask for filtering the LoanAmount feature
df_mask3 = df[mask_loanamount]

plot_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

plt.Figure(figsize=(3,4))
for col in plot_cols:
    if col == 'ApplicantIncome':
        fig = px.box(data_frame=df[mask_appincome], x=col, color= df['LoanStatus'] < 10140
        title=f'BoxPlot for {col} Feature against the Target without outliers')
        fig.update_layout(xaxis_title=f'{col} Feature')
        fig.show()
    elif col == 'CoapplicantIncome':
        fig = px.box(data_frame=df[mask_coincome], x=col,
        title=f'BoxPlot for {col} Feature against the Target without outliers')
        fig.update_layout(xaxis_title=f'{col} Feature')
        fig.show()
    else:
        fig = px.box(data_frame=df[mask_loanamount], x=col,
        title=f'BoxPlot for {col} Feature against the Target without outliers')
        fig.update_layout(xaxis_title=f'{col} Feature')
        fig.show()

#### Analysis of categorical features

In [99]:
cat_col = ['Loan_Amount_Term','Credit_History','Gender','Married','Dependents','Education','Self_Employed','Property_Area']
for col in cat_col:
    # Aggregate Category Feature
    new_df = pd.DataFrame(
        df[[col, 'Loan_Status']]
        .groupby(['Loan_Status'])
        .value_counts()
        .reset_index()
        )
    
    # Plot Category feature vs label
    fig = px.bar(
        data_frame=new_df, 
        x=col, 
        y='count', 
        facet_col='Loan_Status', 
        color=new_df['Loan_Status'].astype(str), # convert it to string to avoid continuous scale on legend
        title=f'{col} vs Target'
    )

    fig.update_layout(xaxis_title=col, yaxis_title='Number of Customers')
    fig.show()

## Evaluation metrics for this classification problem

## Model building part 1

## Feature engineering

## Model building part 2