In [None]:
#Import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1 - Loan Dataset

In [None]:
# Load the raw loan records from loan.csv.
# low_memory=False is set because one of the columns has mixed types, which
# otherwise triggers a DtypeWarning while parsing.
loanData = pd.read_csv(filepath_or_buffer="loan.csv", low_memory=False)


In [None]:

# Preview the first five rows to get a feel for the columns and their values.
loanData.head(n=5)

In [None]:
# Print a summary of the raw DataFrame: column names, non-null counts and dtypes.
loanData.info()

In [None]:
# List the columns that contain nothing but null values, so they can be
# discarded before the analysis.
loanData.isna().all().loc[lambda is_all_null: is_all_null].index.tolist()

### 2 - Data Handling and Cleaning

#### Data Quality Issues 
- Column 47 appears to contain values of mixed dtypes.
- At least 24 columns appear to be of object type.
- Multiple columns appear to contain only null values.


#### Treating Missing Values

In [None]:
# Drop every column whose values are all null; such columns carry no information.
# The result is rebound to loanData instead of using inplace=True, so the step
# does not mutate the existing frame.
loanData = loanData.dropna(axis=1, how='all')

In [None]:
# Count the missing values that remain in each column.
loanData.isnull().sum()

- The 'desc' column contains many null values and does not seem useful, since we already have a separate 'Purpose' column. Similarly, 'title' is not useful.
- The 'funded_amnt_inv' column is a duplicate.


In [None]:
# Drop the columns that are not needed for the analysis:
#   - 'desc': mostly null
#   - 'title': judged not useful
#   - 'funded_amnt_inv': considered a duplicate
# The result is rebound to loanData rather than mutated with inplace=True.
loanData = loanData.drop(columns=['funded_amnt_inv', 'desc', 'title'])
# Preview only the first rows instead of rendering the entire frame.
loanData.head()

- Employer Title replaces Employer Name for all loans listed after 9/23/2013. Hence, we will fill the missing values of this column based on this information.

In [None]:
# Remove the rows where both 'emp_title' and 'emp_length' are missing.
# .copy() makes the filtered result an independent frame, so the column
# assignments done on loanData afterwards do not raise SettingWithCopyWarning
# (and are guaranteed to modify this frame rather than a temporary slice).
both_missing = loanData['emp_title'].isnull() & loanData['emp_length'].isnull()
loanData = loanData[~both_missing].copy()

In [None]:
# Label the remaining missing employer titles with the placeholder "Unnamed".
loanData['emp_title'] = loanData['emp_title'].fillna("Unnamed")

- Since 'mths_since_last_delinq' and 'delinq_2yrs' are related, we fill the missing (n/a) values of 'mths_since_last_delinq' with 0.

In [None]:
# Replace missing 'mths_since_last_delinq' values with 0.
# NOTE(review): 0 can also be read as "delinquent this month"; confirm that
# downstream analysis does not confuse these imputed zeros with genuine ones.
loanData['mths_since_last_delinq'] = loanData['mths_since_last_delinq'].fillna(0)

In [None]:
# Re-count the missing values per column after the fills above.
loanData.isnull().sum()

In [None]:
# Frequency of each last-payment date.
loanData['last_pymnt_d'].value_counts()

In [None]:
# Frequency of each next-payment date.
loanData['next_pymnt_d'].value_counts()

In [None]:
# Inspect the rows where both 'next_pymnt_d' and 'last_pymnt_d' are missing,
# showing only the date and term columns.
loanData.loc[
    loanData['next_pymnt_d'].isna() & loanData['last_pymnt_d'].isna(),
    ['issue_d', 'last_credit_pull_d', 'last_pymnt_d', 'term'],
]

In [None]:
# The missing last_pymnt_d values could be calculated from the loan term length.

In [None]:
# For loans that have a next payment date, count each
# (last_pymnt_d, next_pymnt_d) combination.
loanData.loc[
    loanData['next_pymnt_d'].notna(),
    ['last_pymnt_d', 'next_pymnt_d'],
].value_counts()

In [None]:
# Count the records whose 'id' repeats an earlier row (0 means no duplicates).
loanData['id'].duplicated().sum()


In [None]:
# Summarise the DataFrame again (columns, non-null counts, dtypes) now that the
# missing-value handling above has been applied.
loanData.info()

In [None]:
# Distribution of the raw loan term values.
loanData['term'].value_counts()

In [None]:
# Convert 'term' to a float by extracting its first run of digits.
# Compared with splitting on a space and taking the second piece, this:
#   - does not depend on the value starting with a space,
#   - leaves missing values as NaN instead of raising,
#   - can be re-run safely once the column is already numeric.
loanData['term'] = (
    loanData['term']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype('float')
)
loanData['term'].head()

In [None]:
# Look at the raw interest-rate values to see how they are formatted.
loanData['int_rate'].value_counts()

In [None]:
# Strip the "%" sign from 'int_rate' and convert the column to float.
# Casting to str first and using the vectorised .str accessor means missing
# values stay NaN instead of raising, and the cell can be re-run safely once the
# column is already numeric.
loanData['int_rate'] = (
    loanData['int_rate']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype('float')
)
loanData['int_rate'].head()

In [None]:
# Frequency of each 'last_credit_pull_d' value.
loanData['last_credit_pull_d'].value_counts()

In [None]:
# Show the rows that have no 'last_credit_pull_d' value.
loanData.loc[loanData['last_credit_pull_d'].isnull()]

In [None]:
# Show the rows that have no 'collections_12_mths_ex_med' value.
loanData.loc[loanData['collections_12_mths_ex_med'].isnull()]

In [None]:
# Box plot of the interest rate, to see its spread and spot outliers.
# The explicit Axes plus title/labels let the figure stand on its own.
fig, ax = plt.subplots()
ax.boxplot(loanData['int_rate'])
ax.set(title='Interest rate distribution', ylabel='int_rate (%)')
ax.set_xticklabels(['int_rate'])
plt.show()

In [None]:
# Loan status of the loans with an interest rate above 23.
# NOTE(review): 23 is presumably the outlier cut-off read from the box plot - confirm.
loanData.loc[loanData['int_rate'] > 23, ['loan_status']]

In [None]:
# Box plot of the loan amount, to check for outliers.
# The explicit Axes plus title/labels let the figure stand on its own.
fig, ax = plt.subplots()
ax.boxplot(loanData['loan_amnt'])
ax.set(title='Loan amount distribution', ylabel='loan_amnt')
ax.set_xticklabels(['loan_amnt'])
plt.show()

In [None]:
# Show the loans of 31000 or more.
# NOTE(review): 31000 is presumably the outlier cut-off read from the box plot - confirm.
loanData.loc[loanData['loan_amnt'] >= 31000]

In [None]:
# Histogram of the loan amount in 7 bins.
# Title and axis labels are added so the figure stands on its own.
fig, ax = plt.subplots()
ax.hist(loanData['loan_amnt'], bins=7)
ax.set(title='Loan amount histogram', xlabel='loan_amnt', ylabel='Number of loans')
plt.show()

In [None]:
# Pie chart of the share of loans in each loan_status category.
# Percentages and a title are added, and the default y-label is cleared, so the
# chart can be read without the surrounding cells.
fig, ax = plt.subplots()
loanData['loan_status'].value_counts().plot.pie(ax=ax, autopct='%1.1f%%')
ax.set(title='Share of loans by loan_status', ylabel='')
plt.show()