In [213]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

1. Reading the data
2. Describe, Shape, Info
3. Count missing values-> remove missing values
4. Label Encoder
5. One Hot Encoding
6. Scaling
7. Splitting Dataset

In [178]:
loan_data= pd.read_csv("loan_small.csv")

In [179]:
#Access the dataframe via iloc
subset= loan_data.iloc[0:3, 1:3]
subset.head()

Unnamed: 0,Gender,ApplicantIncome
0,,5849.0
1,Male,4583.0
2,Male,3000.0


In [180]:
#Access the dataframe using column names
subsetN= loan_data[['Gender', 'ApplicantIncome']][0:3]
subsetN.head()

Unnamed: 0,Gender,ApplicantIncome
0,,5849.0
1,Male,4583.0
2,Male,3000.0


In [181]:
dataset= pd.read_csv('loan_small_tsv.txt', sep='\t')
dataset.head()

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,Y
4,LP001008,Male,,0.0,141.0,urban,Y


In [182]:
dataset.shape

(16, 7)

In [183]:
dataset.columns

Index(['Loan_ID', 'Gender', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Area', 'Loan_Status'],
      dtype='object')

In [184]:
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
count,14.0,15.0,13.0
mean,4103.571429,2509.333333,140.923077
std,2858.096481,3147.98209,86.032418
min,1299.0,0.0,17.0
25%,2520.75,350.0,109.0
50%,3118.0,1526.0,125.0
75%,4858.25,2672.0,158.0
max,12841.0,10968.0,349.0


In [185]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            16 non-null     object 
 1   Gender             14 non-null     object 
 2   ApplicantIncome    14 non-null     float64
 3   CoapplicantIncome  15 non-null     float64
 4   LoanAmount         13 non-null     float64
 5   Area               14 non-null     object 
 6   Loan_Status        16 non-null     object 
dtypes: float64(3), object(4)
memory usage: 1.0+ KB


In [186]:
dataset.isnull().sum(axis=0)  # No. of missing values by column

Loan_ID              0
Gender               2
ApplicantIncome      2
CoapplicantIncome    1
LoanAmount           3
Area                 2
Loan_Status          0
dtype: int64

In [187]:
#delete rows with missing values ->incase the number is less ->loss of data
#dataset.dropna()

#can delete based on null values in a specific column

#dataset.dropna(['Loan_Status'])

#mode for categorical variables
#mean/median for numerical variables

cols= ['Gender', 'Area', 'Loan_Status']
dataset[cols]= dataset[cols].fillna(dataset.mode().iloc[0])

In [188]:
cols= ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
dataset[cols]= dataset[cols].fillna(dataset.mean().iloc[0])

In [189]:
dataset.head()

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,Male,5849.0,0.0,4103.571429,urban,Y
1,LP001003,Male,4583.0,4103.571429,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,semi,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,Y
4,LP001008,Male,4103.571429,0.0,141.0,urban,Y


In [190]:
dt=dataset

In [191]:
#Label encodings for categorical data
dataset.dtypes

Loan_ID               object
Gender                object
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Area                  object
Loan_Status           object
dtype: object

In [192]:
cols= ['Gender', 'Area', 'Loan_Status']
dataset[cols]= dataset[cols].astype('category')

for col in cols:
    dataset[col]= dataset[col].cat.codes
    

In [193]:
#label encoding using sk learn
#label_encoder= LabelEncoder()
#for col in cols:
#   dt[col]= label_encoder.fit_transform(dt[col])
    

#dt.head()

In [194]:
dataset.head()

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,1,5849.0,0.0,4103.571429,2,1
1,LP001003,1,4583.0,4103.571429,128.0,1,0
2,LP001005,1,3000.0,0.0,66.0,1,1
3,LP001006,0,2583.0,2358.0,120.0,1,1
4,LP001008,1,4103.571429,0.0,141.0,2,1


In [195]:
#Hot encoding 
# since the categories are now in numbers, all propoerties of numbers are followed here i.e, 3>2>1, they can be added, subtracted, etc

# 1-> 1 0 0
# 2-> 0 1 0
# 3-> 0 0 1


In [196]:
#Delete a column

dataset= dataset.drop(['Loan_ID'], axis=1)

In [197]:
#one hot encoding
dataset= pd.get_dummies(dataset, columns= ['Gender', 'Area', 'Loan_Status'])

In [198]:
dataset.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_0,Gender_1,Area_0,Area_1,Area_2,Loan_Status_0,Loan_Status_1
0,5849.0,0.0,4103.571429,0,1,0,0,1,0,1
1,4583.0,4103.571429,128.0,0,1,0,1,0,1,0
2,3000.0,0.0,66.0,0,1,0,1,0,0,1
3,2583.0,2358.0,120.0,1,0,0,1,0,0,1
4,4103.571429,0.0,141.0,0,1,0,0,1,0,1


In [199]:
#one hot encoding using sklearn

#cols= ['Gender', 'Area', 'Loan_Status']
#dt.drop(["Loan_ID"])
#enc = OneHotEncoder(handle_unknown='ignore')
#enc.fit_transform(dt).toarray()


#dt.head()

Normalisation,
1. zscore = (X-mean)/std.deviation
2. minmax scaler, z= x-min/ (max-min)
3. 1/(1+exp(-x), exponential normalisation

In [200]:
scaler= StandardScaler()

In [201]:
dataset[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']]= scaler.fit_transform(dataset[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']])

In [203]:
dataset.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_0,Gender_1,Area_0,Area_1,Area_2,Loan_Status_0,Loan_Status_1
0,0.677507,-0.878488,2.079255,0,1,0,0,1,0,1
1,0.186095,0.503258,-0.488174,0,1,0,1,0,1,0
2,-0.428363,-0.878488,-0.528213,0,1,0,1,0,0,1
3,-0.590226,-0.084507,-0.49334,1,0,0,1,0,0,1
4,0.0,-0.878488,-0.479778,0,1,0,0,1,0,1


In [208]:
#minmax scaler
mm_scaler= MinMaxScaler()
dt[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']]= mm_scaler.fit_transform(dt[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']])

In [210]:
dataset.drop(['Loan_Status_0'], axis=1)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_0,Gender_1,Area_0,Area_1,Area_2,Loan_Status_1
0,0.677507,-0.878488,2.079255,0,1,0,0,1,1
1,0.186095,0.503258,-0.488174,0,1,0,1,0,0
2,-0.428363,-0.878488,-0.528213,0,1,0,1,0,1
3,-0.590226,-0.084507,-0.49334,1,0,0,1,0,1
4,0.0,-0.878488,-0.479778,0,1,0,0,1,1
5,0.509821,0.53438,-0.398407,0,1,0,1,0,1
6,-0.687266,-0.368023,2.079255,0,1,0,1,0,1
7,-0.414389,-0.035346,-0.4688,1,0,0,1,0,0
8,-0.037873,-0.364656,-0.462342,0,1,1,0,0,1
9,3.391525,2.814632,-0.345452,0,1,0,1,0,0


In [211]:
X= dataset.iloc[:, :-1]
Y= dataset.iloc[:, -1]

In [214]:
x_train, x_test, y_train, y_test= train_test_split(X, Y, test_size=0.2, random_state=67)