# Instructor Do: Dealing with Categorical Data in ML

In [2]:
# initial imports
import pandas as pd
from pathlib import Path

## Dataset Information

The file `loans_data.csv`, contains simulated data about loans, there are a total of 500 records. Each row represents a loan application along an arbitrary year, where every column represents the following data about every loan application.

* `amount`: The loan amount in USD.
* `term`: The loan term in months.
* `month`: The month of the year when the loan was requested.
* `age`: Age of the loan applicant.
* `education`: Educational level of the loan applicant.
* `gender`: Gender of the loan applicant.
* `bad`: Stands for a bad or good loan applicant (`1` - bad, `0` - good).

In [3]:
# Load data
file_path = Path("../Resources/loans_data.csv")
loans_df = pd.read_csv(file_path)
loans_df.head()

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,High School or Below,male,0
1,1000,30,July,50,Bachelor,female,0
2,1000,30,August,33,Bachelor,female,0
3,1000,15,September,27,college,male,0
4,1000,30,October,28,college,female,0


In [5]:
# Binary encoding using Pandas (multiple columns)
loans_binary_encoded = pd.get_dummies(loans_df, columns=["education", "gender"])
loans_binary_encoded.head()

Unnamed: 0,amount,term,month,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,June,45,0,0,1,0,0,0,1
1,1000,30,July,50,0,1,0,0,0,1,0
2,1000,30,August,33,0,1,0,0,0,1,0
3,1000,15,September,27,0,0,0,0,1,0,1
4,1000,30,October,28,0,0,0,0,1,1,0


In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = loans_binary_encoded.copy()
df2['month'] = le.fit_transform(df2['month']) 
df2.head()

Unnamed: 0,amount,term,month,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,6,45,0,0,1,0,0,0,1
1,1000,30,5,50,0,1,0,0,0,1,0
2,1000,30,1,33,0,1,0,0,0,1,0
3,1000,15,11,27,0,0,0,0,1,0,1
4,1000,30,10,28,0,0,0,0,1,1,0


In [17]:
df2.rename(columns={'month':'month_num'}, inplace=True)
df2.head()

Unnamed: 0,amount,term,month_num,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,6,45,0,0,1,0,0,0,1
1,1000,30,5,50,0,1,0,0,0,1,0
2,1000,30,1,33,0,1,0,0,0,1,0
3,1000,15,11,27,0,0,0,0,1,0,1
4,1000,30,10,28,0,0,0,0,1,1,0
