### **AdaBoost Classifier on HouseLoan Dataset**

**Step 1 : Import Necessary Libraries**

In [153]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import mean_squared_error, r2_score

**Step 2 : Load the Dataset**

In [154]:
df = pd.read_csv("E:\\Machine Learning\\Datasets\\loan.csv")

In [155]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [156]:
df.shape

(614, 13)

In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


**Step 3 : Data Preprocessing**

In [158]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [159]:
print(df['Gender'].mode())
df['Gender'] = df['Gender'].fillna('Male')

0    Male
Name: Gender, dtype: object


In [160]:
print(df['Married'].mode())
df['Married'] = df['Married'].fillna(df['Married'].mode())

0    Yes
Name: Married, dtype: object


In [161]:
print(df['LoanAmount'].median())
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

128.0


In [162]:
print(df['Loan_Amount_Term'].median())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(360.0)

360.0


In [163]:
print(df['Credit_History'].mode())
df['Credit_History'] = df['Credit_History'].fillna(1.0)

0    1.0
Name: Credit_History, dtype: float64


In [164]:
print(df['Dependents'].mode())
df['Dependents'] = df['Dependents'].fillna(0)

0    0
Name: Dependents, dtype: object


In [165]:
print(df['Self_Employed'].mode())
df['Self_Employed'] = df['Self_Employed'].fillna(0)

0    No
Name: Self_Employed, dtype: object


In [166]:
df['Married'].unique()

array(['No', 'Yes', nan], dtype=object)

In [167]:
mode_value = df['Married'].mode()[0] 
df['Married'].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Married'].fillna(mode_value, inplace=True)


In [168]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [169]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [170]:
df_cat = df.select_dtypes(include = ['object'])
df_cat.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [171]:
df['Dependents'].unique()

array(['0', '1', '2', '3+', 0], dtype=object)

In [172]:
df['Dependents'] = df['Dependents'].replace('3+', '3')

**Step 4 : Encoding the categorical columns**

In [173]:
categorical_columns = df.select_dtypes(include = ['object'])
print(categorical_columns.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [174]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['Loan_ID','Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

df[categorical_columns] = df[categorical_columns].astype(str)


encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = encoder.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(categorical_columns))

df = df.drop(columns=categorical_columns)
df_encoded = pd.concat([df, encoded_df], axis=1)

df_encoded.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Loan_ID_LP001003,Loan_ID_LP001005,Loan_ID_LP001006,...,Loan_ID_LP002983,Loan_ID_LP002984,Loan_ID_LP002990,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,0,5849,0.0,128.0,360.0,1.0,Y,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,4583,1508.0,128.0,360.0,1.0,N,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0,3000,0.0,66.0,360.0,1.0,Y,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0,2583,2358.0,120.0,360.0,1.0,Y,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0,6000,0.0,141.0,360.0,1.0,Y,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


**Step 5 : Split the data into Training and Testing sets**

In [175]:
print(df_encoded.columns)


Index(['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Loan_ID_LP001003',
       'Loan_ID_LP001005', 'Loan_ID_LP001006',
       ...
       'Loan_ID_LP002983', 'Loan_ID_LP002984', 'Loan_ID_LP002990',
       'Gender_Male', 'Married_Yes', 'Education_Not Graduate',
       'Self_Employed_No', 'Self_Employed_Yes', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object', length=627)


In [179]:
# Select features (X) and target (y)
X = df_encoded[['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_ID_LP001003',
       'Loan_ID_LP001005', 'Loan_ID_LP001006', 'Loan_ID_LP001008',
       'Loan_ID_LP002984', 'Loan_ID_LP002990', 'Gender_Male', 'Married_Yes',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes',
       'Property_Area_Semiurban', 'Property_Area_Urban']]

# Target variable (y)
y = df_encoded['Loan_Status']

**Step 6 : Train the Decision Tree Clssifier**

In [180]:
# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Create and train the DecisionTreeClassifier
clf = DecisionTreeClassifier( random_state=42)
clf.fit(X_train, y_train)

**Step 7 : Make Predictions And Evaluate the model**

In [181]:
# Step 4: Evaluate the model
accuracy = clf.score(X_train, y_train)

accuracy

1.0

**Step 8 : Performing AdaBoost Classifier**

In [182]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [198]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a DecisionTreeClassifier as the base estimator
estimator = DecisionTreeClassifier(max_depth=1)

# Create an AdaBoostClassifier with the custom base estimator
adaboost_clf = AdaBoostClassifier(estimator=estimator,  # Change here
                                  n_estimators=50,
                                  learning_rate=1.0,
                                  random_state=42)

# Now you can fit the AdaBoost model to your data
adaboost_clf.fit(X_train, y_train)  # Assuming X_train and y_train are defined




In [199]:
y_pred = adaboost_clf.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

In [200]:
accuracy

0.7783783783783784