# **Recursive Feature Elimination (RFE)**

---



In [85]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [86]:
# Load the loan dataset and discard the 'loan_id' column in a single expression.
# Later cells address the remaining columns with a leading space in the name
# (e.g. ' education'), so the header is intentionally left as read.
df = pd.read_csv('loan_approval_dataset.csv').drop(columns='loan_id')
df.head(5)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [87]:

# One encoder instance is refit on each text column in turn, so once the loop
# finishes `le` only remembers the classes of the last column (' loan_status').
# Per the rows displayed before/after this cell: Graduate -> 0, Not Graduate -> 1;
# No -> 0, Yes -> 1; Approved -> 0, Rejected -> 1.
le = LabelEncoder()

for categorical_col in (' education', ' self_employed', ' loan_status'):
    df[categorical_col] = le.fit_transform(df[categorical_col])

df.head(5)


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1


In [88]:
# Keep an independent snapshot of the fully encoded frame.
# A plain `df1 = df` would only bind a second name to the same object, so the
# in-place column drop performed on `df` further down would silently remove
# those columns from `df1` as well. `.copy()` gives df1 its own data.
df1 = df.copy()

In [89]:
import plotly.express as px
import plotly.graph_objs as go
# Pairwise correlations between all columns of the encoded frame
# (every column is numeric at this point, the target included).
correlation_matrix = df.corr()

# Build the heatmap trace first, then wrap it in a figure; the same column
# labels are used for both axes because the matrix is square.
feature_labels = correlation_matrix.columns
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=feature_labels,
    y=feature_labels,
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(title='Heatmap of Feature Correlations', xaxis_nticks=36)

fig.show()

In [90]:
# Print column dtypes and non-null counts to check the frame after label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0    no_of_dependents          4269 non-null   int64
 1    education                 4269 non-null   int64
 2    self_employed             4269 non-null   int64
 3    income_annum              4269 non-null   int64
 4    loan_amount               4269 non-null   int64
 5    loan_term                 4269 non-null   int64
 6    cibil_score               4269 non-null   int64
 7    residential_assets_value  4269 non-null   int64
 8    commercial_assets_value   4269 non-null   int64
 9    luxury_assets_value       4269 non-null   int64
 10   bank_asset_value          4269 non-null   int64
 11   loan_status               4269 non-null   int64
dtypes: int64(12)
memory usage: 400.3 KB


In [91]:
# Display the encoded frame (the rendered output is truncated by pandas to the first and last rows).
df

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,0,1,1000000,2300000,12,317,2800000,500000,3300000,800000,1
4265,0,1,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000,0
4266,2,1,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000,1
4267,1,1,0,4100000,12800000,8,780,8200000,700000,14100000,5800000,0


In [92]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Separate the predictors from the encoded target column.
X = df.drop(columns=[' loan_status'])
y = df[' loan_status']

# Base estimator whose fitted coefficients RFE uses to rank the features.
# NOTE(review): the features are not scaled and span very different ranges
# (0/1 flags next to asset values in the millions), which likely makes the
# coefficient-based ranking scale-dependent -- consider standardising first.
model = LogisticRegression(max_iter=1000)

# Keep the 9 strongest features. The value was previously 10, which disagreed
# with both the accompanying comment and the recorded output of this cell
# (9 selected, 2 dropped out of 11 predictors).
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split made further down, so test rows influence the selection.
rfe = RFE(model, n_features_to_select=9)
rfe.fit(X, y)

# rfe.support_ is a boolean mask over X's columns: True = kept, False = eliminated.
selected_features = X.columns[rfe.support_].tolist()
dropped_features = X.columns[~rfe.support_].tolist()

print("Selected Features:")
print(selected_features)

print("\nDropped Features:")
print(dropped_features)


Selected Features:
[' no_of_dependents', ' self_employed', ' income_annum', ' loan_amount', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value']

Dropped Features:
[' education', ' loan_term']


In [93]:
# Remove the columns that RFE eliminated. The drop is still done in place, so
# every name bound to this same DataFrame object sees the change.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
df.drop(columns=dropped_features, inplace=True, errors='ignore')

# Rebuild the predictor matrix and target from the reduced frame.
X = df.drop(columns=[' loan_status'])
y = df[' loan_status']
X

Unnamed: 0,no_of_dependents,self_employed,income_annum,loan_amount,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,0,9600000,29900000,778,2400000,17600000,22700000,8000000
1,0,1,4100000,12200000,417,2700000,2200000,8800000,3300000
2,3,0,9100000,29700000,506,7100000,4500000,33300000,12800000
3,3,0,8200000,30700000,467,18200000,3300000,23300000,7900000
4,5,1,9800000,24200000,382,12400000,8200000,29400000,5000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,1000000,2300000,317,2800000,500000,3300000,800000
4265,0,1,3300000,11300000,559,4200000,2900000,11000000,1900000
4266,2,0,6500000,23900000,457,1200000,12400000,18100000,7300000
4267,1,0,4100000,12800000,780,8200000,700000,14100000,5800000


In [94]:
from sklearn.linear_model import LogisticRegression
#Import train test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Build and train the baseline logistic regression in one step
# (fit() returns the estimator itself).
# NOTE(review): the predictors are fed in unscaled; the low accuracy printed
# below may be a symptom of that -- worth confirming with a scaled run.
logreg = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)

# Score the held-out rows and keep the accuracy for later comparison.
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg_prev = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg_prev)

X_train


Logistic Regression Accuracy: 0.6057767369242779


Unnamed: 0,no_of_dependents,self_employed,income_annum,loan_amount,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
1023,4,1,1100000,2900000,554,2800000,1000000,4100000,1600000
728,1,0,3400000,8600000,312,6400000,6100000,8400000,3000000
133,4,1,6800000,13900000,479,11700000,9100000,25700000,9600000
2255,1,0,8900000,24000000,380,16500000,7400000,27700000,7500000
1044,3,1,2400000,4800000,895,6800000,300000,7300000,3300000
...,...,...,...,...,...,...,...,...,...
1033,3,0,600000,1200000,752,1100000,500000,2300000,700000
3264,5,1,800000,2100000,363,1200000,1000000,3100000,900000
1653,3,1,6400000,23500000,684,17200000,10300000,25300000,7400000
2607,4,0,5400000,12600000,786,300000,2500000,21000000,6600000


In [95]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the same training split (seeded for reproducibility);
# fit() returns the estimator, so construction and training are chained.
dt_classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and record the accuracy.
y_pred_dt = dt_classifier.predict(X_test)
accuracy_dt_prev = accuracy_score(y_test, y_pred_dt)
print("\nDecision Tree Accuracy:", accuracy_dt_prev)



Decision Tree Accuracy: 0.9149102263856362


In [96]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the same training split (seeded for reproducibility);
# fit() returns the estimator, so construction and training are chained.
rf_classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and record the accuracy.
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf_prev = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Accuracy:", accuracy_rf_prev)



Random Forest Accuracy: 0.955503512880562
