In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [24]:
df = pd.read_csv("RuralCreditDataCleaned.csv")

In [25]:
df.head(5)

Unnamed: 0,age,annual_income,monthly_expenses,young_dependents,home_ownership,occupants_count,house_area,sanitary_availability,water_availabity,loan_tenure,loan_installments,loan_amount
0,22,36000.0,5000.0,2,1.0,4,70.0,1.0,0.5,12,12,5000.0
1,24,48000.0,4000.0,2,1.0,4,50.0,1.0,0.5,12,12,5000.0
2,26,7000.0,5000.0,2,1.0,5,50.0,1.0,0.5,12,50,7500.0
3,23,36000.0,3500.0,0,1.0,1,112.0,1.0,0.5,12,12,5000.0
4,23,36000.0,3500.0,0,1.0,1,112.0,1.0,0.5,12,12,5000.0


In [26]:
df.shape

(37198, 12)

In [27]:
df.isna().sum()

age                      0
annual_income            0
monthly_expenses         0
young_dependents         0
home_ownership           0
occupants_count          0
house_area               0
sanitary_availability    0
water_availabity         0
loan_tenure              0
loan_installments        0
loan_amount              0
dtype: int64

In [28]:
df["home_ownership"] = df["home_ownership"].astype(int)
df["sanitary_availability"] = df["sanitary_availability"].astype(int)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37198 entries, 0 to 37197
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    37198 non-null  int64  
 1   annual_income          37198 non-null  float64
 2   monthly_expenses       37198 non-null  float64
 3   young_dependents       37198 non-null  int64  
 4   home_ownership         37198 non-null  int32  
 5   occupants_count        37198 non-null  int64  
 6   house_area             37198 non-null  float64
 7   sanitary_availability  37198 non-null  int32  
 8   water_availabity       37198 non-null  float64
 9   loan_tenure            37198 non-null  int64  
 10  loan_installments      37198 non-null  int64  
 11  loan_amount            37198 non-null  float64
dtypes: float64(5), int32(2), int64(5)
memory usage: 3.1 MB


In [30]:
values = df.values
scaler = MinMaxScaler()
values = scaler.fit_transform(values)

x = values[:, :-1]
y = values[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [31]:
def train_model(model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")

In [32]:
random_forest = RandomForestRegressor()
ada_boost = AdaBoostRegressor()
gradient_boost = GradientBoostingRegressor()
extra_trees = ExtraTreesRegressor()

models = [random_forest, ada_boost, gradient_boost, extra_trees]

In [35]:
for a in models:
    print(f"Training {a}")
    train_model(a)
    print("\n\n")

Training RandomForestRegressor()
Mean Squared Error: 0.0002384537413241867
R2 Score: 0.10190381000923132



Training AdaBoostRegressor()
Mean Squared Error: 0.00026515802377807585
R2 Score: 0.00132659031415705



Training GradientBoostingRegressor()
Mean Squared Error: 0.000240841674963201
R2 Score: 0.09291005679219189



Training ExtraTreesRegressor()
Mean Squared Error: 0.00024509542220488333
R2 Score: 0.07688902827016575



