In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

In [97]:
# Load and display data
data = pd.read_csv("house_loan.csv")
data.head()

Unnamed: 0,Gender,Age,Income (USD),Income Stability,Property Age,Property Location,Property Price,Loan Sanction Amount (USD)
0,F,19,1641.25,Low,1651.25,Rural,59641.82,21026.420753
1,M,29,1989.71,Low,1990.71,Urban,179858.51,60595.183366
2,F,37,1849.91,Low,1856.91,Rural,117297.62,39181.648002
3,M,65,2735.18,High,2747.18,Rural,354417.72,128497.710865
4,F,62,4741.78,High,4740.78,Urban,82049.8,39386.919336


In [98]:
# Preprocess data, determine feature x and label y
columns = data.columns.tolist()     
columns.remove('Loan Sanction Amount (USD)')                 
feature_data = data[columns]       
target_data = data['Loan Sanction Amount (USD)']

In [99]:
feature_data.head()

Unnamed: 0,Gender,Age,Income (USD),Income Stability,Property Age,Property Location,Property Price
0,F,19,1641.25,Low,1651.25,Rural,59641.82
1,M,29,1989.71,Low,1990.71,Urban,179858.51
2,F,37,1849.91,Low,1856.91,Rural,117297.62
3,M,65,2735.18,High,2747.18,Rural,354417.72
4,F,62,4741.78,High,4740.78,Urban,82049.8


In [100]:
feature_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47297 entries, 0 to 47296
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             47297 non-null  object 
 1   Age                47297 non-null  int64  
 2   Income (USD)       47265 non-null  float64
 3   Income Stability   47285 non-null  object 
 4   Property Age       47263 non-null  float64
 5   Property Location  47294 non-null  object 
 6   Property Price     47297 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 2.5+ MB


In [101]:
target_data.head()

0     21026.420753
1     60595.183366
2     39181.648002
3    128497.710865
4     39386.919336
Name: Loan Sanction Amount (USD), dtype: float64

In [102]:
# Impute missing values for numeric columns
numeric_columns = feature_data.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='mean')
feature_data[numeric_columns] = imputer.fit_transform(feature_data[numeric_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data[numeric_columns] = imputer.fit_transform(feature_data[numeric_columns])


In [103]:
# Fill missing values in categorical columns with a placeholder
categorical_columns = feature_data.select_dtypes(include=['object']).columns
feature_data[categorical_columns] = feature_data[categorical_columns].fillna('Missing')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data[categorical_columns] = feature_data[categorical_columns].fillna('Missing')


In [104]:
# Label encode the gender column
label_encoder = LabelEncoder()
feature_data['Gender'] = label_encoder.fit_transform(feature_data['Gender'])
feature_data['Income Stability'] = label_encoder.fit_transform(feature_data['Income Stability'])
feature_data['Property Location'] = label_encoder.fit_transform(feature_data['Property Location'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['Gender'] = label_encoder.fit_transform(feature_data['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['Income Stability'] = label_encoder.fit_transform(feature_data['Income Stability'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['Property Location

In [105]:
# Split the data into training and test sets
trainX,testX, trainY,testY = train_test_split(feature_data, target_data, train_size=0.70)
print('Training:' + str(trainX.shape)) 
print('Test:' + str(testX.shape))  

Training:(33107, 7)
Test:(14190, 7)


In [106]:
# Initialize and fit the pipeline
model = Pipeline([('scaler', StandardScaler()),
                  ('linear_regression', LinearRegression())])
model.fit(trainX, trainY)

In [107]:
# Observe the bias (theta_0) parameter and 7 coefficients corresponding to 7 features x after learning
linear_model = model.named_steps['linear_regression']
print("Model intercept:", linear_model.intercept_)
print("Model coefficients:", linear_model.coef_)

Model intercept: 46566.06604399724
Model coefficients: [ -128.27703942 -1269.77568481   736.60234455 -3147.78079009
   888.20912989   -53.00244046 31906.17375209]


In [108]:
# Observe 7 features x of the first 5 samples in the Test set
testX[:5]

Unnamed: 0,Gender,Age,Income (USD),Income Stability,Property Age,Property Location,Property Price
39160,1,59.0,1844.31,1,1858.31,3,113806.81
31450,0,63.0,2114.76,0,2127.76,3,114003.75
45256,0,28.0,3329.31,1,3337.31,1,266129.06
16938,0,37.0,1277.58,1,1278.58,1,80622.69
14888,0,44.0,4614.25,1,4620.25,1,262015.16


In [109]:
# Observe labels y of the first 5 samples in the Test set
actual_values_5 = testY[:5]

In [110]:
# Make prediction on the first 5 samples in the Test set
predicted_values_5 = model.predict(testX[:5])

In [111]:
# Create a dataframe for visualization
comparison_df_5 = pd.DataFrame({
    'Sample Index': actual_values_5.index,
    'Actual Loan Amount': actual_values_5.values,
    'Predicted Loan Amount': predicted_values_5
})

comparison_df_5

Unnamed: 0,Sample Index,Actual Loan Amount,Predicted Loan Amount
0,39160,35892.396054,35907.089069
1,31450,47362.434915,47245.514289
2,45256,91401.198548,91476.200037
3,16938,26273.779786,26283.370391
4,14888,90066.493005,90144.683331


In [112]:
# Make prediction on the whole Test set
predicted_values = model.predict(testX)
actual_values = testY

In [113]:
# Create a dataframe for visualization
comparison_df = pd.DataFrame({
    'Sample Index': actual_values.index,
    'Actual Loan Amount': actual_values.values,
    'Predicted Loan Amount': predicted_values
})

comparison_df

Unnamed: 0,Sample Index,Actual Loan Amount,Predicted Loan Amount
0,39160,35892.396054,35907.089069
1,31450,47362.434915,47245.514289
2,45256,91401.198548,91476.200037
3,16938,26273.779786,26283.370391
4,14888,90066.493005,90144.683331
...,...,...,...
14185,28279,68923.518171,68982.147770
14186,36740,71042.793859,71085.678713
14187,17832,15837.813325,15834.249241
14188,38873,23822.875572,23805.899917


In [114]:
# Calculate mean absolute error to observe the performance of the learned model based on the predictions and the labels
mean_absolute_error(y_pred=predicted_values, y_true=testY)

176.76774876663512