In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [4]:
# Load dataset
df = pd.read_csv('data/BostonHousing_ML_ready.csv')
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Quick check
# A notebook cell only renders its LAST bare expression, so a bare
# `df.head()` in the middle of the cell is silently discarded. Pass it to
# display() explicitly; df.info() prints by itself, and df.describe() is
# rendered because it is the final expression.
# NOTE(review): in the captured summary CHAS and CAT. MEDV have mean 0 and
# std 0, i.e. they look constant in this file - confirm this is expected.
display(df.head())
df.info()
df.describe()


Dataset shape: (506, 14)
Columns: ['ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'PTRATIO', 'MEDV', 'CAT. MEDV', 'CRIM_log', 'TAX_log', 'LSTAT_log']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ZN         506 non-null    float64
 1   INDUS      506 non-null    float64
 2   CHAS       506 non-null    int64  
 3   NOX        506 non-null    float64
 4   RM         506 non-null    float64
 5   AGE        506 non-null    float64
 6   DIS        506 non-null    float64
 7   RAD        506 non-null    int64  
 8   PTRATIO    506 non-null    float64
 9   MEDV       506 non-null    float64
 10  CAT. MEDV  506 non-null    int64  
 11  CRIM_log   506 non-null    float64
 12  TAX_log    506 non-null    float64
 13  LSTAT_log  506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


Unnamed: 0,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,PTRATIO,MEDV,CAT. MEDV,CRIM_log,TAX_log,LSTAT_log
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,6.963439,11.136779,0.0,0.554695,6.275312,68.574901,3.783947,9.549407,18.463834,21.877075,0.0,0.740685,5.934259,2.474435
std,12.028788,6.860353,0.0,0.115878,0.630242,28.148861,2.069765,8.707259,2.143924,7.602976,0.0,0.864774,0.395337,0.536674
min,0.0,0.46,0.0,0.385,4.7785,2.9,1.1296,1.0,13.2,5.0625,0.0,0.0063,5.236442,1.004302
25%,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,17.4,17.025,0.0,0.078853,5.63479,2.073162
50%,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,19.05,21.2,0.0,0.228336,5.802118,2.514464
75%,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,20.2,25.0,0.0,1.542674,6.50279,2.887869
max,31.25,27.74,0.0,0.871,7.7305,100.0,9.8208,24.0,22.0,36.9625,0.0,2.309525,6.568078,3.495371


In [5]:
# Build the model inputs (X) and the regression target (y).
# MEDV is the target, and CAT. MEDV is also left out of the inputs, so
# neither appears in the feature list below.
# NOTE(review): CHAS shows zero variance in the describe() output above, so it
# presumably adds no signal; it is kept so the model's input schema is unchanged.
feature_columns = [
    'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'PTRATIO', 'CRIM_log', 'TAX_log', 'LSTAT_log',
]
X = df[feature_columns]
y = df['MEDV']

In [6]:
#Split data
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Train the model using RandomForest, which gave the best accuracy result
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fixed hyperparameters for the final forest. random_state pins the fitted
# model so reruns give the same scores; n_jobs=-1 lets scikit-learn use all
# available cores.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
    'n_jobs': -1,
}
best_rf = RandomForestRegressor(**rf_params)
best_rf.fit(X_train, y_train)

# Optional: score the fitted model on the held-out rows
y_pred = best_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("RMSE:", rmse)
print("MAE :", mae)
print("R2  :", r2)

RMSE: 2.343957260175408
MAE : 1.7377957750003337
R2  : 0.8877027848700862


In [8]:
#save the new model (same path your Flask app uses)
from joblib import dump
import os
# Keep the output location in one place so the directory that is created, the
# file that is written and the path that is reported can never drift apart.
model_dir = 'model'
model_path = f'{model_dir}/RandomForest_BostonHousing.joblib'

# exist_ok=True makes the cell safe to re-run when the folder already exists.
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted forest.
# NOTE(review): joblib files are pickles tied to the library versions used to
# create them - the consuming app should presumably run the same scikit-learn
# version; confirm before deploying.
dump(best_rf, model_path)
print(f"Model saved at '{model_path}'")

Model saved at 'model/RandomForest_BostonHousing.joblib'


In [9]:
#Test the model locally in SageMaker
from joblib import load
import pandas as pd

# Reload the persisted estimator from disk (only load joblib files you trust:
# they are pickles and execute code on load).
model = load('model/RandomForest_BostonHousing.joblib')

# Smoke-test with the first row of the dataset, shaped as a one-row DataFrame
# with the same 12 feature columns used for training.
# NOTE(review): row 0 comes from the full dataset, so it may have been part of
# the training split - this checks that the saved model loads and predicts,
# not how accurate it is.
df = pd.read_csv('data/BostonHousing_ML_ready.csv')
feature_columns = [
    'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'PTRATIO', 'CRIM_log', 'TAX_log', 'LSTAT_log',
]
X_test_sample = df.iloc[[0]][feature_columns]

predicted = model.predict(X_test_sample)
prediction = predicted[0]
print("Sample prediction:", prediction)

Sample prediction: 27.390887428289883
