In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('House_Price_Prediction_After_Feature_Selection.csv')

In [3]:
pd.options.display.float_format = '{:,.2f}'.format

In [4]:
df.head()

Unnamed: 0,Square_Footage,Num_Bedrooms,Lot_Size,House_Price
0,1360,2,0.6,262382.85
1,4272,3,4.75,985260.85
2,3592,1,3.63,777977.39
3,966,1,2.73,229698.92
4,4926,2,4.7,1041740.86


In [5]:
X = df.drop('House_Price', axis = 1)
y = df['House_Price']

In [6]:
# Creating a columm transformer that scales all the columns

preprocessor = ColumnTransformer(
    [
        ('scaling', StandardScaler(), ['Square_Footage', 'Num_Bedrooms', 'Lot_Size'])
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
  ]
)

In [7]:
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, X, y, cv= kfold)

In [9]:
def model_selection_(model_name, model):
  # Step 1: First split - create training and holdout test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
  preprocessor = ColumnTransformer(
    [
        ('scaling', StandardScaler(), ['Square_Footage', 'Num_Bedrooms', 'Lot_Size'])
    ]
  )

  pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('model', model)
    ]
  )
  kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
  scores = cross_val_score(pipeline, X_train, y_train, cv= kfold, scoring='r2')


  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  mae = mean_absolute_error(y_test, y_pred)

  return [model_name, scores.mean(), mae]

In [10]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [15]:
metricsDF = pd.DataFrame(columns=['Model Name', 'R2 score', 'MAE'])

for model_name, model in model_dict.items():
  model_res = model_selection_(model_name, model)
  metricsDF.loc[len(metricsDF)] = model_res



In [17]:
metricsDF.sort_values(by = 'MAE')

Unnamed: 0,Model Name,R2 score,MAE
3,LASSO,0.99,18380.18
7,gradient boosting,0.99,18956.93
0,linear_reg,0.99,19711.29
2,ridge,0.99,21392.06
5,random forest,0.99,22014.55
10,xgboost,0.99,22172.19
6,extra trees,0.99,23185.54
8,adaboost,0.98,27075.02
4,decision tree,0.98,30954.7
1,svr,-0.0,224901.99


### Hyperparameter Tuning

In [18]:
from sklearn.model_selection import GridSearchCV

In [38]:
param_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'model__fit_intercept': [True, False],
    'model__max_iter': [1000, 2000, 5000],
    'model__tol': [1e-4, 1e-3, 1e-2],
    'model__selection': ['cyclic', 'random']
}

In [39]:
preprocessor = ColumnTransformer([
    ('scaling', StandardScaler(), ['Square_Footage', 'Num_Bedrooms', 'Lot_Size'])
],remainder='passthrough')


pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Lasso())
])

In [40]:
kfold = KFold(n_splits=5, shuffle = True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


In [43]:
grid_search.best_estimator_

In [44]:
grid_search.best_score_

np.float64(0.9909210678761825)

In [45]:
final_pipe = grid_search.best_estimator_

In [46]:
final_pipe.fit(X_train, y_train)

In [48]:
y_pred = final_pipe.predict(X_test)

In [49]:
mean_absolute_error(y_test, y_pred)

20781.09904826975

# Exporting the model

In [50]:
import pickle

In [51]:
with open('pipeline.pkl', 'wb') as file:
  pickle.dump(final_pipe, file)

In [52]:
with open('df.pkl', 'wb') as file:
  pickle.dump(df, file)

In [53]:
df.head()

Unnamed: 0,Square_Footage,Num_Bedrooms,Lot_Size,House_Price
0,1360,2,0.6,262382.85
1,4272,3,4.75,985260.85
2,3592,1,3.63,777977.39
3,966,1,2.73,229698.92
4,4926,2,4.7,1041740.86


In [54]:
df['Lot_Size'].describe()

Unnamed: 0,Lot_Size
count,1000.0
mean,2.78
std,1.3
min,0.51
25%,1.67
50%,2.81
75%,3.92
max,4.99
