# Model Development Notebook

This notebook will start my exploration for the optimal model.

For model development, I will start off by implementing simpler models; then, I will dive into ensemble learning.

#### Basic Setup

In [1]:
# Make the project's data-preparation scripts importable from this notebook.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
import sys
sys.path.insert(
    1,
    '/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution/Code/Scripts'
    '/Data Preparation Scripts',
)

In the code above, I used the `sys` module to add a directory to `sys.path`, the list of locations Python searches when importing modules. This tells Python to also look for my custom Python files in that directory, which lives outside this notebook's folder.

In [2]:
# Disabling warnings -> this is not recommended, but I want to see the errors clearly
import warnings
warnings.filterwarnings("ignore")

#### Preparing the Data

In [3]:
# Importing modules & libraries
import pandas as pd
from DataPrepPipeline import Preparation
from sklearn.model_selection import train_test_split

In [4]:
# Load the raw training CSV into a DataFrame and display it.
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
raw_data = pd.read_csv(
    filepath_or_buffer='/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution/Data/train.csv'
)
raw_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
# Separate the features (X) from the target column (y); raw_data is left untouched
X = raw_data.drop(columns='SalePrice')
y = raw_data['SalePrice']

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [7]:
# Transforming the data
# Build the custom preparation pipeline from the training split only.
# NOTE(review): the meaning of the third positional argument (True) is not
# visible in this notebook -- presumably a training-mode flag; confirm in
# DataPrepPipeline.
prep = Preparation(X_train, y_train, True)
# transform() returns two objects: the prepared features and the prepared target
train_prepared, y_train_prep = prep.transform()

In [8]:
# Inspect the prepared training features.
# NOTE(review): the recorded output contains a column whose name embeds the
# repr of an entire GarageYrBlt Series ("GarageDec_618 2000.0\n870 ..."), and
# every displayed value in it is 1. This looks like a Series being used where
# a column label/prefix was expected -- verify in DataPrepPipeline.
train_prepared

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,"GarageDec_618 2000.0\n870 1960.0\n92 1920.0\n817 2000.0\n302 2000.0\n ... \n763 1990.0\n835 1990.0\n1216 1970.0\n559 2000.0\n684 1990.0\nName: GarageYrBlt, Length: 1168, dtype: float64",CatRemod_N,CatRemod_Y,CatYearBuilt_N,CatYearBuilt_Y
618,0.967941,0.490269,1.862785,-0.720279,1.998722,0.672230,1.602418,-0.877966,0.723953,0.587348,...,0,0,0,0,1,1,0,1,0,1
870,-0.324416,-0.613715,-0.723626,-1.248741,0.815342,0.034463,-0.674946,-0.877966,-1.435970,-0.076786,...,0,0,0,1,0,1,1,0,1,0
92,0.592526,0.747336,-0.723626,0.788009,-0.843911,0.016249,-0.434992,-0.877966,-1.208390,0.166902,...,0,0,0,1,0,1,0,1,1,0
817,0.121053,0.733563,0.756365,1.413311,-0.268528,0.537718,1.350558,-0.877966,0.485081,0.660831,...,0,0,0,1,0,1,0,1,0,1
302,1.831313,0.796404,0.766332,-1.248741,1.721419,0.522157,1.058525,-0.877966,0.208108,0.648949,...,0,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0.671230,0.074959,2.432366,1.352513,-1.166719,0.336102,0.437640,1.245348,1.501946,0.659989,...,0,0,0,1,0,1,1,0,0,1
835,-0.324416,0.109443,-0.723626,0.354889,0.340807,0.192887,-0.111837,-0.877966,-0.901900,0.173543,...,0,0,0,1,0,1,1,0,1,0
1216,0.074522,-0.030188,-0.723626,-1.248741,-2.080373,-6.058731,0.560780,1.054395,0.843818,0.326400,...,0,0,0,1,0,1,1,0,0,1
559,0.121053,-2.013157,-0.207489,-1.248741,1.509511,0.419397,1.091418,-0.877966,0.239304,0.146603,...,0,0,0,1,0,1,0,1,0,1


In [9]:
# Inspect the prepared training target.
# NOTE(review): the recorded values (roughly 11.6-12.7) suggest the pipeline
# log-transforms SalePrice -- confirm in DataPrepPipeline. If so, every RMSE
# reported below is on that transformed scale.
y_train_prep

618     12.659737
870     11.603689
92      12.004574
817     12.509878
302     12.230770
          ...    
763     12.727841
835     11.759793
1216    11.626263
559     12.363081
684     12.305923
Name: SalePrice, Length: 1168, dtype: float64

## Model Development

In [30]:
# Importing model libraries
import numpy as np

# Evaluation libraries
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import Pool, CatBoostRegressor, cv

In [31]:
# Function to Get Results
def fit_ml_algo(algo,X_train,y_train,cv):
    """Fit ``algo`` and report its training and cross-validated RMSE.

    Parameters
    ----------
    algo : estimator
        Unfitted regressor exposing ``fit`` and ``predict``.
    X_train : pandas.DataFrame
        Training features; ``.values`` is used for fitting and predicting.
    y_train : pandas.Series
        Training target; ``.values`` is used for fitting.
    cv : int or cross-validation splitter
        Passed unchanged to ``cross_val_score``. Inside this function the
        name shadows the ``cv`` helper imported from catboost.

    Returns
    -------
    tuple of (str, str)
        ``(training RMSE, mean cross-validated RMSE)``, each formatted as a
        string with four decimal places.
    """
    # Building the model
    model = algo.fit(X_train.values, y_train.values)

    # Evaluation: score on the data that was passed in (not on a notebook
    # global), using the same array representation the model was fitted on.
    pred = model.predict(X_train.values)
    # The square root is taken explicitly because the ``squared=False``
    # argument of mean_squared_error is deprecated in newer scikit-learn.
    train_error = np.sqrt(mean_squared_error(y_true=y_train, y_pred=pred))
    # The scorer returns negated RMSE values, so flip the sign before averaging
    cross_val = np.average(-cross_val_score(estimator=model,X=X_train,y=y_train,
                                scoring='neg_root_mean_squared_error',cv=cv))

    # Formatting
    float_formatter = "{:.4f}".format

    # Returning results
    return float_formatter(train_error), float_formatter(cross_val)

In [32]:
# Empty table that collects one row of scores per model.
# The 'RSME' spelling in the labels is kept as-is because the cells below
# use these exact strings as keys.
results = pd.DataFrame({
    column: [] for column in ('Model', 'Training RSME', 'Cross Val RSME')
})
results

Unnamed: 0,Model,Training RSME,Cross Val RSME


### Linear Regression

In [33]:
# Baseline: ordinary least squares, scored with 5-fold cross-validation.
# NOTE(review): the recorded results show a CV RMSE of ~7.2e8 against a
# training RMSE of 0.0993, so the unregularised fit is unstable on held-out
# folds (Ridge does not show this) -- possibly collinear one-hot columns;
# worth investigating.
train_error, cross_val = fit_ml_algo(
    algo=LinearRegression(),
    X_train=train_prepared,
    y_train=y_train_prep,
    cv=5,
)

In [34]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'Linear Regression',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369


### Ridge Regression

In [35]:
# L2-regularised linear model (alpha=3.0), scored with 5-fold cross-validation
train_error, cross_val = fit_ml_algo(
    algo=Ridge(alpha=3.0),
    X_train=train_prepared,
    y_train=y_train_prep,
    cv=5,
)

In [36]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'Ridge Regression',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231


### Elastic Net

In [37]:
# Combined L1/L2-regularised linear model (alpha=3.0), 5-fold cross-validation
train_error, cross_val = fit_ml_algo(
    algo=ElasticNet(alpha=3.0),
    X_train=train_prepared,
    y_train=y_train_prep,
    cv=5,
)

In [38]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'ElasticNet Regression',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231
2,ElasticNet Regression,0.4018,0.4013


### Stochastic Gradient Descent

In [39]:
# Linear model trained with stochastic gradient descent, 5-fold cross-validation.
# SGD shuffles the training data, so the seed is fixed (same value as the
# train/test split) to make the scores reproducible between runs. Results
# recorded from an unseeded run may differ slightly.
train_error, cross_val = fit_ml_algo(SGDRegressor(random_state=0),train_prepared, y_train_prep, 5)

In [40]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'SGD Regressor',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231
2,ElasticNet Regression,0.4018,0.4013
3,SGD Regressor,0.1991,0.2403


### Support Vector Regression

In [41]:
# Support vector regression with an RBF kernel, 5-fold cross-validation
train_error, cross_val = fit_ml_algo(
    algo=SVR(kernel='rbf'),
    X_train=train_prepared,
    y_train=y_train_prep,
    cv=5,
)

In [42]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'Support Vector Regression',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231
2,ElasticNet Regression,0.4018,0.4013
3,SGD Regressor,0.1991,0.2403
4,Support Vector Regression,0.0782,0.1396


### K Nearest Neighbors

In [43]:
# k-nearest-neighbours regression (k=5), 5-fold cross-validation
train_error, cross_val = fit_ml_algo(
    algo=KNeighborsRegressor(n_neighbors=5),
    X_train=train_prepared,
    y_train=y_train_prep,
    cv=5,
)

In [44]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'K Nearest Neighbors',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231
2,ElasticNet Regression,0.4018,0.4013
3,SGD Regressor,0.1991,0.2403
4,Support Vector Regression,0.0782,0.1396
5,K Nearest Neighbors,0.1391,0.1692


### Decision Trees

In [45]:
# Single unpruned decision tree, 5-fold cross-validation.
# Tree construction breaks ties between equally good splits at random, so
# the seed is fixed (same value as the train/test split) to make the scores
# reproducible. Results recorded from an unseeded run may differ slightly.
train_error, cross_val = fit_ml_algo(DecisionTreeRegressor(random_state=0),train_prepared, y_train_prep, 5)

In [46]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'Decision Tree',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231
2,ElasticNet Regression,0.4018,0.4013
3,SGD Regressor,0.1991,0.2403
4,Support Vector Regression,0.0782,0.1396
5,K Nearest Neighbors,0.1391,0.1692
6,Decision Tree,0.0,0.2366


### Random Forests

In [47]:
# Random forest with default settings, 5-fold cross-validation.
# Bootstrapping and feature sub-sampling are random, so the seed is fixed
# (same value as the train/test split) to make the scores reproducible.
# Results recorded from an unseeded run may differ slightly.
train_error, cross_val = fit_ml_algo(RandomForestRegressor(random_state=0),train_prepared, y_train_prep, 5)

In [48]:
# Record the scores. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat, which works on both old and new pandas.
results = pd.concat([results, pd.DataFrame([{
    'Model': 'Random Forest',
    'Training RSME': train_error,
    'Cross Val RSME': cross_val,
}])], ignore_index=True)
results

Unnamed: 0,Model,Training RSME,Cross Val RSME
0,Linear Regression,0.0993,719158229.4369
1,Ridge Regression,0.1019,0.1231
2,ElasticNet Regression,0.4018,0.4013
3,SGD Regressor,0.1991,0.2403
4,Support Vector Regression,0.0782,0.1396
5,K Nearest Neighbors,0.1391,0.1692
6,Decision Tree,0.0,0.2366
7,Random Forest,0.0583,0.1587


### Catboost

#### Organizing Data for Training