## Phase: Model Building V2

This can be thought of as V2 of the `Demo_1_Modeling.ipynb` example, with more advanced feature engineering.
The input comes from the `Demo_1_Preprocessing.ipynb` example, `outputs/housing_data_advanced_features.csv`
Aside from feature engineering, the V2 model is identical to V1.

In [1]:
import lineapy
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
# Load the feature-engineered housing data (per the intro, produced by Demo_1_Preprocessing.ipynb).
cleaned_data = pd.read_csv("outputs/housing_data_advanced_features.csv")

In [4]:
len(cleaned_data)

1998

In [5]:
# Discard rows containing any missing value (the row count shown before and after is 1998 in both cases).
cleaned_data = cleaned_data.dropna()

In [6]:
len(cleaned_data)

1998

In [7]:
# Hold out 30% of the rows for validation; random_state=42 keeps the split reproducible.
train, val = train_test_split(cleaned_data, test_size=0.3, random_state=42)
# SalePrice is the regression target; every remaining column is used as a feature.
X_train = train.drop(['SalePrice'], axis = 1)
y_train = train.loc[:, 'SalePrice']
X_val = val.drop(['SalePrice'], axis = 1)
y_val = val.loc[:, 'SalePrice']

In [8]:
X_train

Unnamed: 0,Gr_Liv_Area,Garage_Area,LA_v_1st,1st_v_2nd,wd_v_2nd,basement_value,Neighborhood=Blueste,Neighborhood=BrDale,Neighborhood=BrkSide,Neighborhood=ClearCr,...,Neighborhood=NoRidge,Neighborhood=NridgHt,Neighborhood=OldTown,Neighborhood=SWISU,Neighborhood=Sawyer,Neighborhood=SawyerW,Neighborhood=Somerst,Neighborhood=StoneBr,Neighborhood=Timber,Neighborhood=Veenker
557,2787,820,10.149829,0.721433,0.192712,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
843,1436,1488,8.641148,6.890110,0.000000,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1651,2263,420,10.377682,1.061020,0.131148,0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1345,1559,812,5.195638,0.000000,0.000000,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1160,1554,627,2.953668,0.000000,0.000000,0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,914,270,18.817287,0.000000,0.000000,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,922,308,10.596529,0.000000,0.000000,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,2082,484,8.389134,1.891667,0.388889,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,1330,437,6.266165,0.000000,0.000000,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
y_train

557     260700
843     147000
1651    263800
1345    146500
1160    202300
         ...  
1130    134900
1294    109100
860     207500
1459    161140
1126    161900
Name: SalePrice, Length: 1398, dtype: int64

In [10]:
# Ordinary least-squares linear regression with an intercept term.
linear_model = LinearRegression(fit_intercept=True)

In [11]:
# Fit on the training split, then predict on both splits: y_fitted holds the
# in-sample predictions and y_predicted the predictions for the validation split.
linear_model.fit(X_train, y_train)
y_fitted = linear_model.predict(X_train)
y_predicted = linear_model.predict(X_val)

In [12]:
# Attach the predictions to a copy of the validation features instead of adding a
# column to X_val itself: X_val must keep exactly the columns the model was fitted
# on, otherwise re-running linear_model.predict(X_val) fails on the extra column.
val_predictions = X_val.assign(**{"Predicted Sales Price": y_predicted})

## Use the prediction on split test data

Now that we have built the model, we want to take a look at how accurate its predictions are.

In [13]:
def rmse(predicted, actual):
    """
    Calculates RMSE from actual and predicted values.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain Python sequences are accepted and pandas objects are compared
    position by position rather than being aligned on their index labels
    (label alignment would turn non-matching rows into NaN).

    Input:
      predicted (1D array-like): vector of predicted/fitted values
      actual (1D array-like): vector of actual values, in the same order
    Output:
      a float, the root-mean square error
    """
    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    return np.sqrt(np.mean((actual - predicted) ** 2))

In [14]:
# NBVAL_IGNORE_OUTPUT
# RMSE of the model's predictions on the validation split.
rmse(y_predicted, y_val)

41354.41767592295

## Use the prediction on new test data

In [15]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Load the separate test file that the cells below score the fitted model on.
test_data = pd.read_csv("data/ames_test_cleaned.csv")

In [16]:
# NBVAL_IGNORE_OUTPUT
import numpy as np

# Rows that actually have a second floor; the two "*_v_2nd" ratios are set to 0 elsewhere.
has_second_floor = test_data['2nd_Flr_SF'] > 0

# Lot area relative to the first-floor area.
test_data['LA_v_1st'] = test_data['Lot_Area'] / test_data['1st_Flr_SF']
# First-floor area and wood-deck area, each relative to the second-floor area.
test_data['1st_v_2nd'] = np.where(
    has_second_floor,
    test_data['1st_Flr_SF'] / test_data['2nd_Flr_SF'],
    0,
)
test_data['wd_v_2nd'] = np.where(
    has_second_floor,
    test_data['Wood_Deck_SF'] / test_data['2nd_Flr_SF'],
    0,
)

# The raw floor areas are no longer needed once the ratios exist.
test_data = test_data.drop(columns=['1st_Flr_SF', '2nd_Flr_SF'])

# Every column whose name mentions "bsmt" (case-insensitive); dropped later in this cell.
basement_features = [col for col in test_data.columns if 'bsmt' in col.lower()]
def basement_value(pt):
    """
    Flag a row whose basement meets every quality criterion.

    Input:
      pt: a row (anything indexable by column name) providing
        'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Total_Bsmt_SF',
        'BsmtFin_Type_1', 'Bsmt_Cond' and 'Bsmt_Qual'
    Output:
      1 when the finish type is 'GLQ' or 'ALQ', condition and quality are
      both 'Ex' or 'Gd', the bath count (half baths count 0.5) exceeds 0.5,
      and the total basement area exceeds 300; otherwise 0
    """
    bath_count = pt['Bsmt_Full_Bath'] + 0.5 * pt['Bsmt_Half_Bath']
    is_large = pt['Total_Bsmt_SF'] > 300
    good_ratings = {'Ex', 'Gd'}

    # Bail out on the first criterion that is not met.
    if pt['BsmtFin_Type_1'] not in {'GLQ', 'ALQ'}:
        return 0
    if pt['Bsmt_Cond'] not in good_ratings:
        return 0
    if pt['Bsmt_Qual'] not in good_ratings:
        return 0
    if bath_count > 0.5 and is_large:
        return 1
    return 0
# Score every row with basement_value, then drop the raw basement columns
# collected in basement_features.
test_data['basement_value'] = test_data.apply(basement_value, axis=1)
test_data = test_data.drop(basement_features, axis=1)

# One-hot encode Neighborhood into "Neighborhood=<name>" indicator columns.
vec_enc = DictVectorizer()
neighborhood_records = test_data[['Neighborhood']].to_dict(orient='records')
Neighborhood_data = vec_enc.fit_transform(neighborhood_records).toarray()
# feature_names_ is the fitted list of column names; get_feature_names() was
# removed in scikit-learn 1.2, while this attribute works on old and new versions.
Neighborhood_cats = vec_enc.feature_names_
# Reuse test_data's index so the concat below lines rows up by label.
Neighborhood = pd.DataFrame(Neighborhood_data, columns=Neighborhood_cats, index=test_data.index)
# Drop these two indicator columns when present (errors='ignore' tolerates their absence).
Neighborhood = Neighborhood.drop(['Neighborhood=GrnHill', 'Neighborhood=Landmrk'], axis=1, errors='ignore')
test_data = pd.concat([test_data, Neighborhood], axis=1)
# Drop the first indicator column as well.
# NOTE(review): presumably the one-hot reference category, mirroring the preprocessing notebook — confirm.
test_data = test_data.drop(columns=Neighborhood_cats[0])



In [17]:
# new_res = lineapy.run(after_load, {input_data: pd.read_csv("../ames_other_cleaned.csv")})

In [18]:
# Keep the target, the numeric features and the one-hot "Neighborhood=..." columns.
# The raw Neighborhood column has no "=" in its name, so the pattern does not match it.
# Rows with a missing value in any kept column are then discarded.
feature_pattern = "Neighborhood=.|Gr_Liv_Area|Garage_Area|SalePrice|LA_v_1st|1st_v_2nd|wd_v_2nd|basement_value"
relevant_table = test_data.filter(regex=feature_pattern).dropna()

In [19]:
relevant_table

Unnamed: 0,Gr_Liv_Area,Garage_Area,SalePrice,LA_v_1st,1st_v_2nd,wd_v_2nd,basement_value,Neighborhood=Blueste,Neighborhood=BrDale,Neighborhood=BrkSide,...,Neighborhood=NoRidge,Neighborhood=NridgHt,Neighborhood=OldTown,Neighborhood=SWISU,Neighborhood=Sawyer,Neighborhood=SawyerW,Neighborhood=Somerst,Neighborhood=StoneBr,Neighborhood=Timber,Neighborhood=Veenker
0,1338,582.0,213500,3.677130,0.000000,0.000000,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1804,442.0,189000,7.295720,1.324742,0.180412,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1655,440.0,175900,13.106160,0.855381,0.176009,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1502,528.0,212000,4.540613,0.000000,0.000000,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3279,841.0,538000,31.659172,1.063562,0.316551,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
925,1360,336.0,140000,9.841176,0.000000,0.000000,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
926,1092,286.0,71000,3.468864,1.000000,0.000000,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
927,1728,574.0,150900,7.314815,0.000000,0.000000,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
928,1126,484.0,160000,15.452931,0.000000,0.000000,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# NBVAL_IGNORE_OUTPUT
# Select the feature columns by name, in the exact order the model was fitted on.
# This leaves SalePrice out (it is not a column of X_train) and raises a KeyError
# if a training feature is missing, instead of relying on relevant_table happening
# to share the training column order.
test_features = relevant_table.loc[:, X_train.columns]
y_test_predicted = linear_model.predict(test_features)

In [21]:
# NBVAL_SKIP
# RMSE of the model's predictions against the SalePrice column of the test table.
rmse(y_test_predicted, relevant_table['SalePrice'])

38338.34199976618

I've verified that the test results are still within the expected range.

In [22]:
# Persist the fitted model to disk with joblib.
from joblib import dump
dump(linear_model, "outputs/linea_model_housing.joblib")

['outputs/linea_model_housing.joblib']

## Task 2: App API

I want to deploy this model so the business folks come in and take my "suggested" values.

I would either have to learn Flask and AWS to put a mini web app up, or make the business folks use a notebook (which involves setting up Python).

In [23]:
# NBVAL_SKIP
# Removes the model file written by the dump() cell above.
# NOTE(review): presumably so the lineapy artifact saved next can be shown to re-create it — confirm.
!rm outputs/linea_model_housing.joblib

In [24]:
# Save an artifact named "linea_model_housing_V2" for lineapy.file_system; the code
# printed by print(artifact.code) further down ends with the dump() call that writes the model file.
artifact = lineapy.save(lineapy.file_system, "linea_model_housing_V2")

In [25]:
# NBVAL_SKIP
artifact.visualize()

In [26]:
artifact.to_airflow();

In [27]:
print(artifact.code)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
cleaned_data = pd.read_csv("outputs/housing_data_advanced_features.csv")
cleaned_data = cleaned_data.dropna()
train, val = train_test_split(cleaned_data, test_size=0.3, random_state=42)
X_train = train.drop(['SalePrice'], axis = 1)
y_train = train.loc[:, 'SalePrice']
linear_model = LinearRegression(fit_intercept=True)
linear_model.fit(X_train, y_train)
from joblib import dump
dump(linear_model, "outputs/linea_model_housing.joblib")

