# Problem Statement
Building a model for a real estat agent to predict average size houses (houses within 3 SD of sq ft mean of a property)


___
Outside Research:
Based on the hedonic pricing system I will assume that there are specific characteristics to make note of when attempting to predict a houses value. 



| Original Characteristic | Feature Proxy | Explaination | Potential FE |
| --- | --- | --- | --- |
| Property Size     | Lot Area | Explain |FE Potential|
|Location Desireability    | Overall Quality | Explain |FE Potential|
|Amenities Proximity  |Cond 1 / Cond 2|Explain|FE Potential|
|Number of Rooms  |Bedrooms|Explain|FE Potential|
| Number of baths  |Full Bath / Half Bath|Explain|FE Potential|
|Age  |Year Built/Original Contstruction Date|Explain|FE Potential|
|Condition of the house  |Overall Cond|Explain|FE Potential|
|Construction Materials  |Remodel|Explain|FE Potential|






So based on thes
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error,max_error
from sklearn.model_selection import train_test_split



%matplotlib inline
from scipy import stats 
# adjusting display restrictions 
pd.options.display.max_columns = 100
pd.options.display.max_rows = 3000
# setting preference to remove scientific notation 
pd.options.display.float_format = '{:,.2f}'.format


In [2]:
# medium article inspiration for function: https://hersanyagci.medium.com/detecting-and-handling-outliers-with-pandas-7adbfcd5cad8#:~:text=As%20you%20can%20see%20this,best%20way%20to%20see%20outliers.
# the function below is to return columns and indexes of outliers based on the Turkey rule (described in article)
# the purpose is to reduce the sifting of each feature
def turkey_outliers(datframe):
    # want to make sure only looking at numerical catergories (https://stackoverflow.com/questions/25039626/how-do-i-find-numeric-columns-in-pandas)
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    newdf = datframe.select_dtypes(include=numerics)

    # column names for two seperate dataframes and their respective outlier index lists 
    column_name=[]
    column_name2=[] 
    lwr_outlier_index=[]
    upr_outlier_index=[]

    #passes through each column in out numerical data frame
    for column in newdf.columns:
        # quartile calculations from each column
        Q25 = float(newdf[column].quantile(0.25))
        Q75 = float(newdf[column].quantile(0.75))

        # establishing bounds for reference based on Turkey's rule to detect outliers from medium article above
        IQR = Q75 - Q25
        acceptable_lower = Q25 - 1.25*(IQR)
        acceptable_upper = Q75 + 1.25*(IQR)

        # storing actual min/max values for outlier evaluation below
        min_series = newdf[column].min()
        max_series = newdf[column].max()

        # checking if the max and mins per column are greater than the acceptable bounds above
        if min_series > acceptable_lower:
            # appends the index of the first minimum value in the dataframe to the list
            column_name.append(column)
            lwr_outlier_index.append(newdf[column].idxmin(axis=0))
        if max_series > acceptable_upper:
            # appends the index of the first max value in the dataframe to the list
            column_name2.append(column)
            upr_outlier_index.append(newdf[column].idxmax())


    # creates data frames for each case of outliers to view column_names to minimize searching through all 65 features
    lwr_out_dict = {'column_name':column_name,'lwr_min_outlr_indx':lwr_outlier_index}
    lwr_out_df = pd.DataFrame(lwr_out_dict,columns=['column_name','lwr_min_outlr_indx'])

    upr_out_dict = {'column_name':column_name2,'upr_max_outlr_index':upr_outlier_index}
    upr_out_df = pd.DataFrame(upr_out_dict,columns=['column_name','upr_max_outlr_index'])

    return lwr_out_df, upr_out_df


In [4]:
def metric_reg(model,X_train,y_train,X_test,y_test):
    #regular R2 value
    R2_train = model.score(X_train,y_train)
    R2_test = model.score(X_test,y_test)


    #manual adjusted r2 score
    k= X_train.shape[1] # returns the # of features in model
    n=len(y_train)      # returns the # of rows/observations
    R2_train_adj = 1 - ((1-R2_train)*(n-1)/(n-k-1)) 
    #manual adjusted r2 score
    kt= X_test.shape[1] # returns the # of features in model
    nt=len(y_test)      # returns the # of rows/observations
    R2_test_adj = 1 - ((1-R2_test)*(nt-1)/(nt-kt-1)) 
    

    # MSE
    y_pred = model.predict(X_train)
    mse_train = mean_squared_error(y_train, y_pred)
    # MSE
    y_predt = model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_predt)

    # Training RMSE
    RMSE_train = (mean_squared_error(y_train, y_pred, squared = False))
    # Testing RMSE
    RMSE_test = (mean_squared_error(y_test, y_predt, squared = False))
    
    #MAE 
    mae_train = mean_absolute_error(y_train, y_pred)
    mae_test = mean_absolute_error(y_test, y_predt)

    
    # calculate residuals
    residuals  = y_train - y_pred
    residualst = y_test - y_predt

    #SSE 
    SSE_train = sum(residuals**2)
    SSE_test = sum(residualst**2)

    #max error (max residual error captues worst case error b./w perdicted values and true value)      
    max_error_train = max_error(y_train, y_pred)
    max_error_test = max_error(y_test, y_predt)
  
    column_names=['R2_train','R2test','R2_adj_train','R2_test_adj','mse_train',
    'mse_test','RMSE_train','RMSE_test','max_error_train','max_error_test']
    
    list_metric = [R2_train,R2_test,R2_train_adj,R2_test_adj,mse_train,mse_test,RMSE_train,
    RMSE_test,RMSE_test,max_error_train,max_error_test]
    
    
    dictionary = dict(zip(column_names,list_metric))
    #print(dictionary)
    
    df =  pd.DataFrame([dictionary])

    return df

In [5]:
df_test = pd.read_csv('./datasets/test.csv')
df_test.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD


Sales Condition and Sale Price

In [12]:
# will dummify these features to obtain correlations  
features_of_interest = ['gr_liv_area','total_bsmt_sf', 'bedroom_abvgr', 'full_bath', 'half_bath' , 'condition_2' , 'year_built' , 'year_remod/add']
dummify1T = df_test[features_of_interest] 
dummify1T.head() 

Unnamed: 0,gr_liv_area,total_bsmt_sf,bedroom_abvgr,full_bath,half_bath,condition_2,year_built,year_remod/add
0,1928,1020,4,2,0,Norm,1910,1950
1,1967,1967,6,2,0,Norm,1977,1977
2,1496,654,3,2,1,Norm,2006,2006
3,968,968,2,1,0,Norm,1923,2006
4,1394,1394,3,1,1,Norm,1963,1963


In [13]:
# will dummify test features to test my first iteration model
features_of_interest = ['gr_liv_area','total_bsmt_sf', 'bedroom_abvgr', 'full_bath', 'half_bath' , 'condition_2' , 'year_built' , 'year_remod/add']
dummify1T = df_test[features_of_interest] 

dummified_1T = pd.get_dummies(data=dummify1,drop_first=True) 
dummified_1T.head() 

Unnamed: 0,gr_liv_area,total_bsmt_sf,bedroom_abvgr,full_bath,half_bath,year_built,year_remod/add,condition_2_Norm,condition_2_PosA
0,1928,1020,4,2,0,1910,1950,1,0
1,1967,1967,6,2,0,1977,1977,1,0
2,1496,654,3,2,1,2006,2006,1,0
3,968,968,2,1,0,1923,2006,1,0
4,1394,1394,3,1,1,1963,1963,1,0


In [56]:
lr = LinearRegression()

X = dummified_1.drop(columns=['saleprice'])
y = dummified_1['saleprice']

# test size spliter
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,test_size=0.3)
lr.fit(X_train,y_train)
iter1 = metric_reg(lr,X_train,y_train,X_test,y_test)
iter1.head()

Unnamed: 0,R2_train,R2test,R2_adj_train,R2_test_adj,mse_train,mse_test,RMSE_train,RMSE_test,max_error_train,max_error_test
0,0.71,0.8,0.71,0.79,1847972300.07,1203154776.91,42988.05,34686.52,34686.52,597224.33


In [70]:
iter1.head() # from eda-train notebook

Unnamed: 0,R2_train,R2test,R2_adj_train,R2_test_adj,mse_train,mse_test,RMSE_train,RMSE_test,max_error_train,max_error_test
0,0.71,0.8,0.71,0.79,1847972300.07,1203154776.91,42988.05,34686.52,34686.52,597224.33
1,0.8,0.81,0.8,0.8,1229969526.59,1297621643.02,35070.92,36022.52,36022.52,231943.97


Clearly just removing those two outliers dramatically changed my R2 value. Additionaly we can note that the bias has also decresed and the difference between my RMSE and MSE values have also dramatically dropped. This is quite impressive for my first real iteration will now move onto potentially searching for more outliers to drop among the 