### Imports and Data Reading

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

In [48]:
# Make sure you use the CLEANED csv's! 

train = pd.read_csv("./datasets/train_cleaned.csv")
test = pd.read_csv("./datasets/test_cleaned.csv")

# including these in case I get rid of columns I want to get back O___o

train_source = pd.read_csv("./datasets/train_cleaned.csv")
test_source = pd.read_csv("./datasets/test_cleaned.csv")

### Attribute Selection and Dummifying 

As a reminder, here are the traits identified during EDA to use in the model:

**Categorical/Discrete:** Number of Rooms, Neighborhood, House Style, Basement Condition, Heating Type, Electricity Type, Garage Type, Pool Quality, MS SubClass, MS Zoning, Utility Type, Proximity to Conditions (1 and 2), Exterior Covering (1 and 2), Building Foundation Type, Miscellaneous Feature, Kitchen Quality

**Numerical/Continuous:** House Age, Remodel Age, Above Ground Living Area, Total Square Footage Available, Lot Area, Overall Quality/Condition Average

**Total Number of Variables (excluding trait dummying):** 24

In [49]:
# to start, let's isolate the columns we want to keep 

keep_test = ["MS SubClass", "MS Zoning", "Lot Area", "Utilities", "Neighborhood", "Condition 1", "Condition 2", 
        "House Style", "Exterior 1st", "Exterior 2nd", "Foundation", "Bsmt Cond", "Heating", "Electrical", 
        "Gr Liv Area", "Kitchen Qual", "TotRms AbvGrd", "Garage Type", "Pool QC", "Misc Feature", "Total SF",
        "Overall Qu Co Avg", "Age Sold", "Remodel Age"]

keep_train = keep_test + ["SalePrice"]

In [50]:
train = train[keep_train]

test = test[keep_test]

In [51]:
def dummify(df, column, prefix): 
    return pd.get_dummies(df, columns = [f"{column}"], prefix = f"{prefix}", drop_first = True)

In [52]:
categoricals = ["MS SubClass", "MS Zoning", "Utilities", "Neighborhood", "Condition 1", "Condition 2", 
                "House Style", "Exterior 1st", "Exterior 2nd", "Foundation", "Misc Feature", "Heating", "Electrical", 
                "Kitchen Qual", "Garage Type"]
prefixes = ["SubClass", "Zone", "Util", "Neighborhood", "Cond 1", "Cond 2", "Style", "Ext 1", "Ext 2", "Foundation", 
            "Misc", "Heat", "Elec", "Kitchen", "Garage"]

for cat in range(len(categoricals)): 
    train = dummify(train, f"{categoricals[cat]}", f"{prefixes[cat]}")
    
for cat in range(len(categoricals)): 
    test = dummify(test, f"{categoricals[cat]}", f"{prefixes[cat]}")

In [56]:
test.columns

Index(['Lot Area', 'Bsmt Cond', 'Gr Liv Area', 'TotRms AbvGrd', 'Pool QC',
       'Total SF', 'Overall Qu Co Avg', 'Age Sold', 'Remodel Age',
       'SubClass_30',
       ...
       'Elec_SBrkr', 'Kitchen_Fa', 'Kitchen_Gd', 'Kitchen_Po', 'Kitchen_TA',
       'Garage_Attchd', 'Garage_Basment', 'Garage_BuiltIn', 'Garage_CarPort',
       'Garage_Detchd'],
      dtype='object', length=120)

In [55]:
train.head()

Unnamed: 0,Lot Area,Bsmt Cond,Gr Liv Area,TotRms AbvGrd,Pool QC,Total SF,Overall Qu Co Avg,Age Sold,Remodel Age,SalePrice,...,Elec_Mix,Elec_SBrkr,Kitchen_Fa,Kitchen_Gd,Kitchen_TA,Garage_Attchd,Garage_Basment,Garage_BuiltIn,Garage_CarPort,Garage_Detchd
0,13517,TA,1479,6,,2204,7.0,34,5,130500,...,0,1,0,1,0,1,0,0,0,0
1,11492,TA,2122,8,,3035,6.0,13,12,220000,...,0,1,0,1,0,1,0,0,0,0
2,7922,TA,1057,5,,2114,6.0,57,3,109000,...,0,1,0,1,0,0,0,0,0,1
3,9802,TA,1444,7,,1828,5.0,4,3,174000,...,0,1,0,0,1,0,0,1,0,0
4,14235,Gd,1445,6,,2121,7.0,110,17,138500,...,0,1,0,0,1,0,0,0,0,1


In [None]:
ranked_categoricals = ["Bsmt Cond", "Pool QC"]

In [37]:
# need to dummify the categorical values 
# code will include steps for both training and test sets 

# neighborhoods
pd.get_dummies(train, columns = ["Neighborhood"], prefix = "Neighborhood", drop_first = True)
pd.get_dummies(test, columns = ["Neighborhood"], prefix = "Neighborhood", drop_first = True)


# MS Zoning 
pd.get_dummies(train, columns = ["MS Zoning"], prefix = "Zoning", drop_first = True)
pd.get_dummies(test, columns = ["MS Zoning"], prefix = "Zoning", drop_first = True)

# MS SubClass
pd.get_dummies(train, columns = ["MS SubClass"], prefix = "SubClass", drop_first = True)
pd.get_dummies(test, columns = ["MS SubClass"], prefix = "SubClass", drop_first = True)

# Utilities 
pd.get_dummies(train, columns = ["Utilities"], prefix = "Util", drop_first = True)
pd.get_dummies(test, columns = ["Utilities"], prefix = "Util", drop_first = True)

# Condition 1 and 2
pd.get_dummies(train, columns = ["Condition 1"], prefix = "Cond 1", drop_first = True)
pd.get_dummies(test, columns = ["Condition 1"], prefix = "Cond 1", drop_first = True)

pd.get_dummies(train, columns = ["Condition 2"], prefix = "Cond 2", drop_first = True)
pd.get_dummies(test, columns = ["Condition 2"], prefix = "Cond 2", drop_first = True)

# House Style
pd.get_dummies(train, columns = ["House Style"], prefix = "Style", drop_first = True)
pd.get_dummies(test, columns = ["House Style"], prefix = "Style", drop_first = True)

# 

Unnamed: 0,MS SubClass,MS Zoning,Lot Area,Utilities,Condition 1,Condition 2,House Style,Exterior 1st,Exterior 2nd,Foundation,...,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
0,60,RL,13517,AllPub,RRAe,Norm,2Story,HdBoard,Plywood,CBlock,...,0,0,0,0,1,0,0,0,0,0
1,60,RL,11492,AllPub,Norm,Norm,2Story,VinylSd,VinylSd,PConc,...,0,0,0,0,0,1,0,0,0,0
2,20,RL,7922,AllPub,Norm,Norm,1Story,VinylSd,VinylSd,CBlock,...,0,0,0,0,0,0,0,0,0,0
3,60,RL,9802,AllPub,Norm,Norm,2Story,VinylSd,VinylSd,PConc,...,0,0,0,0,0,0,0,0,1,0
4,50,RL,14235,AllPub,Norm,Norm,1.5Fin,Wd Sdng,Plywood,PConc,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,20,RL,11449,AllPub,Norm,Norm,1Story,VinylSd,VinylSd,PConc,...,0,0,0,0,0,0,0,0,1,0
2047,30,RL,12342,AllPub,Norm,Norm,1Story,VinylSd,VinylSd,CBlock,...,0,0,0,0,0,0,0,0,0,0
2048,50,RL,7558,AllPub,Norm,Norm,1.5Fin,BrkFace,Stone,BrkTil,...,0,0,0,0,0,0,0,0,0,0
2049,20,RL,10400,AllPub,Norm,Norm,1Story,Plywood,Plywood,CBlock,...,0,0,0,0,0,0,0,0,0,0
