In [1]:
# imports

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# pandas settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# seaborn settings
sns.set_style("whitegrid")

In [2]:
# importing dataset

train = ("datasets/train.csv")

ames_train = pd.read_csv(train)

In [3]:
# title cleanup

def edit_title (title):
    
    title = (title.replace(" ","_")).lower()
    
    return title

ames_train.rename(columns = lambda i:edit_title(i), inplace = True)

# 3 column names start with numbers. Replacing numbers with strings.

ames_train.rename(columns = {'1st_flr_sf':"first_flr_sf",  '2nd_flr_sf':'second_flr_sf',"3ssn_porch":"threessn_porch"}, inplace = True)

In [4]:
# initial useless data droplist

ames_train.drop(columns = ["id", "pid", "misc_feature", "misc_val"], inplace=True)

In [5]:
# A look through the dataset revealed several errors with the columns containing years.
check1 = ames_train[(ames_train["garage_yr_blt"] < ames_train["year_built"]) | (ames_train["garage_yr_blt"] > ames_train["yr_sold"])]
check2 = ames_train[(ames_train["year_remod/add"] < ames_train["year_built"]) | (ames_train["year_remod/add"] > ames_train["yr_sold"]) ]    
check3 = ames_train[(ames_train["year_built"] > ames_train["yr_sold"])] 

# Correcting for errors

for i in check3.index:
    ames_train.at[i, "year_built"] = ames_train.at[i, "yr_sold"]

for i in check1.index:
    ames_train.at[i, "garage_yr_blt"] = ames_train.at[i, "year_built"]
    
for i in check2.index:
    ames_train.at[i, "year_remod/add"] = ames_train.at[i, "year_built"]
    
# manually correcting the remaining row

ames_train.at[1885, "yr_sold"] = 2008

In [6]:
ames_train["garage_yr_blt"].fillna(ames_train["year_built"], inplace = True)

In [7]:
# creating list of continuous data columns

num_cols = [i for i in ames_train.columns if ames_train[i].dtypes == int or ames_train[i].dtypes == float]

# ms_subclass, while numerical in value, is nominal in nature. Removing it from the num_cols list.
# months years should also be classified as ordinal rather than numerical data

list_nonnums = ["ms_subclass", "year_built", "year_remod/add", "garage_yr_blt", "mo_sold", "yr_sold"]

for i in list_nonnums:
    num_cols.remove(i)

# The remanining data had to be cross examined with the data dictionary to determine if it was nominal or ordinal.

# Listing out norminal data

cat_nom_cols = ["ms_subclass", "ms_zoning","street","alley","land_contour","lot_config","neighborhood",
           "condition_1","condition_2","bldg_type","house_style","roof_style","roof_matl","exterior_1st",
           "exterior_2nd","mas_vnr_type","foundation","bsmtfin_type_1","bsmtfin_type_2","heating","garage_type",
           "paved_drive","sale_type","fence"]

# Creating ordinal data list.

cat_ord_cols = [i for i in ames_train.columns if i not in num_cols and i not in cat_nom_cols]

# Changing all num_cols to floats.

for i in num_cols:
    ames_train[i] = ames_train[i].map(lambda x:float(x))

#for the "ms_subclass" column, data should be strings insted of floats

ames_train["ms_subclass"] = ames_train["ms_subclass"].map(lambda x:str(x))

In [8]:
ames_train[cat_nom_cols] = ames_train[cat_nom_cols].fillna("NA")
ames_train[cat_ord_cols] = ames_train[cat_ord_cols].fillna("NA")
ames_train[num_cols] = ames_train[num_cols].fillna(0)

In [9]:
# lable encoding lot_shape

lot_shape_dict = {"NA":0, "Reg":1, "IR1": 2, "IR2": 3, "IR3":4}

# lable encoding central air

cenair_dict = {"NA":0,"N":1 ,"Y":2}

# lable encoding utilities

uti_dict = {"AllPub":4 ,"NoSewr":3 ,"NoSeWa":2 ,"ELO":1, "NA":0}

# lable encoding land_slope

land_slope_dict = {"NA":0, "Gtl":1, "Mod":2, "Sev":3}

# lable encoding exterqual, extercon, bsmtqual, bsmtcon, heatingqc, kitchenqual, fireplacequ, garagequal, garagecond
# poolqc

qualcon_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NA":0}

# lable encoding bsmtexposure

bsmtexp_dict = {"Gd":4, "Av":3, "Mn":2, "No":1, "NA":0}

# lable encoding electrical
# since "mix" is the equivilant of NA, all mix values will be treated as averages (2.5) 

elec_dict = { "SBrkr":4, "FuseA":3, "FuseF":2, "FuseP":1, "Mix":2.5, "NA":0}

# lable encoding functional

func_dict = {"Typ":8 ,"Min1":7 ,"Min2":6 ,"Mod":5 ,"Maj1":4 ,"Maj2":3 ,"Sev":2 ,"Sal":1, "NA":0}

# lable encoding garagefinish

garfin_dict = {"Fin":3, "RFn":2, "Unf":1, "NA":0}

# Combining dictionaries:

combine_dict = {**garfin_dict,**func_dict,**elec_dict,**bsmtexp_dict,**qualcon_dict,**land_slope_dict,**uti_dict,**lot_shape_dict,**cenair_dict}


In [10]:
# applying dictionary to ordinal non numerical columns

for i in cat_ord_cols:
    if ames_train[i].dtype != int and i not in list_nonnums:
        ames_train[i] = ames_train[i].map(combine_dict)

# Data Cleaning and feature engineering

In [11]:
#drop_nom = ["fence", "paved_drive", "heating" ,"sale_type", "bsmtfin_type_2",
#        "roof_matl", "condition_1", "condition_2", "land_contour","alley", "street"]

#remaining_nom = [i for i in cat_nom_cols if i not in drop_nom]



In [12]:
# encoding building type to binary

#ames_train['bldg_type'].apply(lambda i:i if i == '1Fam' else "other_types")


In [13]:
# encoding paved drives to binary

#ames_train['paved_drive'].apply(lambda i:1 if i == 'Y' else 0)

In [14]:
# encoding fence to binary

# ames_train['fence'].apply(lambda i:0 if i == 'NA' else 1)

# cat_nom_cols.remove('fence')

In [15]:
#remaining_nom

In [16]:
dummy_noms = pd.get_dummies(ames_train[cat_nom_cols], drop_first = True)

In [17]:
ames_train.drop(columns = cat_nom_cols, inplace=True)

In [18]:
ames_train.drop(columns = ["garage_cars", "exter_qual","kitchen_qual","bsmt_qual","garage_finish","total_bsmt_sf", "totrms_abvgrd"], inplace = True)

In [19]:
final_ames = ames_train.merge(dummy_noms, left_index = True, right_index = True)

In [20]:
final_ames.describe()

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,exter_cond,bsmt_cond,bsmt_exposure,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,heating_qc,central_air,electrical,first_flr_sf,second_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,functional,fireplaces,fireplace_qu,garage_yr_blt,garage_area,garage_qual,garage_cond,wood_deck_sf,open_porch_sf,enclosed_porch,threessn_porch,screen_porch,pool_area,pool_qc,mo_sold,yr_sold,saleprice,ms_subclass_150,ms_subclass_160,ms_subclass_180,ms_subclass_190,...,exterior_2nd_Stone,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkFace,mas_vnr_type_NA,mas_vnr_type_None,mas_vnr_type_Stone,foundation_CBlock,foundation_PConc,foundation_Slab,foundation_Stone,foundation_Wood,bsmtfin_type_1_BLQ,bsmtfin_type_1_GLQ,bsmtfin_type_1_LwQ,bsmtfin_type_1_NA,bsmtfin_type_1_Rec,bsmtfin_type_1_Unf,bsmtfin_type_2_BLQ,bsmtfin_type_2_GLQ,bsmtfin_type_2_LwQ,bsmtfin_type_2_NA,bsmtfin_type_2_Rec,bsmtfin_type_2_Unf,heating_GasW,heating_Grav,heating_OthW,heating_Wall,garage_type_Attchd,garage_type_Basment,garage_type_BuiltIn,garage_type_CarPort,garage_type_Detchd,garage_type_NA,paved_drive_P,paved_drive_Y,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,fence_GdWo,fence_MnPrv,fence_MnWw,fence_NA
count,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,...,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0
mean,57.944417,10065.208191,1.404193,3.998537,1.052657,6.11214,5.562165,1971.708435,1984.189176,98.626524,3.085812,2.929303,1.628961,442.084837,47.935641,567.451487,4.158459,1.931253,3.886641,1164.488055,329.329108,5.512921,1499.330083,0.427109,0.063384,1.577279,0.371039,2.843491,1.042906,7.80156,0.590931,1.76353,1976.632374,473.440761,2.803023,2.810336,93.83374,47.556802,22.571916,2.591419,16.511458,2.397855,0.015115,6.219893,2007.776207,181469.701609,0.000488,0.042906,0.005363,0.022428,...,0.002925,0.014627,0.351536,0.127743,0.030717,0.307167,0.010726,0.593857,0.081911,0.42077,0.451487,0.016577,0.002438,0.000975,0.097513,0.299854,0.049732,0.026816,0.089225,0.294003,0.023403,0.011214,0.029254,0.027304,0.039005,0.852755,0.009751,0.002438,0.000975,0.002925,0.591419,0.013164,0.064359,0.005363,0.261336,0.055095,0.019015,0.907362,0.004876,0.00195,0.008289,0.003413,0.002438,0.078011,0.00195,0.868357,0.039005,0.110678,0.004876,0.804973
std,33.137332,6742.488909,0.566514,0.049365,0.244264,1.426271,1.104497,30.177311,21.035007,174.32469,0.372792,0.572009,1.07944,461.195041,164.964052,445.022846,0.964112,0.253085,0.394408,396.446923,425.671046,51.06887,500.447829,0.522589,0.25159,0.549279,0.501043,0.826618,0.20979,0.907485,0.638516,1.807074,26.578653,216.135102,0.721253,0.716094,128.549416,66.747241,59.84511,25.229615,57.374204,37.78257,0.236366,2.744736,1.311911,79258.659352,0.022081,0.202694,0.073055,0.148107,...,0.054021,0.120084,0.477566,0.333884,0.172591,0.461432,0.103037,0.491232,0.274296,0.493803,0.497762,0.127712,0.049326,0.03122,0.296728,0.458305,0.217443,0.161585,0.285137,0.455704,0.151217,0.105327,0.168559,0.163007,0.193655,0.354437,0.09829,0.049326,0.03122,0.054021,0.491691,0.114006,0.245451,0.073055,0.43947,0.228221,0.136611,0.289995,0.069673,0.04413,0.090686,0.058335,0.049326,0.268254,0.04413,0.338185,0.193655,0.313809,0.069673,0.396318
min,0.0,1300.0,1.0,2.0,1.0,1.0,1.0,1872.0,1950.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1872.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,12789.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,43.5,7500.0,1.0,4.0,1.0,5.0,5.0,1953.5,1964.5,0.0,3.0,3.0,1.0,0.0,0.0,220.0,3.0,2.0,4.0,879.5,0.0,0.0,1129.0,0.0,0.0,1.0,0.0,2.0,1.0,8.0,0.0,0.0,1959.0,319.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129825.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,63.0,9430.0,1.0,4.0,1.0,6.0,5.0,1974.0,1993.0,0.0,3.0,3.0,1.0,368.0,0.0,474.0,5.0,2.0,4.0,1093.0,0.0,0.0,1444.0,0.0,0.0,2.0,0.0,3.0,1.0,8.0,1.0,1.0,1978.0,480.0,3.0,3.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,162500.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,78.0,11513.5,2.0,4.0,1.0,7.0,6.0,2001.0,2004.0,159.0,3.0,3.0,2.0,733.5,0.0,811.0,5.0,2.0,4.0,1405.0,692.5,0.0,1728.5,1.0,0.0,2.0,1.0,3.0,1.0,8.0,1.0,4.0,2002.0,576.0,3.0,3.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,313.0,159000.0,4.0,4.0,3.0,10.0,9.0,2010.0,2010.0,1600.0,5.0,5.0,4.0,5644.0,1474.0,2336.0,5.0,2.0,4.0,5095.0,1862.0,1064.0,5642.0,3.0,2.0,4.0,2.0,8.0,3.0,8.0,4.0,5.0,2010.0,1418.0,5.0,5.0,1424.0,547.0,432.0,508.0,490.0,800.0,5.0,12.0,2010.0,611657.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Polynomial data

# Automated Feature Selection

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

features = list(final_ames.columns)
features.remove("saleprice")

X = final_ames[features]
y = final_ames[["saleprice"]]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y)

lr = LinearRegression(normalize=True)
lr.fit(X_scaled, y_scaled)

In [None]:
rfe = RFE(lr, n_features_to_select=50, verbose =3 )
rfe.fit(X_scaled,y_scaled)

In [None]:
cols = list(X.columns)
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

In [26]:
# custom function that removes outliers based on IQR limits

def IQR_rule(dataset, column, off = "both"):
    
    error = 0
    
#    p25 = np.percentile(dataset[column], 25)
#    p75 = np.percentile(dataset[column], 75)
    
#    IQR = p75-p25
    
#    botlim = p25 - IQR*1.5
#    toplim = p75 + IQR*1.5
    
    botlim = np.percentile(dataset[column], 1)
    toplim = np.percentile(dataset[column], 99)
    
    if off == "both":
        outliers = dataset[(dataset[column] < botlim) | (dataset[column] > toplim)]
    elif off == "top":
        outliers = dataset[(dataset[column] > toplim)]
    elif off == "bot":
        outliers = dataset[(dataset[column] < botlim)]
    else:
        error = 1
    
    
    if error != 1:
        
        #mean = np.percentile(dataset[column], 50)
        #outliers.apply(lambda i:mean)
        
        #print(f'for the column {column}, {len(outliers)} rows were replaced')
                            
        dataset.drop(outliers.index, errors='ignore', axis=0, inplace = True)
        print(f'for the column {column}, {len(outliers)} rows were dropped')
    
    else:
        print ("error")

In [None]:
IQR_rule(final_ames, 'saleprice')

In [23]:
#top = abs(final_ames.corr()['saleprice']).sort_values().tail(41).index

top = ['lot_area', 'overall_qual', 'overall_cond', 'year_built',
       'year_remod/add', 'mas_vnr_area', 'bsmt_exposure', 'bsmtfin_sf_1',
       'bsmtfin_sf_2', 'bsmt_unf_sf', 'heating_qc', 'first_flr_sf',
       'second_flr_sf', 'gr_liv_area', 'bsmt_full_bath', 'functional',
       'fireplace_qu', 'garage_area', 'screen_porch', 'ms_subclass_20',
       'ms_subclass_30', 'ms_subclass_45', 'ms_subclass_50', 'ms_subclass_60',
       'ms_subclass_70', 'land_contour_HLS', 'neighborhood_GrnHill',
       'neighborhood_NoRidge', 'neighborhood_NridgHt', 'neighborhood_StoneBr',
       'condition_1_Norm', 'condition_1_PosN', 'bldg_type_Duplex',
       'roof_style_Gable', 'roof_style_Hip', 'roof_matl_CompShg',
       'roof_matl_Membran', 'roof_matl_Tar&Grv', 'roof_matl_WdShake',
       'roof_matl_WdShngl', 'exterior_1st_BrkFace', 'mas_vnr_type_BrkFace',
       'mas_vnr_type_NA', 'mas_vnr_type_None', 'mas_vnr_type_Stone',
       'bsmtfin_type_1_GLQ', 'bsmtfin_type_1_NA', 'bsmtfin_type_2_NA',
       'garage_type_NA', 'sale_type_New', 'saleprice']

#top =

In [24]:
top

['lot_area',
 'overall_qual',
 'overall_cond',
 'year_built',
 'year_remod/add',
 'mas_vnr_area',
 'bsmt_exposure',
 'bsmtfin_sf_1',
 'bsmtfin_sf_2',
 'bsmt_unf_sf',
 'heating_qc',
 'first_flr_sf',
 'second_flr_sf',
 'gr_liv_area',
 'bsmt_full_bath',
 'functional',
 'fireplace_qu',
 'garage_area',
 'screen_porch',
 'ms_subclass_20',
 'ms_subclass_30',
 'ms_subclass_45',
 'ms_subclass_50',
 'ms_subclass_60',
 'ms_subclass_70',
 'land_contour_HLS',
 'neighborhood_GrnHill',
 'neighborhood_NoRidge',
 'neighborhood_NridgHt',
 'neighborhood_StoneBr',
 'condition_1_Norm',
 'condition_1_PosN',
 'bldg_type_Duplex',
 'roof_style_Gable',
 'roof_style_Hip',
 'roof_matl_CompShg',
 'roof_matl_Membran',
 'roof_matl_Tar&Grv',
 'roof_matl_WdShake',
 'roof_matl_WdShngl',
 'exterior_1st_BrkFace',
 'mas_vnr_type_BrkFace',
 'mas_vnr_type_NA',
 'mas_vnr_type_None',
 'mas_vnr_type_Stone',
 'bsmtfin_type_1_GLQ',
 'bsmtfin_type_1_NA',
 'bsmtfin_type_2_NA',
 'garage_type_NA',
 'sale_type_New',
 'saleprice']

In [27]:
for i in top:
    IQR_rule(final_ames, i)


for the column lot_area, 33 rows were dropped
for the column overall_qual, 34 rows were dropped
for the column overall_cond, 7 rows were dropped
for the column year_built, 32 rows were dropped
for the column year_remod/add, 15 rows were dropped
for the column mas_vnr_area, 20 rows were dropped
for the column bsmt_exposure, 0 rows were dropped
for the column bsmtfin_sf_1, 20 rows were dropped
for the column bsmtfin_sf_2, 19 rows were dropped
for the column bsmt_unf_sf, 19 rows were dropped
for the column heating_qc, 2 rows were dropped
for the column first_flr_sf, 36 rows were dropped
for the column second_flr_sf, 19 rows were dropped
for the column gr_liv_area, 36 rows were dropped
for the column bsmt_full_bath, 12 rows were dropped
for the column functional, 0 rows were dropped
for the column fireplace_qu, 12 rows were dropped
for the column garage_area, 18 rows were dropped
for the column screen_porch, 18 rows were dropped
for the column ms_subclass_20, 0 rows were dropped
for the co

In [28]:
final_ames[top].shape

(1624, 51)

In [29]:
final_ames[top].to_csv('datasets/final_ames.csv')

In [None]:
final_ames.isnull().sum().sort_values(ascending = False)