## 1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [77]:
# Load the Ames housing data (tab-separated) and index rows by the 'Order' column.
train = pd.read_csv(
    'AmesHousing.tsv',
    sep='\t',
    index_col='Order',
)

In [106]:
# Fraction of rows; used as the cut-off when removing columns by missing-value count.
remove_na_pct_threshold = 0.25

## Read Data Descriptions

Check the number of missing values and read the data descriptions to decide how to transform each column.

In [78]:
# Count missing values per column of the raw data, most-affected columns first.
number_rows_na = train.isna().sum().sort_values(ascending=False)
# Percentages are relative to the frame being summarised (`train`), so this cell
# does not depend on a variable that is only created further down the notebook.
total_rows = len(train)
for column_name, n_missing in number_rows_na.items():
    pct_text = str(round(n_missing / total_rows * 100, 0)).replace('.0', '')
    print(column_name + ' / ', str(n_missing) + ' missing (' + pct_text + '%)')

Pool QC /  2917 missing (100%)
Misc Feature /  2824 missing (96%)
Alley /  2732 missing (93%)
Fence /  2358 missing (80%)
Fireplace Qu /  1422 missing (49%)
Lot Frontage /  490 missing (17%)
Garage Cond /  159 missing (5%)
Garage Finish /  159 missing (5%)
Garage Yr Blt /  159 missing (5%)
Garage Qual /  159 missing (5%)
Garage Type /  157 missing (5%)
Bsmt Exposure /  83 missing (3%)
BsmtFin Type 2 /  81 missing (3%)
Bsmt Qual /  80 missing (3%)
Bsmt Cond /  80 missing (3%)
BsmtFin Type 1 /  80 missing (3%)
Mas Vnr Area /  23 missing (1%)
Mas Vnr Type /  23 missing (1%)
Bsmt Half Bath /  2 missing (0%)
Bsmt Full Bath /  2 missing (0%)
BsmtFin SF 1 /  1 missing (0%)
Garage Cars /  1 missing (0%)
Electrical /  1 missing (0%)
Total Bsmt SF /  1 missing (0%)
Bsmt Unf SF /  1 missing (0%)
BsmtFin SF 2 /  1 missing (0%)
Garage Area /  1 missing (0%)
Sale Condition /  0 missing (0%)
Full Bath /  0 missing (0%)
Half Bath /  0 missing (0%)
Bedroom AbvGr /  0 missing (0%)
Kitchen AbvGr /  0 mis

### Manual Transform

Before applying brute force feature selection methods it is a good idea to investigate features more closely and see what they represent.

We remove sales between family members because a variety of special deals may be involved; the details of those deals are unknown, as is their combined effect on the sale price. Luckily, fewer than 50 of almost 3,000 data points involve family sales.

In [79]:
# modified_train['Pool QC'] = modified_train['Pool QC'].fillna('No Pool')

In [80]:
# # NA value in Pool QC indicates no pool, not missing data
# modified_train['Pool QC'] = modified_train['Pool QC'].fillna('No Pool')

In [116]:
# # Misc Feature and Val are too sparsely populated
# modified_train.drop('Misc Feature', axis='columns', inplace=True)
# modified_train.drop('Misc Val', axis='columns', inplace=True)

In [82]:
# # Alley is too sparsely populated
# modified_train.drop('Alley', axis='columns', inplace=True)

In [83]:
# # NA value in Fence indicates no fence, not missing data
# modified_train['Fence'] = modified_train['Fence'].fillna('No Fence')

In [84]:
# # NA value in Fireplace Qu indicates no fireplace, not missing data
# modified_train['Fireplace Qu'] = modified_train['Fireplace Qu'].fillna('No Fireplace')

In [94]:
# # NA values in the four garage columns indicate no garage, not missing data
# modified_train['Garage Cond'] = modified_train['Garage Cond'].fillna('No Garage')
# modified_train['Garage Finish'] = modified_train['Garage Finish'].fillna('No Garage')
# modified_train['Garage Qual'] = modified_train['Garage Qual'].fillna('No Garage')
# modified_train['Garage Type'] = modified_train['Garage Type'].fillna('No Garage')

In [95]:
# # with multiple features describing garage in detail, garage year built does not add a lot of value to the model
# modified_train.drop('Garage Yr Blt', axis='columns', inplace=True)

In [99]:
# # NA values in the five basement columns indicate no basement, not missing data
# modified_train['Bsmt Exposure'] = modified_train['Bsmt Exposure'].fillna('No Basement')
# modified_train['BsmtFin Type 2'] = modified_train['BsmtFin Type 2'].fillna('No Basement')
# modified_train['Bsmt Qual'] = modified_train['Bsmt Qual'].fillna('No Basement')
# modified_train['Bsmt Cond'] = modified_train['Bsmt Cond'].fillna('No Basement')
# modified_train['BsmtFin Type 1'] = modified_train['BsmtFin Type 1'].fillna('No Basement')

In [87]:
# # investigate missing values in Lot Frontage
# modified_train['Lot Frontage'].value_counts(dropna=False)

NaN      490
60.0     276
80.0     137
70.0     133
50.0     117
        ... 
195.0      1
137.0      1
135.0      1
22.0       1
153.0      1
Name: Lot Frontage, Length: 129, dtype: int64

Nothing indicates that missing values really represent 0 feet of street connected to the property, and this feature has too many missing values, so let's remove this column.

In [92]:
# modified_train.drop('Lot Frontage', axis='columns', inplace=True)

In [102]:
# # property ID, not useful for modeling
# modified_train.drop('PID', axis='columns', inplace=True)

In [108]:
# dataframe_len = len(modified_train)
# threshold = dataframe_len * remove_na_pct_threshold
# missing_counts = modified_train.isna().sum()
# keep_cols = missing_counts[missing_counts < threshold].index
# modified_train = modified_train[keep_cols]

In [110]:
# # remove rows with missing values
# modified_train.dropna(inplace=True)

In [117]:
# Re-run the missing-value report on the transformed frame, worst columns first.
number_rows_na_after_transform = modified_train.isna().sum().sort_values(ascending=False)
transformed_row_count = len(modified_train)
for column_name, n_missing in number_rows_na_after_transform.items():
    pct_text = str(round(n_missing / transformed_row_count * 100, 0)).replace('.0', '')
    print(column_name + ' / ', f'{n_missing} missing ({pct_text}%)')

MS SubClass /  0 missing (0%)
Garage Finish /  0 missing (0%)
Fireplace Qu /  0 missing (0%)
Fireplaces /  0 missing (0%)
Functional /  0 missing (0%)
TotRms AbvGrd /  0 missing (0%)
Kitchen Qual /  0 missing (0%)
Kitchen AbvGr /  0 missing (0%)
Bedroom AbvGr /  0 missing (0%)
Half Bath /  0 missing (0%)
Full Bath /  0 missing (0%)
Bsmt Half Bath /  0 missing (0%)
Bsmt Full Bath /  0 missing (0%)
Gr Liv Area /  0 missing (0%)
Low Qual Fin SF /  0 missing (0%)
2nd Flr SF /  0 missing (0%)
1st Flr SF /  0 missing (0%)
Garage Type /  0 missing (0%)
Garage Cars /  0 missing (0%)
Central Air /  0 missing (0%)
Garage Area /  0 missing (0%)
Sale Condition /  0 missing (0%)
Sale Type /  0 missing (0%)
Yr Sold /  0 missing (0%)
Mo Sold /  0 missing (0%)
Fence /  0 missing (0%)
Pool QC /  0 missing (0%)
Pool Area /  0 missing (0%)
Screen Porch /  0 missing (0%)
3Ssn Porch /  0 missing (0%)
Enclosed Porch /  0 missing (0%)
Open Porch SF /  0 missing (0%)
Wood Deck SF /  0 missing (0%)
Paved Drive

In [120]:
# Express the calendar-year columns relative to the year of sale.
year_sold = modified_train['Yr Sold']
# age of the house when it was sold
modified_train['Age'] = year_sold - modified_train['Year Built']
# years between the recorded remodel/addition year and the sale
modified_train['Years Since Built/Remodel'] = year_sold - modified_train['Year Remod/Add']
# the raw year and month columns are removed once the derived columns exist
raw_date_columns = ['Yr Sold', 'Year Built', 'Year Remod/Add', 'Mo Sold']
modified_train.drop(columns=raw_date_columns, inplace=True)

In [121]:
# Check data types: info() lists every column with its non-null count and dtype.
modified_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2903 entries, 1 to 2930
Data columns (total 73 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   MS SubClass                2903 non-null   category
 1   MS Zoning                  2903 non-null   object  
 2   Lot Area                   2903 non-null   int64   
 3   Street                     2903 non-null   object  
 4   Lot Shape                  2903 non-null   object  
 5   Land Contour               2903 non-null   object  
 6   Utilities                  2903 non-null   object  
 7   Lot Config                 2903 non-null   object  
 8   Land Slope                 2903 non-null   object  
 9   Neighborhood               2903 non-null   object  
 10  Condition 1                2903 non-null   object  
 11  Condition 2                2903 non-null   object  
 12  Bldg Type                  2903 non-null   object  
 13  House Style                2903 n

In [115]:
# MS SubClass is stored as integer codes, but the feature is categorical, not
# numeric, so it is converted to the pandas 'category' dtype.
modified_train['MS SubClass'] = modified_train['MS SubClass'].astype('category')

In [32]:
# Preview the first rows of the raw data, a few columns at a time so that the
# printed table is not truncated. The range is derived from the actual number of
# columns so the final columns are included however many there are.
columns_per_chunk = 5
for chunk_start in range(0, len(train.columns), columns_per_chunk):
    chunk_columns = train.columns[chunk_start:chunk_start + columns_per_chunk]
    print(train[chunk_columns].head())

             PID  MS SubClass MS Zoning  Lot Frontage  Lot Area
Order                                                          
1      526301100           20        RL         141.0     31770
2      526350040           20        RH          80.0     11622
3      526351010           20        RL          81.0     14267
4      526353030           20        RL          93.0     11160
5      527105010           60        RL          74.0     13830
      Street Alley Lot Shape Land Contour Utilities
Order                                              
1       Pave   NaN       IR1          Lvl    AllPub
2       Pave   NaN       Reg          Lvl    AllPub
3       Pave   NaN       IR1          Lvl    AllPub
4       Pave   NaN       Reg          Lvl    AllPub
5       Pave   NaN       IR1          Lvl    AllPub
      Lot Config Land Slope Neighborhood Condition 1 Condition 2
Order                                                           
1         Corner        Gtl        NAmes        Norm      

In [17]:
def transform_features(dataframe, na_pct_threshold=0.25):
    """Clean the raw housing frame and return a new, NaN-free frame.

    Steps, in order:
      1. Remove sales between family members.
      2. Replace NaN with an explicit "No ..." label in the categorical columns
         where NaN means the feature is absent, not that data is missing.
      3. Drop columns that are too sparse or not useful for modelling.
      4. Drop any remaining column whose missing-value count is not below
         ``na_pct_threshold`` times the number of rows.
      5. Drop the rows that still contain missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw data. It is not modified; all work is done on a copy.
    na_pct_threshold : float, default 0.25
        Fraction of rows used as the column-removal cut-off in step 4. The
        default equals the notebook's ``remove_na_pct_threshold`` value; pass
        that variable explicitly to use a different cut-off.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    KeyError
        If 'Sale Condition' or any of the columns to drop is absent.
    """
    # Categorical columns in which NaN encodes "feature not present".
    absent_feature_labels = {
        'Pool QC': 'No Pool',
        'Fence': 'No Fence',
        'Fireplace Qu': 'No Fireplace',
        'Garage Cond': 'No Garage',
        'Garage Finish': 'No Garage',
        'Garage Qual': 'No Garage',
        'Garage Type': 'No Garage',
        'Bsmt Exposure': 'No Basement',
        'BsmtFin Type 2': 'No Basement',
        'Bsmt Qual': 'No Basement',
        'Bsmt Cond': 'No Basement',
        'BsmtFin Type 1': 'No Basement',
    }
    columns_to_drop = [
        'Misc Feature',   # too sparsely populated
        'Misc Val',       # too sparsely populated
        'Alley',          # too sparsely populated
        'Garage Yr Blt',  # other garage features already describe the garage
        'Lot Frontage',   # many missing values with no sign that NaN means 0 feet
        'PID',            # property ID, not useful for modelling
    ]

    modified_train = dataframe.copy()

    # Family sales may involve special deals. In the Ames data they are recorded
    # as the 'Family' value of 'Sale Condition' ('Sale Type' holds deed/contract
    # codes), so the filter is applied to 'Sale Condition'.
    modified_train = modified_train[modified_train['Sale Condition'] != 'Family']

    # A dict passed to fillna fills each listed column with its own label and
    # leaves all other columns untouched.
    modified_train = modified_train.fillna(value=absent_feature_labels)
    modified_train = modified_train.drop(columns=columns_to_drop)

    # Remove remaining columns whose missing-value count reaches the threshold.
    threshold = len(modified_train) * na_pct_threshold
    missing_counts = modified_train.isna().sum()
    keep_cols = missing_counts[missing_counts < threshold].index

    # Remove rows with missing values; dropna() returns a new frame, so nothing
    # is mutated in place on a slice.
    modified_train = modified_train[keep_cols].dropna()
    return modified_train

In [18]:


def select_features(dataframe):
    """Return only the 'Gr Liv Area' and 'SalePrice' columns of ``dataframe``."""
    selected_columns = ['Gr Liv Area', 'SalePrice']
    return dataframe[selected_columns]

def train_and_test(dataframe, train_size=1460):
    """Fit a linear regression on the leading rows and return the hold-out RMSE.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'SalePrice' column; every other column is used as a
        feature and is passed to the model as-is.
    train_size : int, default 1460
        Number of leading rows used for fitting. The remaining rows form the
        test set.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test rows.
    """
    target_col = 'SalePrice'
    train_cols = dataframe.columns.drop(target_col)

    train = dataframe[:train_size]
    test = dataframe[train_size:]

    linear_model = LinearRegression()
    # Fit on the training rows only. Fitting on the whole frame would let the
    # model see the test rows and make the reported error look better than it is.
    linear_model.fit(train[train_cols], train[target_col])

    predictions = linear_model.predict(test[train_cols])
    return np.sqrt(mean_squared_error(test[target_col], predictions))

## 2

In [21]:
# Copy of the raw data without the 'PID' column; the bare name displays it.
tst = train.drop(columns='PID')
tst

Unnamed: 0_level_0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,5,2010,WD,Normal,215000
2,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
3,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
4,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,4,2010,WD,Normal,244000
5,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2926,80,RL,37.0,7937,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2927,20,RL,,8885,Pave,,IR1,Low,AllPub,Inside,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2928,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2929,20,RL,77.0,10010,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2006,WD,Normal,170000


##  3

## 4

## 5

## 6

## 7

## 8

## 9

## 10

## 11

## 12

## 13

## 14

## 15