## 1

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [4]:
train = pd.read_csv('AmesHousing.tsv', sep='\t', index_col='Order')

In [5]:
# set threshold ratio - if number of missing values in a column is greater, remove column
remove_na_pct_threshold = .25

### Read Data Descriptions

Need to check the number of missing values and read data descriptions to decide how to transform each column

In [8]:
number_rows_na = train.isna().sum().sort_values(ascending=False)
for index, row in zip(number_rows_na.index, number_rows_na):
    print(index + ' / ', str(row) + ' missing (' + str(round(row/len(train)*100, 0)).replace('.0', '') + '%)')

Pool QC /  2917 missing (100%)
Misc Feature /  2824 missing (96%)
Alley /  2732 missing (93%)
Fence /  2358 missing (80%)
Fireplace Qu /  1422 missing (49%)
Lot Frontage /  490 missing (17%)
Garage Cond /  159 missing (5%)
Garage Finish /  159 missing (5%)
Garage Yr Blt /  159 missing (5%)
Garage Qual /  159 missing (5%)
Garage Type /  157 missing (5%)
Bsmt Exposure /  83 missing (3%)
BsmtFin Type 2 /  81 missing (3%)
Bsmt Qual /  80 missing (3%)
Bsmt Cond /  80 missing (3%)
BsmtFin Type 1 /  80 missing (3%)
Mas Vnr Area /  23 missing (1%)
Mas Vnr Type /  23 missing (1%)
Bsmt Half Bath /  2 missing (0%)
Bsmt Full Bath /  2 missing (0%)
BsmtFin SF 1 /  1 missing (0%)
Garage Cars /  1 missing (0%)
Electrical /  1 missing (0%)
Total Bsmt SF /  1 missing (0%)
Bsmt Unf SF /  1 missing (0%)
BsmtFin SF 2 /  1 missing (0%)
Garage Area /  1 missing (0%)
Sale Condition /  0 missing (0%)
Full Bath /  0 missing (0%)
Half Bath /  0 missing (0%)
Bedroom AbvGr /  0 missing (0%)
Kitchen AbvGr /  0 mis

### Transform Data

Before applying brute force feature selection methods it is a good idea to investigate features more closely and see what they represent.

Removing sales between family members, because a variety of special deals may be involved, the details of which are unknown, as is the combined effect. Luckily, fewer than 50 of almost 3,000 data points involve family sales.

Nothing indicates that missing values in 'Lot Frontage' really represent 0 feet of street connected to property, and this feature has too many missing values, let's remove this column.

In [26]:
def transform_features(dataframe):
    modified_train = dataframe.copy()
    # remove sales between family members
    modified_train = train[train['Sale Type'] != 'Family']
    # NA value in Pool QC indicates no pool, not missing data
    modified_train['Pool QC'] = modified_train['Pool QC'].fillna('No Pool')
    # Misc Feature and Val are too sparsely populated
    modified_train.drop('Misc Feature', axis='columns', inplace=True)
    modified_train.drop('Misc Val', axis='columns', inplace=True)
    # Alley is too sparsely populated
    modified_train.drop('Alley', axis='columns', inplace=True)
    # NA value in Fence indicates no fence, not missing data
    modified_train['Fence'] = modified_train['Fence'].fillna('No Fence')
    # NA value in Fireplace Qu indicates no fireplace, not missing data
    modified_train['Fireplace Qu'] = modified_train['Fireplace Qu'].fillna('No Fireplace')
    # NA values in the four garage columns indicate no garage, not missing data
    modified_train['Garage Cond'] = modified_train['Garage Cond'].fillna('No Garage')
    modified_train['Garage Finish'] = modified_train['Garage Finish'].fillna('No Garage')
    modified_train['Garage Qual'] = modified_train['Garage Qual'].fillna('No Garage')
    modified_train['Garage Type'] = modified_train['Garage Type'].fillna('No Garage')
    # with multiple features describing garage in detail, garage year built does not add a lot of value to the model
    modified_train.drop('Garage Yr Blt', axis='columns', inplace=True)
    # NA values in the five basement columns indicate no basement, not missing data
    modified_train['Bsmt Exposure'] = modified_train['Bsmt Exposure'].fillna('No Basement')
    modified_train['BsmtFin Type 2'] = modified_train['BsmtFin Type 2'].fillna('No Basement')
    modified_train['Bsmt Qual'] = modified_train['Bsmt Qual'].fillna('No Basement')
    modified_train['Bsmt Cond'] = modified_train['Bsmt Cond'].fillna('No Basement')
    modified_train['BsmtFin Type 1'] = modified_train['BsmtFin Type 1'].fillna('No Basement')
    # investigate missing values in Lot Frontage
    modified_train['Lot Frontage'].value_counts(dropna=False)
    modified_train.drop('Lot Frontage', axis='columns', inplace=True)
    # property ID, not useful for modeling
    modified_train.drop('PID', axis='columns', inplace=True)
    # remove remaining columns that have number of missing values greater than threshold specified
    dataframe_len = len(modified_train)
    threshold = dataframe_len * remove_na_pct_threshold
    missing_counts = modified_train.isna().sum()
    keep_cols = missing_counts[missing_counts < threshold].index
    modified_train = modified_train[keep_cols]
    # remove rows with missing values
    modified_train.dropna(inplace=True)
    # convert yr built to age when sold
    modified_train['Age'] = modified_train['Yr Sold'] - modified_train['Year Built']
    # create years since remodel
    modified_train['Years Since Built/Remodel'] = modified_train['Yr Sold'] - modified_train['Year Remod/Add']
    # remove years, month
    modified_train.drop(['Yr Sold', 'Year Built', 'Year Remod/Add', 'Mo Sold'], axis='columns', inplace=True)
    # this feature is not numeric
    modified_train['MS SubClass'] = modified_train['MS SubClass'].astype(str)
    # convert object dtype to categorical, convert to dummies, and delete original categorical features
    string_dtypes = modified_train.dtypes[modified_train.dtypes == 'object'].index
    for col in string_dtypes:
        modified_train[col] = modified_train[col].astype('category')
        dummies = pd.get_dummies(modified_train[col])
        modified_train = pd.concat([modified_train, dummies], axis=1)
        del modified_train[col] 
    
    return modified_train

In [27]:
transformed_train = transform_features(train)

In [31]:
transformed_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2903 entries, 1 to 2930
Columns: 320 entries, Lot Area to Partial
dtypes: float64(9), int64(22), uint8(289)
memory usage: 1.5 MB


In [39]:
dtypes = transformed_train.dtypes
for index, row in zip(dtypes.index, dtypes):
    print(index + ' / ', str(row))

Lot Area /  int64
Overall Qual /  int64
Overall Cond /  int64
Mas Vnr Area /  float64
BsmtFin SF 1 /  float64
BsmtFin SF 2 /  float64
Bsmt Unf SF /  float64
Total Bsmt SF /  float64
1st Flr SF /  int64
2nd Flr SF /  int64
Low Qual Fin SF /  int64
Gr Liv Area /  int64
Bsmt Full Bath /  float64
Bsmt Half Bath /  float64
Full Bath /  int64
Half Bath /  int64
Bedroom AbvGr /  int64
Kitchen AbvGr /  int64
TotRms AbvGrd /  int64
Fireplaces /  int64
Garage Cars /  float64
Garage Area /  float64
Wood Deck SF /  int64
Open Porch SF /  int64
Enclosed Porch /  int64
3Ssn Porch /  int64
Screen Porch /  int64
Pool Area /  int64
SalePrice /  int64
Age /  int64
Years Since Built/Remodel /  int64
120 /  uint8
150 /  uint8
160 /  uint8
180 /  uint8
190 /  uint8
20 /  uint8
30 /  uint8
40 /  uint8
45 /  uint8
50 /  uint8
60 /  uint8
70 /  uint8
75 /  uint8
80 /  uint8
85 /  uint8
90 /  uint8
A (agr) /  uint8
C (all) /  uint8
FV /  uint8
I (all) /  uint8
RH /  uint8
RL /  uint8
RM /  uint8
Grvl /  uint8


In [18]:


def select_features(dataframe):
    return dataframe[['Gr Liv Area', 'SalePrice']]

def train_and_test(dataframe):
    train = dataframe[:1460]
    test = dataframe[1460:]
    linear_model = LinearRegression()
    train_cols = dataframe.columns.drop('SalePrice')
    target_col = 'SalePrice'
    linear_model.fit(dataframe[train_cols], dataframe[target_col])
    predictions = linear_model.predict(test[train_cols])
    return np.sqrt(mean_squared_error(test[target_col], predictions))

## 2

##  3

## 4

## 5

## 6

## 7

## 8

## 9

## 10

## 11

## 12

## 13

## 14

## 15