In [439]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [440]:
# Load the raw Ames housing data; the file is tab-separated.
data = pd.read_csv('AmesHousing.tsv', delimiter='\t')

In [441]:
# bare bones feature engineering & training
def transform_features(df):
    """Baseline cleaning step: return an independent, unmodified copy of ``df``."""
    untouched = df.copy()
    return untouched

def select_features(df):
    """Baseline feature selection: keep one predictor plus the target column."""
    baseline_cols = ["Gr Liv Area", "SalePrice"]
    return df[baseline_cols]

def train_and_test(df):
    """Fit a linear regression on the first 1460 rows and score it on the rest.

    Only integer and float columns are used. 'SalePrice' is the target and
    every other numeric column is a feature.

    Returns the root mean squared error on the holdout rows.
    """
    target_col = 'SalePrice'
    numeric = df.select_dtypes(include=['integer', 'float'])

    # Positional split: first 1460 rows train, everything after tests.
    train, test = numeric.iloc[:1460, :], numeric.iloc[1460:, :]
    feature_cols = train.columns.drop(target_col)

    model = LinearRegression()
    model.fit(train[feature_cols], train[target_col])
    predictions = model.predict(test[feature_cols])

    return np.sqrt(mean_squared_error(test[target_col], predictions))

# Baseline run: push the raw data through the placeholder pipeline
# (copy -> two columns -> holdout fit) to get a reference RMSE.
clean_df = transform_features(data)
filtered_df = select_features(clean_df)
rmse = train_and_test(filtered_df)

rmse

57088.25161263909

# Feature Engineering

* for numerical cols with **less than 5% missing**: **impute** the column's **mode**
* drop any other columns with missing values

In [442]:
# Numerical columns that are missing some, but fewer than 5%, of their
# values get the column's most frequent value (mode) filled in.
num_data = data.select_dtypes(include=['integer', 'float'])
missing_cnt = num_data.isna().sum()
impute_thresh = len(num_data) // 20        # 5% of the row count

has_gaps = missing_cnt > 0
under_thresh = missing_cnt < impute_thresh
to_impute = missing_cnt[under_thresh & has_gaps].index.tolist()

modes = num_data.mode()
for c in to_impute:
    mode = modes.at[0, c]                  # row 0 holds each column's first mode
    print('imputed ',mode, ' on null values for column:', c)
    data[c] = data[c].fillna(mode)

imputed  0.0  on null values for column: Mas Vnr Area
imputed  0.0  on null values for column: BsmtFin SF 1
imputed  0.0  on null values for column: BsmtFin SF 2
imputed  0.0  on null values for column: Bsmt Unf SF
imputed  0.0  on null values for column: Total Bsmt SF
imputed  0.0  on null values for column: Bsmt Full Bath
imputed  0.0  on null values for column: Bsmt Half Bath
imputed  2.0  on null values for column: Garage Cars
imputed  0.0  on null values for column: Garage Area


In [443]:
# Any column that still has gaps after imputation is removed outright.
missing_cnt = data.isna().sum()
to_drop = missing_cnt.index[missing_cnt > 0].tolist()
data = data.drop(to_drop, axis=1)
print('cols dropped: ', to_drop)

cols dropped:  ['Lot Frontage', 'Alley', 'Mas Vnr Type', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Electrical', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence', 'Misc Feature']


# Create new features from unusable data

In [444]:
years_sold = data['Yr Sold'].sub(data['Year Built'])
years_sold.loc[years_sold.lt(0)]        # bad data: sold before it was built

2180   -1
dtype: int64

In [445]:
years_since_remod = data['Yr Sold'].sub(data['Year Remod/Add'])
years_since_remod.loc[years_since_remod.lt(0)]        # bad data: sold before the remodel

1702   -1
2180   -2
2181   -1
dtype: int64

In [446]:
# Drop every row where either derived feature is negative (a sale dated
# before the build or remodel year is a data-entry error). The row labels
# are computed from the two series above rather than hard-coded, and
# errors='ignore' keeps the cell safe to re-run once the rows are gone.
bad_rows = years_sold.index[(years_sold < 0) | (years_since_remod < 0)]
data = data.drop(index=bad_rows, errors='ignore')

In [447]:
# Attach the two derived age features (aligned on the row index), then
# discard the original year columns they replace.
derived = {
    'Years Before Sale': years_sold,
    'Years Since Remod': years_since_remod,
}
data = data.assign(**derived).drop(columns=["Year Built", "Year Remod/Add"])

# Drop columns that:
* aren't useful for ML
* leak data about the final sale

Read more about the columns in the [column description file](https://s3.amazonaws.com/dq-content/307/data_description.txt).

In [448]:
# Identifier columns carry no signal; the sale-related columns leak
# information about the transaction being predicted.
not_useful = ["PID", "Order"]
leaks_sale_info = ["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"]
data = data.drop(columns=not_useful + leaks_sale_info)

In [449]:
def transform_features(data):
    """Clean the raw housing frame and derive age features.

    Steps, in order:
      1. Numeric columns missing some, but fewer than 5%, of their values
         are filled with the column's mode.
      2. 'Years Before Sale' and 'Years Since Remod' are derived from the
         year columns.
      3. Rows where either derived value is negative are dropped.
      4. Every column that still has missing values is dropped.
      5. The original year columns, the identifier columns and the columns
         that leak information about the sale are dropped.

    The input frame is left untouched; a new DataFrame is returned.
    """
    # Work on a copy so the caller's frame is never modified in place.
    data = data.copy()

    # -- 1. impute sparse gaps in numeric columns -------------------------
    num_data = data.select_dtypes(include=['integer', 'float'])
    impute_thresh = len(num_data) // 20
    missing_cnt = num_data.isna().sum()
    to_impute = list(missing_cnt[(missing_cnt < impute_thresh) & (missing_cnt > 0)].index)

    if to_impute:
        # Row 0 of .mode() holds the first mode of each column.
        modes = num_data[to_impute].mode().iloc[0]
        data = data.fillna(modes.to_dict())

    # -- 2. derive usable features from the raw year columns --------------
    data['Years Before Sale'] = data['Yr Sold'] - data['Year Built']
    data['Years Since Remod'] = data['Yr Sold'] - data['Year Remod/Add']

    # -- 3. drop rows with impossible (negative) ages ---------------------
    # Computed from the data instead of hard-coded row labels, so the
    # function works on any frame with these columns.
    negative_age = (data['Years Before Sale'] < 0) | (data['Years Since Remod'] < 0)
    data = data.loc[~negative_age]

    # -- 4. drop any column that still has missing values -----------------
    missing_cnt = data.isna().sum()
    data = data.drop(columns=list(missing_cnt[missing_cnt > 0].index))

    # -- 5. drop reworked, useless and leaking columns --------------------
    reworked = ["Year Built", "Year Remod/Add"]
    useless = ["PID", "Order"]
    leaks_result = ["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"]
    to_drop = reworked + useless + leaks_result

    data = data.drop(columns=to_drop)
    return data

In [450]:
# Test the new cleaned/reworked model: reload the raw file so the run
# starts from unmodified data, then clean, select and score.
data = pd.read_csv("AmesHousing.tsv", delimiter="\t")
clean_df = transform_features(data)
filtered_df = select_features(clean_df)
rmse = train_and_test(filtered_df)

rmse

55275.36731241307

In [451]:
import seaborn as sns
%matplotlib inline

# test = np.linspace(1,5,12).reshape(4,3) # create numpy 2D array
# find the features that correlate the most 
corr_matrix = clean_df.corr()
hi_corr = abs(corr_matrix['SalePrice']).sort_values(ascending=True)
print(hi_corr)
# sns.heatmap(hi_corr)

BsmtFin SF 2         0.006127
Misc Val             0.019273
3Ssn Porch           0.032268
Bsmt Half Bath       0.035875
Low Qual Fin SF      0.037629
Pool Area            0.068438
MS SubClass          0.085128
Overall Cond         0.101540
Screen Porch         0.112280
Kitchen AbvGr        0.119760
Enclosed Porch       0.128685
Bedroom AbvGr        0.143916
Bsmt Unf SF          0.182751
Lot Area             0.267520
2nd Flr SF           0.269601
Bsmt Full Bath       0.276258
Half Bath            0.284871
Open Porch SF        0.316262
Wood Deck SF         0.328183
BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice            1.000000
Name: Sale

In [452]:
# The ranking shows a visible drop-off around 0.4, so treat that as the
# cutoff and list the columns whose correlation with the target exceeds it.
strong = hi_corr[hi_corr > 0.4]
print(strong.index.tolist())

['BsmtFin SF 1', 'Fireplaces', 'TotRms AbvGrd', 'Mas Vnr Area', 'Years Since Remod', 'Full Bath', 'Years Before Sale', '1st Flr SF', 'Garage Area', 'Total Bsmt SF', 'Garage Cars', 'Gr Liv Area', 'Overall Qual', 'SalePrice']


In [453]:
# Remove every numeric column whose correlation with the target is below 0.4.
weak_mask = hi_corr < 0.4
low_corr = hi_corr.index[weak_mask].tolist()
clean_df = clean_df.drop(low_corr, axis=1)

In [454]:
# Columns the data documentation describes as nominal (categorical),
# even where the values are stored as numbers.
nominal_features = [
    "PID",
    "MS SubClass",
    "MS Zoning",
    "Street",
    "Alley",
    "Land Contour",
    "Lot Config",
    "Neighborhood",
    "Condition 1",
    "Condition 2",
    "Bldg Type",
    "House Style",
    "Roof Style",
    "Roof Matl",
    "Exterior 1st",
    "Exterior 2nd",
    "Mas Vnr Type",
    "Foundation",
    "Heating",
    "Central Air",
    "Garage Type",
    "Misc Feature",
    "Sale Type",
    "Sale Condition",
]

In [455]:
# Inspect the remaining columns, their dtypes and non-null counts.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2927 entries, 0 to 2929
Data columns (total 39 columns):
MS Zoning            2927 non-null object
Street               2927 non-null object
Lot Shape            2927 non-null object
Land Contour         2927 non-null object
Utilities            2927 non-null object
Lot Config           2927 non-null object
Land Slope           2927 non-null object
Neighborhood         2927 non-null object
Condition 1          2927 non-null object
Condition 2          2927 non-null object
Bldg Type            2927 non-null object
House Style          2927 non-null object
Overall Qual         2927 non-null int64
Roof Style           2927 non-null object
Roof Matl            2927 non-null object
Exterior 1st         2927 non-null object
Exterior 2nd         2927 non-null object
Mas Vnr Area         2927 non-null float64
Exter Qual           2927 non-null object
Exter Cond           2927 non-null object
Foundation           2927 non-null object
BsmtFin SF 

In [456]:
# num_data = df.select_dtypes(include=['integer', 'float'])

In [457]:
# Column names the documentation describes as nominal (categorical),
# even where the values are stored as numbers.
nominal_features = [
    "PID",
    "MS SubClass",
    "MS Zoning",
    "Street",
    "Alley",
    "Land Contour",
    "Lot Config",
    "Neighborhood",
    "Condition 1",
    "Condition 2",
    "Bldg Type",
    "House Style",
    "Roof Style",
    "Roof Matl",
    "Exterior 1st",
    "Exterior 2nd",
    "Mas Vnr Type",
    "Foundation",
    "Heating",
    "Central Air",
    "Garage Type",
    "Misc Feature",
    "Sale Type",
    "Sale Condition",
]

In [458]:
## Categorical columns that survived the cleaning steps; these are the
## ones tested below.
surviving_cols = set(clean_df.columns)
my_cat_intersection = list(surviving_cols.intersection(nominal_features))

In [459]:
## Count the distinct values in each remaining categorical column.
uniqueness_counts = clean_df[my_cat_intersection].nunique()

## Arbitrary cutoff of 10 unique values (worth experimenting with)
too_many_levels = uniqueness_counts > 10
drop_nonuniq_cols = uniqueness_counts.index[too_many_levels]

clean_df = clean_df.drop(drop_nonuniq_cols, axis=1)

In [460]:
# Replace each remaining text (object) column with dummy indicator
# columns; pd.get_dummies leaves the numeric columns as they are.
clean_df = pd.get_dummies(clean_df)

In [461]:
# Update select_features()
def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    """Select model features by correlation and categorical cardinality.

    Parameters
    ----------
    df : DataFrame
        Cleaned data containing a 'SalePrice' column.
    coeff_threshold : float, default 0.4
        Numeric columns whose absolute correlation with 'SalePrice' is
        below this value are dropped.
    uniq_threshold : int, default 10
        Nominal columns with more than this many distinct values are
        dropped before dummy encoding.

    Returns
    -------
    DataFrame
        The surviving numeric columns plus dummy columns for the remaining
        text columns.
    """
    # Correlate numeric columns only: DataFrame.corr() raises on text
    # columns in pandas >= 2.0.
    numeric_df = df.select_dtypes(include=['integer', 'float'])
    abs_corr = numeric_df.corr()['SalePrice'].abs()
    low_corr = list(abs_corr[abs_corr < coeff_threshold].index)
    df = df.drop(columns=low_corr)

    # list of columns from documentation *meant* to be categorical
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood",
                        "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st",
                        "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type",
                        "Misc Feature", "Sale Type", "Sale Condition"]

    # Which categorical columns are still present in *this* frame?
    cat_intersection = [col for col in nominal_features if col in df.columns]

    # Drop categoricals with too many levels. This uses the function's own
    # frame and the uniq_threshold argument, not notebook-level globals.
    uniqueness_counts = df[cat_intersection].nunique()
    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index
    df = df.drop(columns=drop_nonuniq_cols)

    # Replace the remaining text columns with dummy indicator columns.
    # A second correlation pass here would be a no-op: the numeric columns
    # already passed the filter and the dummies are not int/float.
    df = pd.get_dummies(df)

    return df

In [478]:
# update train_and_test
def train_and_test(df, k=0):
    """Train a linear regression on ``df`` and return its RMSE.

    Parameters
    ----------
    df : DataFrame
        Feature frame containing the 'SalePrice' target.
    k : int, default 0
        0 or 1 -> simple holdout: fit on the first 1460 rows, score on the
                  rest, return that RMSE.
        > 1    -> k-fold cross validation (shuffled, random_state=1); the
                  per-fold RMSEs are printed and their mean is returned.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be a non-negative integer")

    # 'bool' is included so dummy columns are kept as features:
    # pd.get_dummies yields bool columns in pandas >= 2.0 (uint8 before).
    num_data = df.select_dtypes(include=['integer', 'float', 'bool'])
    target_col = 'SalePrice'
    feature_cols = num_data.columns.drop(target_col)
    lr = LinearRegression()

    # The default k=0 used to match no branch and return None; it now runs
    # the same holdout validation as k=1.
    if k <= 1:
        train = num_data.iloc[:1460, :]
        test = num_data.iloc[1460:, :]

        lr.fit(train[feature_cols], train[target_col])
        predictions = lr.predict(test[feature_cols])

        mse = mean_squared_error(test[target_col], predictions)
        return np.sqrt(mse)

    kf = KFold(n_splits=k, shuffle=True, random_state=1)
    # The scorer returns negated MSE values; take the magnitude before the root.
    mses = cross_val_score(estimator=lr, X=num_data[feature_cols], y=num_data[target_col],
                           scoring='neg_mean_squared_error', cv=kf)
    rmses = np.sqrt(np.absolute(mses))
    print(rmses)

    return np.mean(rmses)

In [463]:
# ANSWER
def transform_features(df):
    """Clean the raw housing frame and derive age features.

    Steps, in order:
      1. Drop columns missing more than 5% of their values.
      2. Drop non-numeric columns that have any missing value.
      3. Fill the remaining numeric gaps (at most 5% missing) with the
         column's mode.
      4. Derive 'Years Before Sale' and 'Years Since Remod', and drop rows
         where either is negative.
      5. Drop identifier columns, the original year columns and the columns
         that leak information about the sale.

    The input frame is not modified; a new DataFrame is returned.
    """
    five_pct = len(df) / 20

    # 1. columns that are too sparse to repair
    num_missing = df.isnull().sum()
    df = df.drop(num_missing[num_missing > five_pct].index, axis=1)

    # 2. non-numeric columns with any gap
    text_missing = df.select_dtypes(exclude=['number']).isnull().sum()
    df = df.drop(text_missing[text_missing > 0].index, axis=1)

    # 3. numeric columns with a few gaps -> impute the mode.
    #    '<=' closes the gap at exactly 5% missing, which step 1 does not
    #    drop. The empty case is skipped: .mode() of no columns has no row.
    num_missing = df.select_dtypes(include=['integer', 'float']).isnull().sum()
    fixable = num_missing[(num_missing <= five_pct) & (num_missing > 0)].index
    if len(fixable) > 0:
        replacement_values = df[fixable].mode().iloc[0].to_dict()
        df = df.fillna(replacement_values)

    # 4. derived age features; negative ages are data-entry errors. The
    #    rows are found from the data rather than by hard-coded labels.
    df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']
    df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']
    negative_age = (df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)
    df = df.loc[~negative_age]

    # 5. identifiers, reworked year columns and sale-leaking columns
    #    ('Yr Sold' is only needed for the derived features above).
    df = df.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type",
                  "Yr Sold", "Year Built", "Year Remod/Add"], axis=1)
    return df

def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    """Select model features by correlation and categorical cardinality.

    Parameters
    ----------
    df : DataFrame
        Cleaned data containing a 'SalePrice' column.
    coeff_threshold : float, default 0.4
        Numeric columns whose absolute correlation with 'SalePrice' is
        below this value are dropped.
    uniq_threshold : int, default 10
        Nominal columns with more than this many distinct values are
        dropped before dummy encoding.

    Returns
    -------
    DataFrame
        The surviving numeric columns followed by dummy columns for the
        remaining text columns.
    """
    numerical_df = df.select_dtypes(include=['integer', 'float'])
    abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs()
    df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)

    # Columns the documentation describes as nominal (categorical).
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood",
                        "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st",
                        "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type",
                        "Misc Feature", "Sale Type", "Sale Condition"]

    transform_cat_cols = [col for col in nominal_features if col in df.columns]

    # Honour the uniq_threshold argument (it used to be ignored in favour
    # of a hard-coded 10).
    uniqueness_counts = df[transform_cat_cols].nunique()
    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index
    df = df.drop(drop_nonuniq_cols, axis=1)

    # get_dummies keeps the numeric columns and replaces each text column
    # with its indicator columns, appended after the numeric ones.
    return pd.get_dummies(df)

In [464]:
# ANSWER
def train_and_test2(df, k=0, random_state=None):
    """Train a linear regression on ``df`` and return its RMSE.

    Parameters
    ----------
    df : DataFrame
        Feature frame containing the 'SalePrice' target.
    k : int, default 0
        0   -> holdout: fit on the first 1460 rows, score on the rest.
        1   -> shuffle the rows, split at row 1460, fit and score in both
               directions and return the mean of the two RMSEs (both are
               printed).
        > 1 -> shuffled k-fold cross validation; the per-fold RMSEs are
               printed and their mean is returned.
    random_state : int or None, default None
        Seed for the shuffles (k == 1 and k > 1). None keeps the previous
        unseeded behaviour; pass an int for reproducible results.
    """
    # 'bool' is included so dummy columns are kept as features:
    # pd.get_dummies yields bool columns in pandas >= 2.0 (uint8 before).
    numeric_df = df.select_dtypes(include=['integer', 'float', 'bool'])
    features = numeric_df.columns.drop("SalePrice")
    lr = LinearRegression()

    def _rmse(train, test):
        # Fit on ``train`` and return the RMSE of the predictions on ``test``.
        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        return np.sqrt(mean_squared_error(test["SalePrice"], predictions))

    if k == 0:
        return _rmse(df[:1460], df[1460:])

    if k == 1:
        # Randomize *all* rows (frac=1) and split the shuffled frame. The
        # split used to be taken from the unshuffled ``df`` by mistake.
        shuffled_df = df.sample(frac=1, random_state=random_state)
        first = shuffled_df[:1460]
        second = shuffled_df[1460:]

        rmse_one = _rmse(first, second)
        rmse_two = _rmse(second, first)

        avg_rmse = np.mean([rmse_one, rmse_two])
        print(rmse_one)
        print(rmse_two)
        return avg_rmse

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_values = [
        _rmse(df.iloc[train_index], df.iloc[test_index])
        for train_index, test_index in kf.split(df)
    ]
    print(rmse_values)
    return np.mean(rmse_values)


In [479]:
# Rebuild the feature frame from the raw file with the final pipeline.
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
tf = transform_features(df)
filtered_df = select_features(tf)

# Score both evaluation helpers with 4 folds, first on the freshly built
# frame and then on the step-by-step `clean_df` from the earlier cells.
for frame in (filtered_df, clean_df):
    for evaluate in (train_and_test2, train_and_test):
        rmse = evaluate(frame, k=4)
        print(rmse)

[26885.064689311057, 26062.376114660387, 35262.29975501161, 27788.565675199163]
28999.576558545556
[36756.52485284 25652.0636658  25571.38314607 28465.76822678]
29111.4349728706
[25444.793633697584, 28452.993070891287, 35372.62847431943, 26316.814758464134]
28896.80748434311
[36756.52485284 25652.0636658  25571.38314607 28465.76822678]
29111.4349728706
