In [1]:
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor


df = pd.read_csv('AmesHousing.tsv',sep='\t')
df = df.iloc[np.random.permutation(len(df))]
df = df.reset_index(drop=True)

In [98]:
def transform_features(df):
    num_missing = df.isnull().sum()
    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()
    df = df.drop(drop_missing_cols.index, axis=1)
    # Dropping columns with percentage of null
    
    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]
    df = df.drop(drop_missing_cols_2.index, axis=1)
    # Dropping columns with str dtype
    num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()
    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
    replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
    df = df.fillna(replacement_values_dict)
    # I need to understand this better
    years_sold = df['Yr Sold'] - df['Year Built']
    years_since_remod = df['Yr Sold'] - df['Year Remod/Add']
    df['Years Before Sale'] = years_sold
    df['Years Since Remod'] = years_since_remod
    df = df.drop([1702, 2180, 2181], axis=0)
    # Creating and Renaming new columns, dropping old columns

    df = df.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"], axis=1)
    #dropping data leakage or irrelevant column names
    return df

def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    numerical_df = df.select_dtypes(include=['int', 'float'])
    abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()
    df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)
    
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]
    
    transform_cat_cols = []
    for col in nominal_features:
        if col in df.columns:
            transform_cat_cols.append(col)

    uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index
    df = df.drop(drop_nonuniq_cols, axis=1)
    
    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols:
        df[col] = df[col].astype('category')
    df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols,axis=1)
    
    return df

def train_and_test(df,k=0):
    df = df.select_dtypes(include=['integer', 'float'])
    features = df.columns.drop("SalePrice")
    if k == 0:
        train = df[:1460]
        test = df[1460:]

        ## You can use `pd.DataFrame.select_dtypes()` to specify column types
        ## and return only those columns as a data frame.
        ## You can use `pd.Series.drop()` to drop a value.
        lr = linear_model.LinearRegression()
        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test["SalePrice"], predictions)
        rmse = np.sqrt(mse)

        return rmse
    elif k == 1:
        fold_one = df[:1460]
        fold_two = df[1460:]
        ## You can use `pd.Series.drop()` to drop a value.
        model = KNeighborsRegressor()
        
        test_one = model.fit(fold_one[features], fold_one["SalePrice"])
        fold_two_prediction = test_one.predict(fold_two[features])
         
        simple_cross_rmse_one = mean_squared_error(fold_two["SalePrice"],fold_two_prediction)**(1/2)
        test_two = model.fit(fold_two[features], fold_two["SalePrice"])
        
        fold_one_prediction = test_two.predict(fold_one[features])
        simple_cross_rmse_two = mean_squared_error(fold_one["SalePrice"],fold_one_prediction)
        
        return simple_cross_rmse_one,simple_cross_rmse_one                      
    else:
        knn = KNeighborsRegressor()
        kf = KFold(n_splits=k)
        mses = cross_val_score(estimator=knn, X=df[features],y=df['SalePrice'], scoring='mean_squared_error',cv=kf)
        return mses
       
    
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df,k=4)
rmse


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.


Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.



[-1.36820245e+09 -1.48274187e+09 -1.37677072e+09 -1.51688714e+09]



Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.

