In [65]:
import os
import re

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split, KFold

from src.utils.DataProcessing import DataProcessing

In [77]:

# Location of the Ames housing .tsv file. The AMES_HOUSING_PATH environment
# variable overrides the default, so the notebook can run on machines other
# than the one whose home directory is baked into the fallback path.
PATH = os.environ.get(
    "AMES_HOUSING_PATH",
    r"/Users/michaelschaid/GitHub/house_prediction/data/AmesHousing.tsv",
)

# Build the modelling dataset with the project's DataProcessing pipeline:
# load the file, clean it, one-hot encode 'neighborhood', then filter features.
DATA = (
    DataProcessing(data_path=PATH, target='saleprice')
    .load_data()
    .clean_data()
    .get_dummies(cols='neighborhood')
    # NOTE(review): the thresholds presumably bound each feature's correlation
    # with the target -- confirm against DataProcessing.filter_features.
    .filter_features(upper_threshold=0.3, lower_threshold=-0.2)
)

def train_regr(data, features, target, test_size=0.1, random_state=None):
    """Fit a linear regression on a random train split and return the hold-out RMSE.

    Parameters
    ----------
    data : table indexable by column label (e.g. a pandas DataFrame)
        Holds both the feature columns and the target column.
    features : list of column labels
        Columns of ``data`` used as model inputs.
    target : column label
        Column of ``data`` the model predicts.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``; the default holds out 10% of rows.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` draws a different split on
        every call; pass an int to make the split, and therefore the returned
        score, reproducible.

    Returns
    -------
    float
        Root-mean-squared error of the predictions on the held-out rows.
    """
    # A single random split: the score depends on which rows land in the test
    # set, so fix random_state when comparing runs.
    data_train, data_test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # Fit on the training rows only, then score on the unseen test rows.
    regr = LinearRegression().fit(data_train[features], data_train[target])
    predictions = regr.predict(data_test[features])

    # Taking the square root puts the error back in the target's own units.
    return np.sqrt(mean_squared_error(data_test[target], predictions))

# Hold-out RMSE from a single random train/test split (shown as the cell output).
train_regr(
    data=DATA.processed_dummies,
    features=DATA.features,
    target=DATA.target,
)


35239.15041284548

### Cross-validation

In [76]:
def cross_validation(data, features, target, k_folds=5, random_state=None):
    """Score a linear regression with shuffled K-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Holds both the feature columns and the target column; rows are
        selected positionally with ``.iloc``.
    features : list of column labels
        Columns of ``data`` used as model inputs.
    target : column label
        Column of ``data`` the model predicts.
    k_folds : int, default 5
        Number of folds passed to ``KFold`` as ``n_splits``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KFold``. ``None`` shuffles differently on every call;
        pass an int to make the fold assignment reproducible.

    Returns
    -------
    list of float
        One RMSE per fold, in the order the folds are produced.
    """
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)

    all_rmse = []
    for train_index, test_index in kf.split(data):
        # KFold yields positional indices, hence .iloc rather than .loc.
        train, test = data.iloc[train_index], data.iloc[test_index]

        # A fresh model per fold, so nothing learned leaks between folds.
        regr = LinearRegression().fit(train[features], train[target])
        prediction = regr.predict(test[features])

        all_rmse.append(np.sqrt(mean_squared_error(test[target], prediction)))
    return all_rmse

# Per-fold RMSE values from cross-validation with the default fold count (k_folds=5).
rmse_5K = cross_validation(
    data=DATA.processed_dummies,
    features=DATA.features,
    target=DATA.target,
)


# Results

### Average RMSE of the linear model across 5-fold cross-validation, with features filtered by correlation thresholds `upper_threshold=0.3` and `lower_threshold=-0.2`:

In [75]:
# Collapse the per-fold RMSE values into a single average score.
np.asarray(rmse_5K).mean()

33762.06674701705