# Imports

In [234]:
import pandas as pd
import numpy as np
import copy
import math

# Read raw data

In [235]:
raw_data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv', delimiter=',')
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv', delimiter=',')

# Categorical variable splitting

Needs to be done before splitting the data in test and train

In [236]:
features_to_split = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition']

def category_splitting(dataframe_to_split, features_to_split):
    
    dummy_table = dataframe_to_split['Id']

    for column in features_to_split:
        dataframe_to_split[column].fillna("None", inplace = True)
        split_by_category = pd.get_dummies(dataframe_to_split[column], prefix = column)
        dummy_table = pd.concat([dummy_table, split_by_category], axis='columns')
    
    dataframe_to_split.drop(columns = features_to_split, inplace = True)
    dummy_table.drop(columns = ['Id'], inplace = True)
    

    dataframe_to_split = pd.concat([dataframe_to_split, dummy_table], axis='columns')
    
    return dataframe_to_split


raw_data_split = category_splitting(raw_data, features_to_split)
test_data_split = category_splitting(test_data, features_to_split)


# Engineer some features

In [237]:

raw_data_split['has_2nd_floor'] = np.where(raw_data_split['2ndFlrSF'] > 0, 1, 0)
raw_data_split['has_lowQF'] = np.where(raw_data_split['LowQualFinSF'] > 0, 1, 0)
raw_data_split['has_2ormorebath'] = np.where(raw_data_split['FullBath'] > 1, 1, 0)
raw_data_split['has_fireplace'] = np.where(raw_data_split['Fireplaces'] > 0, 1, 0)
raw_data_split['has_wood_deck'] = np.where(raw_data_split['WoodDeckSF']  > 0, 1, 0)
raw_data_split['has_open_porch'] = np.where(raw_data_split['OpenPorchSF'] > 0, 1, 0)
raw_data_split['has_enclosed_porch'] = np.where(raw_data_split['EnclosedPorch'] > 0, 1, 0)
raw_data_split['has_3Ssn_porch'] = np.where(raw_data_split['3SsnPorch'] > 0, 1, 0)
raw_data_split['has_screen_porch'] = np.where(raw_data_split['ScreenPorch'] > 0, 1, 0)
raw_data_split['built_last_1Y'] = np.where(raw_data_split['YearBuilt'] > 2008, 1, 0)
raw_data_split['built_last_5Y'] = np.where(raw_data_split['YearBuilt'] > 2004, 1, 0)
raw_data_split['built_last_10Y'] = np.where(raw_data_split['YearBuilt'] > 1998, 1, 0)


test_data_split['has_2nd_floor'] = np.where(test_data_split['2ndFlrSF'] > 0, 1, 0)
test_data_split['has_lowQF'] = np.where(test_data_split['LowQualFinSF'] > 0, 1, 0)
test_data_split['has_2ormorebath'] = np.where(test_data_split['FullBath'] > 1, 1, 0)
test_data_split['has_fireplace'] = np.where(test_data_split['Fireplaces'] > 0, 1, 0)
test_data_split['has_wood_deck'] = np.where(test_data_split['WoodDeckSF'] > 0, 1, 0)
test_data_split['has_open_porch'] = np.where(test_data_split['OpenPorchSF'] > 0, 1, 0)
test_data_split['has_enclosed_porch'] = np.where(test_data_split['EnclosedPorch'] > 0, 1, 0)
test_data_split['has_3Ssn_porch'] = np.where(test_data_split['3SsnPorch'] > 0, 1, 0)
test_data_split['has_screen_porch'] = np.where(test_data_split['ScreenPorch'] > 0, 1, 0)
test_data_split['built_last_1Y'] = np.where(test_data_split['YearBuilt'] > 2008, 1, 0)
test_data_split['built_last_5Y'] = np.where(test_data_split['YearBuilt'] > 2004, 1, 0)
test_data_split['built_last_10Y'] = np.where(test_data_split['YearBuilt'] > 1998, 1, 0)



# Ensure overlap between categories

In [238]:
raw_categories = raw_data_split.columns
test_categories = test_data_split.columns

def intersection(list_a, list_b):
    return [ e for e in list_a if e in list_b ]

def only_in_1_not_in_2(list_a, list_b):
    return [ e for e in list_a if e not in list_b ]

overlap = intersection(raw_categories, test_categories)
overlap_with_output = copy.deepcopy(overlap)
overlap_with_output.append('SalePrice')

only_in_train = only_in_1_not_in_2(raw_categories, test_categories)

raw_data_split = raw_data_split[overlap_with_output]
test_data_split = test_data_split[overlap]
test_data_split['SalePrice']=-1

print(raw_data_split.shape)
print(test_data_split.shape)

(1460, 323)
(1459, 323)


# Feature scaling and adding squares and cubes

In [239]:
def scale_data(col):
    if col.dtype == 'int64' or col.dtype == 'float64':
        mean = col.mean()
        std_dev = col.std()
        col = (col-mean)/std_dev
        
    return col
        

In [240]:
features_to_scale = ['LotFrontage', 'LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'GrLivArea', 'PoolArea', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'KitchenAbvGr', 'YrSold', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal']

def scale_features(dataframe_to_scale, features_to_scale):
    
    for column in features_to_scale:
        dataframe_to_scale[column] = scale_data(dataframe_to_scale[column])
        
        square_name = column + "_squared"
        cube_name = column + "_cubed"
        quad_name = column + "_sqrt"
        log_name = column + "_log"
        dataframe_to_scale[square_name] = scale_data(dataframe_to_scale[column]**2)
        dataframe_to_scale[cube_name] = scale_data(dataframe_to_scale[column]**3)
        dataframe_to_scale[quad_name] = scale_data(dataframe_to_scale[column]**(1/2))
        dataframe_to_scale[log_name] = scale_data(np.log(dataframe_to_scale[column]+0.01))

        
    dataframe_to_scale.fillna(0, inplace = True)

    return dataframe_to_scale


    

In [241]:
raw_data_split.drop([523, 1298], inplace=True)

In [242]:
dataframe_to_normalize = pd.concat([raw_data_split, test_data_split], axis='rows')

dataframe_normalized = scale_features(dataframe_to_normalize, features_to_scale)

cleaned_train_data = dataframe_normalized[:1458]
test_data_split = dataframe_normalized[1458:]


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [243]:
print(cleaned_train_data.shape)
print(test_data_split.shape)

(1458, 459)
(1459, 459)


# Data segmentation

In [244]:
train_data_output = cleaned_train_data['SalePrice']
train_data_features = cleaned_train_data.drop(['SalePrice', 'Id'], axis=1)

test_data_features_for_submission = test_data_split.drop(['SalePrice', 'Id'], axis = 1)
# test_data_features_for_submission.to_csv('20231213_train_data2.csv', index=False)

# Regression functions

In [245]:
def compute_cost(w, b, x, y, lambda_ = 1):
    m, n = x.shape
    fx = np.dot(x, w) + b
    cost = (fx - y)**2
    cost = np.sum(cost)/(2*m)
    
    regularization_term = w**2
    regularization_term = np.sum(regularization_term)/(2*m)*lambda_
    return cost + regularization_term
    

def compute_gradient(w, b, x, y, lambda_ = 1):
    m, n = x.shape
    fx = np.dot(x, w) + b
    
    dj_dw = np.zeros(n)
    dj_db = 0
    
    err = (fx-y)
    dj_dw = np.dot(err,x)/m + lambda_*w/m
    
    dj_db = np.sum(fx-y)/m
    
    return dj_dw, dj_db


def gradient_descent(w_in, b, x, y, alpha, iterations, lambda_):
    
    cost = 10000000000000000
    w = copy.deepcopy(w_in)
    
    for iteration in range(iterations):
        dj_dw, dj_db = compute_gradient(w, b, x, y, lambda_)
        w_new = w - alpha*dj_dw
        b_new = b - alpha*dj_db
        
        w = w_new
        b = b_new
        
        new_cost = compute_cost(w_new, b_new, x, y)
        
        if iteration%1000==0: 
            print(f"iteration: {iteration} cost: {cost/1000000:.2f}")
            
        if iteration%100==0: 
            if new_cost > cost: break

        cost = new_cost
    return (w_new, b_new)

    
def make_prediction(w, b, features):
    fx = np.dot(features, w) + b
    return fx

# Cross validation model

In [246]:
def run_cv (w, b, x, y, alpha, iterations, lambda_, k):
    
    cost_cv = 0

    for i in range(k):
        
        x_train = x.iloc[lambda x: (x.index+i) % k != 0,]
        y_train = y.iloc[lambda x: (x.index+i) % k != 0,]
        x_test = x.iloc[lambda x: (x.index+i) % k == 0,]
        y_test = y.iloc[lambda x: (x.index+i) % k == 0,]   
        
        w_computed, b_computed = gradient_descent(w, b, x_train, y_train, alpha, iterations, lambda_)
        cost_prediction = compute_cost(w_computed, b_computed, x_test, y_test, 0)
        
        '''
        if i == 2:
            prediction = make_prediction(w_computed, b_computed, x_test)
            difference = abs(prediction - y_test)
            print(f"difference: {difference[difference > 100000]}")
        '''
        print(f"cost prediction for round {i} is {cost_prediction/1000000:.2f}")
        cost_cv += cost_prediction
        
    return cost_cv/k


# Run gradient descent and test

In [254]:
b = 0
w = np.zeros(train_data_features.shape[1])
alpha = 0.06
iterations = 10000
lambda_ = 100
k = 5

cost_final = run_cv(w, b, train_data_features, train_data_output, alpha, iterations, lambda_, k)

print(f"cost: {cost_final/1000000:.2f}")


# lambda 150   iter 7000  alpha 0.06  cost 266.96
# lambda 140   iter 7000  alpha 0.06  cost 266.24
# lambda 130   iter 7000  alpha 0.06  cost 265.52
# lambda 120   iter 7000  alpha 0.06  cost 264.90
# lambda 120   iter 7000  alpha 0.06  cost 263.68





iteration: 0 cost: 10000000000.00
iteration: 1000 cost: 189.28
iteration: 2000 cost: 188.02
iteration: 3000 cost: 187.32
iteration: 4000 cost: 186.96
iteration: 5000 cost: 186.78
iteration: 6000 cost: 186.72
cost prediction for round 0 is 221.31
iteration: 0 cost: 10000000000.00
iteration: 1000 cost: 181.74
iteration: 2000 cost: 180.32
iteration: 3000 cost: 179.51
iteration: 4000 cost: 179.04
iteration: 5000 cost: 178.78
iteration: 6000 cost: 178.64
iteration: 7000 cost: 178.58
iteration: 8000 cost: 178.56
cost prediction for round 1 is 227.17
iteration: 0 cost: 10000000000.00
iteration: 1000 cost: 151.80
iteration: 2000 cost: 150.44
iteration: 3000 cost: 149.67
iteration: 4000 cost: 149.24
iteration: 5000 cost: 149.01
iteration: 6000 cost: 148.90
iteration: 7000 cost: 148.86
cost prediction for round 2 is 428.83
iteration: 0 cost: 10000000000.00
iteration: 1000 cost: 192.31
iteration: 2000 cost: 190.74
iteration: 3000 cost: 189.81
iteration: 4000 cost: 189.26
iteration: 5000 cost: 188

# Create output file for submission


In [253]:
lambda_ = 150

w_computed, b_computed = gradient_descent(w, b, train_data_features, train_data_output, alpha, iterations, lambda_)

prediction = make_prediction(w_computed, b_computed, test_data_features_for_submission)

submission_file = pd.concat([test_data['Id'],pd.DataFrame(prediction)], axis = 1)
submission_file.columns = ['Id', 'SalePrice']
submission_file.to_csv('20231214_submission3.csv', index=False)


iteration: 0 cost: 10000000000.00
iteration: 1000 cost: 192.97
iteration: 2000 cost: 191.31
iteration: 3000 cost: 190.44
iteration: 4000 cost: 190.00
iteration: 5000 cost: 189.80
iteration: 6000 cost: 189.72
