In [None]:
import numpy as np
import pandas as pd
import copy, math

In [None]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Preview the first five rows of the raw training frame.
train.head(5)

In [None]:
# Preview the first five rows of the raw test frame.
test.head(5)

In [None]:
# Median imputation: each frame's missing LotFrontage values are replaced by
# the median of that same frame.
for frame in (train, test):
    lot_frontage = frame['LotFrontage']
    frame['LotFrontage'] = lot_frontage.fillna(lot_frontage.median())

In [None]:
# Mode imputation: fill the listed categorical columns with the most frequent
# value observed in the same frame.
test_mode_cols = ['MSZoning', 'Exterior1st', 'Exterior2nd',
                  'KitchenQual', 'Functional', 'SaleType']
test = test.fillna({name: test[name].mode()[0] for name in test_mode_cols})
train = train.fillna({'Electrical': train['Electrical'].mode()[0]})


In [None]:
# Columns whose missing entries are replaced by the literal category "None".
cols_with_none = [
    'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
    'MiscFeature',
]
none_fill = dict.fromkeys(cols_with_none, "None")
train = train.fillna(none_fill)
test = test.fillna(none_fill)

In [None]:
# Numeric columns whose missing entries are replaced by 0.
cols_with_0 = [
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea',
]
for frame in (train, test):
    frame[cols_with_0] = frame[cols_with_0].fillna(0)


In [None]:
# List the remaining string-typed (object) columns of both frames so they can
# be encoded in the following cells.
cols_object = train.select_dtypes(include=['object']).columns
cols_objecttest = test.select_dtypes(include=['object']).columns
print(cols_object)
print(cols_objecttest)

In [None]:
# One-hot encoding of the nominal columns.
#
# Both frames are first cast to a categorical dtype whose categories come from
# the training frame. get_dummies then emits the same dummy columns for train
# and test and drop_first removes the same baseline level in both. Encoding
# the frames independently lets drop_first pick a different baseline whenever
# the two frames do not share the same first category, which silently
# mis-encodes test rows once the columns are aligned to train.
# Test values that never occur in train become NaN and therefore encode as
# all-zero dummies (i.e. the baseline level).
one_hot_cols = ['MSZoning', 'Street', 'Alley', 'LotShape',
                'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
                'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
                'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
                'Electrical', 'GarageType', 'Fence', 'MiscFeature',
                'SaleType', 'SaleCondition']

# Sorted categories reproduce the level order get_dummies uses for plain
# string columns, so the training encoding is unchanged.
one_hot_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(train[col].dropna().unique()))
    for col in one_hot_cols
}

train = pd.get_dummies(train.astype(one_hot_dtypes), columns=one_hot_cols, drop_first=True)
test = pd.get_dummies(test.astype(one_hot_dtypes), columns=one_hot_cols, drop_first=True)

# Newer pandas versions return boolean dummies; convert them to 0/1 integers.
bool_cols = train.select_dtypes('bool').columns
train[bool_cols] = train[bool_cols].astype(int)

bool_cols_test = test.select_dtypes('bool').columns
test[bool_cols_test] = test[bool_cols_test].astype(int)


In [None]:
# Ordinal mappings: each quality/condition-style column is replaced by an
# integer rank taken from the dictionaries below.
mappings = {
    'ExterQual': {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3},
    'ExterCond': {'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'BsmtQual': {'None':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'BsmtCond': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4},
    'BsmtExposure':{'No':0, 'None':0, 'Mn':1, 'Av':2, 'Gd':3},
    'BsmtFinType1':{'None':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
    'BsmtFinType2':{'None':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
    'HeatingQC':{'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'CentralAir':{'N':0, 'Y':1},
    'KitchenQual': {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3},
    'Functional':{'Typ':0, 'Min1':1, 'Min2':2, 'Mod':3, 'Maj1':4, 'Maj2':5, 'Sev':6},
    'FireplaceQu':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'GarageFinish':{'None':0, 'Unf':1, 'RFn':2, 'Fin':3},
    'GarageQual':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'GarageCond':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'PavedDrive':{'N':0, 'P':1, 'Y':2},
    'PoolQC':{'None':0, 'Fa':1, 'Gd':2, 'Ex':3}
}

# Series.map turns every value without a dictionary entry into NaN, which
# would then propagate silently into the design matrix. Validate first so an
# unexpected category (or an accidental second run of this cell, when the
# columns already hold integers) fails loudly before anything is modified.
for col, m in mappings.items():
    for df in [train, test]:
        unmapped = set(df[col].dropna().unique()) - set(m)
        if unmapped:
            raise ValueError(
                f"Column {col!r} contains values without an ordinal mapping: "
                f"{sorted(unmapped, key=str)}"
            )

for col, m in mappings.items():
    for df in [train, test]:
        df[col] = df[col].map(m)


In [None]:
# Preview the first five rows of the training frame after encoding.
train.head(5)

In [None]:
# Preview the first five rows of the test frame after encoding.
test.head(5)

In [None]:
# Align the test frame with the training frame in a single step:
#   * columns that exist only in train are added to test and filled with 0,
#   * columns that exist only in test are dropped,
#   * the remaining columns are put in the same order as train.columns.
# reindex does all three at once and returns a new frame, so re-running this
# cell gives the same result and avoids inserting columns one at a time.
test = test.reindex(columns=train.columns, fill_value=0)


In [None]:
# Split the training columns into 0/1 indicator columns (left untouched) and
# the remaining int64/float64 columns (z-score standardised).
binary_cols = [col for col in train.columns if set(train[col].unique()).issubset({0, 1})]
numeric = [
    col
    for col in train.select_dtypes(include={'int64', 'float64'}).columns
    if col not in binary_cols
]

# Standardise every non-binary numeric feature except the target.
num = train[numeric].drop(columns='SalePrice')
x_z = (num - num.mean()) / num.std()
x_z_df = pd.DataFrame(x_z, columns=num.columns, index=num.index)
binary_df = train[binary_cols]
x_train = pd.concat([x_z_df, binary_df], axis=1)

# Standardise the target as well; y_mean / y_std are kept so predictions can
# be converted back to the original scale later.
y_train = train['SalePrice'].to_numpy()
y_mean = y_train.mean()
y_std = y_train.std()
y_train = (y_train - y_mean) / y_std
x_train



In [None]:
# Build the test design matrix with the SAME column partition, scaling
# statistics and column order as x_train.
#
# Deciding "binary vs. numeric" from the test values themselves can classify a
# column differently than in train. That changes the column order of x_test
# relative to the learned weight vector, and a column that is binary in train
# but not in test would be divided by statistics that do not exist for it
# (all-NaN column). Reusing the training partition avoids both problems.
numeric_test = list(num.columns)        # standardised columns, as in train
binary_cols_test = list(binary_cols)    # 0/1 indicator columns, as in train

n = test[numeric_test]
# Scale with the training mean / std so both matrices share one feature space.
x_z_test = (n - num.mean()) / num.std()
x_z_testdf = pd.DataFrame(x_z_test, columns=n.columns, index=n.index)
binary_test_df = test[binary_cols_test]
x_test = pd.concat([x_z_testdf, binary_test_df], axis=1)

# Guarantee the exact feature order the model was trained on. x_train has no
# SalePrice column, so the target placeholder never reaches x_test.
x_test = x_test[x_train.columns]


In [None]:
def predict(x, w, b):
    """Evaluate the linear model ``x . w + b``.

    Args:
        x: Feature values. Either a 1-D array holding one sample or a 2-D
            array with one sample per row.
        w: 1-D weight array with one entry per feature.
        b: Scalar bias.

    Returns:
        A scalar prediction for a 1-D ``x``, or a 1-D array with one
        prediction per row for a 2-D ``x``.
    """
    # np.dot replaces the element-by-element loop and also handles a batch of
    # samples, which the loop (indexing w by row number) could not.
    return np.dot(x, w) + b

In [None]:
def compute_cost(x,y, w, b):
    """Half mean squared error of the linear model ``x @ w + b``.

    Args:
        x: 2-D array with one sample per row.
        y: 1-D array of targets, one per row of ``x``.
        w: 1-D weight array with one entry per column of ``x``.
        b: Scalar bias.

    Returns:
        ``sum((x @ w + b - y) ** 2) / (2 * m)`` where ``m`` is the number of
        rows of ``x``.
    """
    m = x.shape[0]
    # Vectorised residuals instead of a per-row Python loop; this function is
    # evaluated on every gradient-descent iteration.
    residuals = x @ w + b - y
    return np.sum(residuals ** 2) / (2 * m)

In [None]:
def compute_gradient(x, y, w, b):
    """Gradient of the half mean squared error with respect to ``w`` and ``b``.

    Args:
        x: 2-D array with one sample per row.
        y: 1-D array of targets, one per row of ``x``.
        w: 1-D weight array with one entry per column of ``x``.
        b: Scalar bias.

    Returns:
        Tuple ``(dj_dw, dj_db)``: the gradient with respect to the weights
        (same length as ``w``) and with respect to the bias (scalar).
    """
    n_samples = x.shape[0]
    scale = 1 / n_samples
    # Residual of every sample under the current parameters.
    residual = (x @ w + b) - y
    grad_w = scale * (x.T @ residual)
    grad_b = scale * np.sum(residual)
    return grad_w, grad_b

In [None]:
def gradient_descent(x, y, w, b, cost_function, gradient_function, alpha, num_iters):
    """Run batch gradient descent for a fixed number of iterations.

    Args:
        x: Feature matrix handed unchanged to the two callbacks.
        y: Targets handed unchanged to the two callbacks.
        w: Initial weights; the caller's object is not modified.
        b: Initial bias.
        cost_function: Callable ``(x, y, w, b) -> cost``.
        gradient_function: Callable ``(x, y, w, b) -> (dj_dw, dj_db)``.
        alpha: Learning rate.
        num_iters: Number of update steps to perform.

    Returns:
        Tuple ``(w, b, J_history)`` with the final parameters and the cost
        recorded after each of the first 10000 updates.
    """
    max_history = 10000  # cap on stored cost values to bound memory use
    J_history = []
    w = copy.deepcopy(w)  # keep the caller's initial weights intact
    # Report roughly ten times over the whole run.
    report_every = math.ceil(num_iters / 10) if num_iters > 0 else 1

    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(x, y, w, b)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        cost = None
        if i < max_history:
            cost = cost_function(x, y, w, b)
            J_history.append(cost)

        if i % report_every == 0:
            # Past the history cap no cost is stored any more; compute the
            # current value instead of reporting the stale last entry.
            if cost is None:
                cost = cost_function(x, y, w, b)
            print(f"Iteration {i:4d}: Cost: {cost:8.2f}   ")
    return w, b, J_history


In [None]:
# Fit the linear model on the complete (standardised) training set.
X = x_train.to_numpy()
Y = y_train

# Start from all-zero parameters.
n_features = x_train.shape[1]
initial_W = np.zeros(n_features)
initial_b = 0

# Optimiser settings.
alpha = 1e-2
iterations = 10000

w_final, b_final, J_hist = gradient_descent(
    X, Y, initial_W, initial_b, compute_cost, compute_gradient, alpha, iterations
)


In [None]:
# Show the learned weight vector and bias.
print("W_ARRAY: {}".format(w_final))
print("b:{}".format(b_final))

In [None]:
# In-sample predictions on the standardised target scale.
y_train_pred = np.matmul(X, w_final) + b_final

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# mean_squared_error returns the MSE; take the square root so the value
# printed as "RMSE" really is the root mean squared error.
# Both metrics are on the standardised target scale (Y was z-scored).
rmse_train = np.sqrt(mean_squared_error(Y, y_train_pred))
r2_train = r2_score(Y, y_train_pred)
print("Train RMSE: ", rmse_train)
print("Train R^2: ", r2_train)

In [None]:
import matplotlib.pyplot as plt

# Least-squares trend line through the (actual, predicted) pairs.
slope, intercept = np.polyfit(y_train, y_train_pred, 1)
line_x = np.array([y_train.min(), y_train.max()])
line_y = slope * line_x + intercept

# NOTE(review): both axes show the standardised target values, not prices.
plt.scatter(y_train, y_train_pred, alpha=0.5)
plt.plot(line_x, line_y, 'r--')
plt.title("Train Predictions")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.show()

In [None]:
# Convert the test design matrix to a plain NumPy array for prediction.
X_test = x_test.to_numpy(copy=False)


In [None]:
# Predict on the standardised scale, then undo the target standardisation to
# obtain values in the original SalePrice units.
y_test_pred = (np.matmul(X_test, w_final) + b_final) * y_std + y_mean

In [None]:
# Pair every test Id with its predicted price and write the result to disk.
submission = test[['Id']].assign(SalePrice=y_test_pred)
submission.to_csv('submission.csv', index=False)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible.
split_parts = train_test_split(x_train, y_train, random_state=42, test_size=0.2)
x_tr, x_val, y_tr, y_val = split_parts

In [None]:
# Predictions for the rows of the training split.
# NOTE(review): w_final / b_final were fitted on all of x_train, so these are
# predictions of the full-data model, not of a model fitted on x_tr only.
y_pred_tr = x_tr.dot(w_final) + b_final

In [None]:
# Fit a fresh model on the training split only, so the held-out rows in x_val
# stay unseen by w_f / b_f.
x_t = x_tr.to_numpy()
w_f, b_f, J_hist = gradient_descent(x_t,y_tr, initial_W, initial_b,  compute_cost, compute_gradient, alpha, iterations)

# Recompute the split-train predictions with the model that was just fitted.
# Without this, y_pred_tr would still hold predictions from w_final (fitted on
# all of x_train) and w_f / b_f would never be used.
y_pred_tr = x_tr @ w_f + b_f

In [None]:
# mean_squared_error returns the MSE; take the square root so the value
# printed as "RMSE" really is the root mean squared error (standardised scale).
rmse_train2 = np.sqrt(mean_squared_error(y_tr, y_pred_tr))
r2_train2 = r2_score(y_tr, y_pred_tr)
print("Train RMSE: ", rmse_train2)
print("Train R^2: ", r2_train2)

In [None]:
# Validate with the model fitted on the training split (w_f / b_f).
# w_final / b_final were fitted on all of x_train, which contains the x_val
# rows, so scoring them here would leak validation data into the estimate.
y_pred_test = x_val@w_f+b_f

In [None]:
# mean_squared_error returns the MSE; take the square root so the value
# printed as "RMSE" really is the root mean squared error (standardised scale).
rmse_test2 = np.sqrt(mean_squared_error(y_val, y_pred_test))
r2_test2 = r2_score(y_val, y_pred_test)
print("Test RMSE: ", rmse_test2)
print("Test R^2: ", r2_test2)

In [None]:

# Least-squares trend line through the (actual, predicted) validation pairs.
slope, intercept = np.polyfit(y_val, y_pred_test, 1)
line_x = np.array([y_val.min(), y_val.max()])
line_y = slope * line_x + intercept

# NOTE(review): both axes show the standardised target values, not prices.
plt.scatter(y_val, y_pred_test, alpha=0.5)
plt.plot(line_x, line_y, 'r--')
plt.title("Train/Test Predictions")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.show()