# House Prices Data Modeling


In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns

#Libraries additionally added
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import ast

# Additional Libraries
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

#Regular Expressions
import re

# Use the preprocessed data files
hp_train_data_path = "../Preprocessed_Data/preprocessed_hp_train.csv"
hp_test_data_path = "../Preprocessed_Data/preprocessed_hp_test.csv"


# Read the House Prices Train & Test data 
hp_train_df = pd.read_csv(hp_train_data_path)
hp_test_df = pd.read_csv(hp_test_data_path)


In [3]:
#Display the House Prices Train DataFrame
hp_train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [4]:
#Display the House Prices Test  DataFrame
hp_test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,120,0,0,6,2010,WD,Normal
1,1462,20,RL,81,14267,Pave,IR1,Lvl,AllPub,Corner,...,36,0,0,0,0,12500,6,2010,WD,Normal
2,1463,60,RL,74,13830,Pave,IR1,Lvl,AllPub,Inside,...,34,0,0,0,0,0,3,2010,WD,Normal
3,1464,60,RL,78,9978,Pave,IR1,Lvl,AllPub,Inside,...,36,0,0,0,0,0,6,2010,WD,Normal
4,1465,120,RL,43,5005,Pave,IR1,HLS,AllPub,Inside,...,82,0,0,144,0,0,1,2010,WD,Normal


## Scale the Data

In [5]:
# Identify numerical columns excluding 'SalePrice'
numerical_columns = hp_train_df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns[numerical_columns != 'SalePrice']


# Identify categorical columns excluding 'SalePrice'
categorical_columns = hp_train_df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns for training data
hp_train_df = pd.get_dummies(hp_train_df, columns=categorical_columns, drop_first=True)

# One-hot encode categorical columns for test data
hp_test_df = pd.get_dummies(hp_test_df, columns=categorical_columns, drop_first=True)


## Stadard Scalar

In [6]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform on training data
hp_train_df[numerical_columns] = scaler.fit_transform(hp_train_df[numerical_columns])

# Transform test data using the same scaler
hp_test_df[numerical_columns] = scaler.transform(hp_test_df[numerical_columns])


In [7]:
# Review the Scaled Data
hp_train_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.730865,0.073375,-0.228969,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,...,0,0,0,0,1,0,0,0,1,0
1,-1.728492,-0.872563,0.452338,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,...,0,0,0,0,1,0,0,0,1,0
2,-1.72612,0.073375,-0.092708,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,...,0,0,0,0,1,0,0,0,1,0
3,-1.723747,0.309859,-0.456072,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,...,0,0,0,0,1,0,0,0,0,0
4,-1.721374,0.073375,0.63402,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,...,0,0,0,0,1,0,0,0,1,0


In [8]:
# Review the Scaled Data
hp_test_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1.733238,-0.872563,0.452338,0.110763,-0.795151,0.381743,-0.340077,-1.15638,-0.57441,0.053428,...,0,0,0,0,1,0,0,0,1,0
1,1.73561,-0.872563,0.497759,0.37585,-0.071836,0.381743,-0.43944,-1.30174,0.023903,1.051363,...,0,0,0,0,1,0,0,0,1,0
2,1.737983,0.073375,0.179815,0.332053,-0.795151,-0.5172,0.852269,0.6364,-0.57441,0.761852,...,0,0,0,0,1,0,0,0,1,0
3,1.740356,0.073375,0.361497,-0.054002,-0.071836,0.381743,0.88539,0.6364,-0.463612,0.347326,...,0,0,0,0,1,0,0,0,1,0
4,1.742728,1.492282,-1.22822,-0.552407,1.374795,-0.5172,0.686666,0.345679,-0.57441,-0.39619,...,0,0,0,0,1,0,0,0,1,0


## Split train and test data

In [15]:
# Separate target variable (Y) and features (X)
y = hp_train_df['SalePrice'] #.values.reshape(-1, 1)
X = hp_train_df.drop('SalePrice', axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1168, 233)
Shape of X_test: (292, 233)
Shape of y_train: (1168,)
Shape of y_test: (292,)


## Create a Linear Regression Model

In [17]:
# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [18]:
# Make predictions
price_predictions = model.predict(X_test)


In [19]:
price_predictions

array([ 1.56929488e+05,  3.49525352e+05,  9.08133828e+04,  1.75063819e+05,
        3.25798832e+05,  6.59037825e+04,  2.28662337e+05,  1.45992482e+05,
        5.82526731e+04,  1.47875391e+05,  1.45923645e+05,  1.03103372e+05,
        7.88917307e+04,  2.16974074e+05,  1.75380474e+05,  1.32031297e+05,
        1.89073168e+05,  1.31460787e+05,  1.28210626e+05,  2.15257698e+05,
        1.51775919e+05,  2.05763760e+05,  1.70878050e+05,  1.28904088e+05,
        2.00278187e+05,  1.35489257e+05,  1.94372475e+05,  1.04169138e+05,
        1.76497338e+05,  2.01894850e+05,  1.64019775e+05,  2.74750075e+05,
        2.49839842e+05,  1.10992150e+05,  2.37280861e+05,  1.52169118e+05,
        1.39701945e+05,  2.02771624e+05,  3.11221039e+05,  1.04007079e+05,
        1.22175570e+05,  2.24751528e+05,  1.01014654e+05,  3.67400711e+05,
        1.30013583e+05,  1.42349735e+05,  9.80454676e+04,  1.38502046e+05,
        4.19461173e+05,  1.33402089e+05,  1.20685557e+05,  2.70535236e+05,
        1.04644331e+05,  

In [20]:
# Create a copy of the original data
hp_predicted_df = hp_train_df.copy()

# Add a column with the predicted  values
hp_predicted_df["Predicted House Price"] = price_predictions

# Display sample data
hp_predicted_df.head()

ValueError: Length of values (292) does not match length of index (1460)

## Linear Regression Model Assessment

In [23]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y_test, price_predictions)
mse = mean_squared_error( y_test,price_predictions )
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is -97510768371153.39.
The r2 is -400882429443219.56.
The mean squared error is 3.0748990045591784e+24.
The root mean squared error is 1753538994308.133.
The standard deviation is 79415.29188606751.
