# House Prices Data Modeling


In [63]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns

#Libraries additionally added
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import ast

# Additional Libraries
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

#Regular Expressions
import re

# Use the preprocessed data files
hp_train_data_path = '../2_Final_Preprocessed_Data/final_preprocessed_hp_train.csv'


# Read the House Prices Train & Test data 
hp_train_df = pd.read_csv(hp_train_data_path)

In [64]:
#Display the House Prices Train DataFrame
hp_train_df.head()

Unnamed: 0,MSZoning,LotFrontage,LotArea,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,ExterQual,BsmtQual,...,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageCars,GarageArea,SaleType,SaleCondition,SalePrice
0,RL,65,8450,CollgCr,2Story,7,2003,2003,Gd,Gd,...,3,1,8,0,Attchd,2,548,WD,Normal,208500
1,RL,80,9600,Other,1Story,6,1976,1976,TA,Gd,...,3,1,6,1,Attchd,2,460,WD,Normal,181500
2,RL,Other,11250,CollgCr,2Story,7,2001,2002,Gd,Gd,...,3,1,6,1,Attchd,2,608,WD,Normal,223500
3,RL,60,9550,Other,2Story,7,1915,1970,TA,TA,...,3,1,7,1,Detchd,3,642,WD,Abnorml,140000
4,RL,Other,14260,Other,2Story,Other,2000,2000,Gd,Gd,...,4,1,9,1,Attchd,3,836,WD,Normal,250000


## Scale the Data

In [65]:
# Identify numerical columns excluding 'SalePrice'
numerical_columns = hp_train_df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns[numerical_columns != 'SalePrice']


# Identify categorical columns excluding 'SalePrice'
categorical_columns = hp_train_df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns for training data
hp_train_df = pd.get_dummies(hp_train_df, columns=categorical_columns, drop_first=True)



## Stadard Scalar

In [66]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform on training data
hp_train_df[numerical_columns] = scaler.fit_transform(hp_train_df[numerical_columns])

# Transform test data using the same scaler
hp_test_df[numerical_columns] = scaler.transform(hp_test_df[numerical_columns])


In [67]:
# Review the Scaled Data
hp_train_df.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,BsmtFullBath,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.207142,1.050994,0.878668,0.575425,-0.288653,-0.944591,-0.459303,-0.793434,1.161852,1.10781,...,0,0,0,0,1,0,0,0,1,0
1,-0.091886,0.156734,-0.429577,1.171992,-0.288653,-0.641228,0.466465,0.25714,-0.795163,-0.819964,...,0,0,0,0,1,0,0,0,1,0
2,0.07348,0.984752,0.830215,0.092907,-0.288653,-0.301643,-0.313369,-0.627826,1.189351,1.10781,...,0,0,0,0,1,0,0,0,1,0
3,-0.096897,-1.863632,-0.720298,-0.499274,-0.288653,-0.06167,-0.687324,-0.521734,0.937276,1.10781,...,0,0,0,0,1,0,0,0,0,0
4,0.375148,0.951632,0.733308,0.463568,-0.288653,-0.174865,0.19968,-0.045611,1.617877,1.10781,...,0,0,0,0,1,0,0,0,1,0


## Split train and test data

In [68]:
# Separate target variable (Y) and features (X)
y = hp_train_df['SalePrice'] #.values.reshape(-1, 1)
X = hp_train_df.drop('SalePrice', axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1168, 79)
Shape of X_test: (292, 79)
Shape of y_train: (1168,)
Shape of y_test: (292,)


# Apply RandomForestRegressor

In [70]:
from sklearn.ensemble import RandomForestRegressor

# Assuming you have X_train and y_train prepared
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)


In [71]:
# Assuming you've trained your model with a different name, replace 'rf_model' with the actual name
your_trained_model = RandomForestRegressor()
your_trained_model.fit(X_train, y_train)
# Use feature importance scores to select relevant features
sfm = SelectFromModel(your_trained_model, threshold=0.01)
sfm.fit(X_train, y_train)
selected_features = X_train.columns[sfm.get_support()]
# Get feature importances from the trained model
feature_importances = your_trained_model.feature_importances_
# Create a DataFrame to store feature names and their importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})
# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# Filter the DataFrame to include only the selected features
selected_feature_importance_df = feature_importance_df[feature_importance_df['Feature'].isin(selected_features)]
# Print the priority or importance of the selected features
print("Priority of Selected Features:")
print(selected_feature_importance_df)



## After bin a lot of columns and eliminate necessary columns 
# we got these features

Priority of Selected Features:
              Feature  Importance
17         GarageCars    0.284199
39       ExterQual_TA    0.129507
7            1stFlrSF    0.066310
8            2ndFlrSF    0.065518
6         TotalBsmtSF    0.056737
1           YearBuilt    0.054124
11           FullBath    0.047210
36  OverallQual_Other    0.039089
3          BsmtFinSF1    0.038350
0             LotArea    0.032489
18         GarageArea    0.027375
2        YearRemodAdd    0.026320
15       TotRmsAbvGrd    0.023723
16         Fireplaces    0.022201


In [73]:

#Selected features before binning when have 81 columns and then 290 after get_dummies

# Selected Features:
# Index(['LotArea', 'OverallQual', 'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF',
#        '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageCars', 'GarageArea'],
#       dtype='object')



# ExterQual_TA    
# FullBath    
# OverallQual_Other  
# YearRemodAdd       
# TotRmsAbvGrd   
# Fireplaces    

## Create a Linear Regression Model

In [74]:
# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [75]:
# Make predictions
price_predictions = model.predict(X_test)


In [76]:
price_predictions

array([ 1.41209714e+05,  3.43579265e+05,  1.16072822e+05,  1.71872734e+05,
        3.15193572e+05,  7.91580676e+04,  2.33236551e+05,  1.65493487e+05,
        8.25070676e+04,  1.13896431e+05,  1.35154527e+05,  1.15955788e+05,
        1.17135543e+05,  2.11231465e+05,  1.69602542e+05,  1.31525140e+05,
        2.02013318e+05,  1.29906402e+05,  1.03917953e+05,  2.10283137e+05,
        1.96533547e+05,  2.04845195e+05,  1.93790597e+05,  1.18427931e+05,
        2.03903283e+05,  1.80156571e+05,  2.05168457e+05,  1.33166247e+05,
        1.76117269e+05,  2.37817310e+05,  1.30245049e+05,  2.55778863e+05,
        2.28735082e+05,  1.11719450e+05,  2.66320355e+05,  1.28579759e+05,
        1.67139397e+05,  2.03552183e+05,  2.99707372e+05,  7.08421045e+04,
        9.57604403e+04,  2.49998877e+05,  1.03781242e+05,  3.43713390e+05,
        1.32712604e+05,  1.51005341e+05,  9.91843703e+04,  1.21228757e+05,
        3.50930728e+05,  1.45392104e+05,  1.06424497e+05,  2.03250936e+05,
        1.13566361e+05,  

In [77]:
# Create a copy of the original data
hp_predicted_df = hp_train_df.copy()

# Add a column with the predicted  values
hp_predicted_df["Predicted House Price"] = price_predictions

# Display sample data
hp_predicted_df.head()

ValueError: Length of values (292) does not match length of index (1460)

## Linear Regression Model Assessment

In [78]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y_test, price_predictions)
mse = mean_squared_error( y_test,price_predictions )
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is -4.8401097389015605e+19.
The r2 is -1.9898468480088746e+20.
The mean squared error is 1.5262774426571168e+30.
The root mean squared error is 1235426016666767.8.
The standard deviation is 79415.29188606751.
