# House Price Prediction
## Load and Explore the data 

In [143]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

current_directory = os.getcwd()
print(current_directory)


d:\study\profolio\ml_project\house_prediction


In [144]:
train = pd.read_csv(current_directory + '\\train.csv')
test = pd.read_csv(current_directory + '\\test.csv')

In [145]:
train.shape, test.shape

((1460, 81), (1459, 80))

In [146]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

- Our training data set contains 1460 observations and 81 variables. Our target variable is `SalePrice`
- There are some missing values contains in some of the columns such as `Alley` and `LotFrontage` ...

In [147]:
missing = train.isnull().sum()
missing[missing > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [148]:
train.dtypes[missing[missing > 0].index]

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

There are total of 6965 missing values. With the majority of them being object or categorical data.

In [149]:
train.duplicated().sum()

0

The data set contains no duplicated values.

---
## Preprocessing

In [150]:
# Create input and target data
X_train = train.drop(['Id', 'SalePrice'], axis=1)
y_train = train['SalePrice']

In [151]:
# Create a list of numerical and categorical columns
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()
numerical_cols = X_train.select_dtypes(exclude='object').columns.tolist()

In [152]:
len(categorical_cols) + len(numerical_cols) == len(X_train.columns)

True

#### Imputation

In [153]:
from sklearn.impute import SimpleImputer

In [154]:
imputer = SimpleImputer(strategy='mean')  # We will use the mean value to fill missing values
imputer.fit(X_train[numerical_cols])

X_train[numerical_cols] = imputer.transform(X_train[numerical_cols])

X_train[numerical_cols].isnull().sum().sum()

0

#### Scaling

In [155]:
from sklearn.preprocessing import StandardScaler

In [156]:
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])

scaler.scale_

array([4.22860820e+01, 2.20164789e+01, 9.97784611e+03, 1.38252284e+00,
       1.11241818e+00, 3.01925588e+01, 2.06383353e+01, 1.80507263e+02,
       4.55941866e+02, 1.61264017e+02, 4.41715605e+02, 4.38555057e+02,
       3.86455322e+02, 4.36378914e+02, 4.86064268e+01, 5.25300394e+02,
       5.18732867e-01, 2.38670868e-01, 5.50727099e-01, 5.02713131e-01,
       8.15498620e-01, 2.20262727e-01, 1.62483655e+00, 6.44445572e-01,
       2.39863645e+01, 7.47059036e-01, 2.13731608e+02, 1.25295863e+02,
       6.62333334e+01, 6.10982138e+01, 2.93072887e+01, 5.57383170e+01,
       4.01635452e+01, 4.95953090e+02, 2.70270015e+00, 1.32764022e+00])

In [157]:
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])

In [158]:
X_train[numerical_cols].describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,-8.455945000000001e-17,4.075887e-16,-5.840077000000001e-17,1.387018e-16,3.540547e-16,1.046347e-15,4.49686e-15,-3.4067120000000005e-17,-2.4333660000000003e-17,-3.4067120000000005e-17,...,-1.2166830000000001e-17,5.596741000000001e-17,3.041707e-17,-2.3116970000000003e-17,4.866731e-18,5.475072e-17,1.9466920000000002e-17,-2.6767020000000002e-17,7.543433000000001e-17,3.567436e-14
std,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,...,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343
min,-0.8725628,-2.227875,-0.9237292,-3.688413,-4.11297,-3.287824,-1.689368,-0.5744105,-0.9730182,-0.2886528,...,-2.212963,-0.7521758,-0.7044833,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,-1.969111,-1.367655
25%,-0.8725628,-0.4564744,-0.2969908,-0.7951515,-0.5171998,-0.5719226,-0.8656586,-0.5744105,-0.9730182,-0.2886528,...,-0.647916,-0.7521758,-0.7044833,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,-0.4891101,-0.6144386
50%,-0.1631095,6.454645e-16,-0.1040633,-0.07183611,-0.5171998,0.05737148,0.4425864,-0.5744105,-0.1319022,-0.2886528,...,0.03284429,-0.7521758,-0.3270298,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,-0.1191097,0.1387775
75%,0.3098594,0.4065156,0.108708,0.6514792,0.3817427,0.9516316,0.9271216,0.3355252,0.5891327,-0.2886528,...,0.4820057,0.5886506,0.3221901,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,0.620891,0.8919936
max,3.147673,11.03492,20.51827,2.821425,3.07857,1.282839,1.217843,8.289499,11.40575,8.851638,...,4.421526,6.087635,7.554198,8.675309,17.21723,8.341462,18.30618,31.16527,2.100892,1.64521


#### Ecoding Categorical Data

In [159]:
from sklearn.preprocessing import OneHotEncoder

In [161]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[categorical_cols])

encoded_cols = encoder.get_feature_names_out(categorical_cols).tolist()
encoded_cols[:5]

['MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM']

In [162]:
encoded_vals = encoder.transform(X_train[categorical_cols]).toarray()
encoded_vals.shape

(1460, 268)

In [163]:
len(encoded_cols) == encoded_vals.shape[1]

True

In [164]:
X_train_1 = pd.concat([X_train[numerical_cols], pd.DataFrame(encoded_vals, columns=encoded_cols)], axis=1) # contains all processed features

In [165]:
X_train_1.shape,y_train.shape

((1460, 304), (1460,))

In [166]:
from sklearn.model_selection import train_test_split

In [167]:
X_train, X_val, y_train, y_val = train_test_split(X_train_1, y_train, test_size=0.2, random_state=0)

X_train.shape, X_val.shape

((1168, 304), (292, 304))

---
### Train a Linear Regression Model

In [168]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [169]:
lm = Ridge(alpha=1.0, random_state=0)

lm.fit(X_train, y_train)

In [170]:
y_pred = lm.predict(X_val)

In [171]:
mean_absolute_error(y_val, y_pred), mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)

(21346.40604835415, 2520975302.0084486, 0.634950756945355)

#### Feature Importance
Determining which feature in the dataset are the most important

In [172]:
weights = lm.coef_
weights_df = pd.DataFrame({
    'columns': X_train.columns,
    'weight': weights
}).sort_values('weight', ascending=False)

In [173]:
print('Top 5 most important features are: ')
weights_df[:5]

Top 5 most important features are: 


Unnamed: 0,columns,weight
100,Condition2_Norm,60424.726166
99,Condition2_Feedr,40943.858487
132,RoofMatl_WdShngl,39030.100889
86,Neighborhood_StoneBr,33993.802893
101,Condition2_PosA,33891.669301


---
### Prediction

In [175]:
sample_input = { 'MSSubClass': 20, 'MSZoning': 'RL', 'LotFrontage': 77.0, 'LotArea': 9320,
 'Street': 'Pave', 'Alley': np.nan, 'LotShape': 'IR1', 'LandContour': 'Lvl', 'Utilities': 'AllPub',
 'LotConfig': 'Inside', 'LandSlope': 'Gtl', 'Neighborhood': 'NAmes', 'Condition1': 'Norm', 'Condition2': 'Norm',
 'BldgType': '1Fam', 'HouseStyle': '1Story', 'OverallQual': 4, 'OverallCond': 5, 'YearBuilt': 1959,
 'YearRemodAdd': 1959, 'RoofStyle': 'Gable', 'RoofMatl': 'CompShg', 'Exterior1st': 'Plywood',
 'Exterior2nd': 'Plywood', 'MasVnrType': 'None','MasVnrArea': 0.0,'ExterQual': 'TA','ExterCond': 'TA',
 'Foundation': 'CBlock','BsmtQual': 'TA','BsmtCond': 'TA','BsmtExposure': 'No','BsmtFinType1': 'ALQ',
 'BsmtFinSF1': 569,'BsmtFinType2': 'Unf','BsmtFinSF2': 0,'BsmtUnfSF': 381,
 'TotalBsmtSF': 950,'Heating': 'GasA','HeatingQC': 'Fa','CentralAir': 'Y','Electrical': 'SBrkr', '1stFlrSF': 1225,
 '2ndFlrSF': 0, 'LowQualFinSF': 0, 'GrLivArea': 1225, 'BsmtFullBath': 1, 'BsmtHalfBath': 0, 'FullBath': 1,
 'HalfBath': 1, 'BedroomAbvGr': 3, 'KitchenAbvGr': 1,'KitchenQual': 'TA','TotRmsAbvGrd': 6,'Functional': 'Typ',
 'Fireplaces': 0,'FireplaceQu': np.nan,'GarageType': np.nan,'GarageYrBlt': np.nan,'GarageFinish': np.nan,'GarageCars': 0,
 'GarageArea': 0,'GarageQual': np.nan,'GarageCond': np.nan,'PavedDrive': 'Y', 'WoodDeckSF': 352, 'OpenPorchSF': 0,
 'EnclosedPorch': 0,'3SsnPorch': 0, 'ScreenPorch': 0, 'PoolArea': 0, 'PoolQC': np.nan, 'Fence': np.nan, 'MiscFeature': 'Shed',
 'MiscVal': 400, 'MoSold': 1, 'YrSold': 2010, 'SaleType': 'WD', 'SaleCondition': 'Normal'}

pd.DataFrame(sample_input, index=[0])

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RL,77.0,9320,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,Shed,400,1,2010,WD,Normal


In [186]:
input_df = pd.DataFrame(sample_input, index=[0])
input_df[numerical_cols] = imputer.transform(input_df[numerical_cols])

input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

encoder.transform(input_df[categorical_cols].values).toarray()



array([[0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
        0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 

In [191]:
def predict_input(new_input):
    input_df = pd.DataFrame(new_input, index=[0])
    input_df[numerical_cols] = imputer.transform(input_df[numerical_cols])
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])
    input_df_encoded = encoder.transform(input_df[categorical_cols].values).toarray()
    input_df = pd.concat([input_df[numerical_cols], pd.DataFrame(input_df_encoded, columns=encoded_cols)], axis=1)
    result = f'Model prediction: {lm.predict(input_df)[0]}'
    return result

predict_input(sample_input)



'Model prediction: 124447.50831683222'