# Packages 📦
Run the following cell to load all needed packages.

In [29]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Data 📊
Let's get familiar with the data we will be working with. Start by loading the training data into a pandas DataFrame and extracting high-level summary statistics.

In [30]:
# Read the training CSV (given as a relative path) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")
# Trailing expression: the notebook renders the summary table returned by
# describe() (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


# Data 📊
- As we can observe from the result of the previous cell, the features are on very different scales. This difference in scale makes it ideal for us to normalize the data, bringing all features to a small, common scale.
- We can also observe that some of the features are on a significantly larger scale than others. We will need to scale these features down before normalizing them. We will use *log* to scale these features down.

# Feature & Label 🏷️
- Before scaling and normalizing the data, let's go ahead and split our feature data from our label data.

In [31]:
# Labels: the target column together with the row identifier.
y = df[["Id", "SalePrice"]]
# Features: every remaining column (identifier and target removed).
X = df.drop(columns=["Id", "SalePrice"])

print(f"Label Dataset shape: {y.shape}")
print(f"Feature Dataset shape: {X.shape}")
# A notebook cell only renders its *last* bare expression, so a bare `y.head()`
# placed before `X.head()` would be silently discarded. Render it explicitly
# with IPython's `display` (available as a builtin inside a notebook kernel).
display(y.head())
X.head()

Label Dataset shape: (1460, 2)
Feature Dataset shape: (1460, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# NaN Column Values 🚫
- Let's extract the columns that have missing values. We will need to populate these columns with appropriate data, otherwise we will face issues when training the model.
- For this project, we will replace **Null** values with the column **median** ONLY for numerical columns. Null values in non-numerical columns are replaced with the placeholder category **"Missing"**.

In [32]:
# Map every feature column containing at least one NaN to its positional index in X.
na_columns = {col: i for i, col in enumerate(X.columns) if X[col].isna().any()}

print(f"\nAmount of columns with NaN values: {len(na_columns)}")
print(f"\nColumns with NaN values: {na_columns.keys()}")

# Non-numerical columns with NaNs.
# The dtype is tested with pandas' own predicate rather than by comparing it
# against `type(object)`: `type(object)` is the metaclass `type`, not `object`,
# so that comparison only succeeded through NumPy's dtype coercion and would
# misclassify string columns stored with a non-object dtype.
nan_cols = {
    col: idx
    for col, idx in na_columns.items()
    if not pd.api.types.is_numeric_dtype(X[col])
}
print(f"\nAmount of Non-numerical columns with Null values: {len(nan_cols)}")
print(f"Columns of type object with null values: {nan_cols.keys()}")

# Numerical columns with NaNs: everything in na_columns that is not in nan_cols.
num_cols_na = {col: idx for col, idx in na_columns.items() if col not in nan_cols}
print(f"\nAmount of numerical columns with Null values: {len(num_cols_na)}")
print(f"Numerical columns with null values: {num_cols_na.keys()}")

# Median of each numerical column (pandas skips NaNs when computing it).
# NOTE(review): the medians are computed over all rows of X, i.e. before the
# train/validation split performed later in the notebook, so validation rows
# influence the imputed values. Consider computing them on the training rows only.
na_cols_median_vals = {col: X[col].median() for col in num_cols_na}
print(na_cols_median_vals)

# Fill numerical NaNs with the column median.
for col, median_val in na_cols_median_vals.items():
    X[col] = X[col].fillna(median_val)

# Fill non-numerical NaNs with an explicit "Missing" category.
# Alternative if this does not work well: use the most frequent value of the column.
for col in nan_cols:
    X[col] = X[col].fillna("Missing")

# Re-scan for NaNs to ensure all columns are populated.
na_columns = {col: i for i, col in enumerate(X.columns) if X[col].isna().any()}
print(f"\nColumns with Nulls: {len(na_columns)}")
# Fail loudly here instead of at training time (e.g. a numerical column that is
# entirely NaN has a NaN median and would stay unfilled).
assert not na_columns, f"Columns still containing NaN values: {list(na_columns)}"
X.head()


Amount of columns with NaN values: 19

Columns with NaN values: dict_keys(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])

Amount of Non-numerical columns with Null values: 16
Columns of type object with null values: dict_keys(['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])

Amount of numerical columns with Null values: 3
Columns of type object with null values: dict_keys(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])
{'LotFrontage': np.float64(69.0), 'MasVnrArea': np.float64(0.0), 'GarageYrBlt': np.float64(1980.0)}

Columns with Nulls: 0


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,Inside,...,0,0,Missing,Missing,Missing,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,FR2,...,0,0,Missing,Missing,Missing,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,Missing,Missing,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,FR2,...,0,0,Missing,Missing,Missing,0,12,2008,WD,Normal


# Train & Validation Datasets 📈
- In this part of the notebook, we will normalize our dataset as well as split it into **train** and **validation** datasets.
- For features that are on a larger scale than others, we will scale them down using **log** before normalizing them.
- For normalization, we are going to utilize scikit-learn **StandardScaler** which allows us to normalize our features with **z-score** normalization.

In [36]:
# Let's start by splitting our dataset into train and validation sets.
# A fixed seed makes the shuffled split identical on every
# "Restart Kernel -> Run All"; without it each run yields different subsets.
RANDOM_STATE = 42

print(f"\nFeatures shape before splitting: {X.shape}")
print(f"\nLabel shape before splitting: {y.shape}")
X_train, X_cv, y_train, y_cv = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=RANDOM_STATE
)

print(f"\nTrain Features shape: {X_train.shape}")
print(f"Train Labels shape: {y_train.shape}")
# These two lines report the validation split (previously mislabeled as "Train").
print(f"\nValidation Features shape: {X_cv.shape}")
print(f"Validation Labels shape: {y_cv.shape}")

# Sanity check: the split must neither lose nor duplicate rows.
assert len(X_train) + len(X_cv) == len(X)
assert len(y_train) + len(y_cv) == len(y)

# Now that we have split our datasets into train and cv, let's go ahead and scale and normalize our features



Features shape before splitting: (1460, 79)

Label shape before splitting: (1460, 2)

Train Features shape: (1022, 79)
Train Labels shape: (1022, 2)

Train Features shape: (438, 79)
Train Labels shape: (438, 2)
