In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import linear_model, metrics
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV

In [2]:
# Load the dataset from modified_data.csv (path is relative to the working directory).
data = pd.read_csv("modified_data.csv")

## Step 1: Load the Dataset

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


## Step 2: Prepare Dataset

**Search for Missing Values**

In [4]:
# Remove the 'Id' column (presumably just a row identifier, not a feature).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it after 'Id' is already gone no longer raises
# a KeyError.
data = data.drop(columns="Id", errors="ignore")

In [5]:
# Preview again to confirm the 'Id' column is no longer present.
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [6]:
# (rows, columns) of the frame at this point.
data.shape

(1460, 76)

In [7]:
# Per column: True if it contains at least one missing value.
data.isna().any()

MSSubClass       False
MSZoning         False
LotFrontage       True
LotArea          False
Street           False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 76, dtype: bool

In [8]:
# Snapshot the column names before any cleaning and show how many there are.
original_features = data.columns.tolist()
len(original_features)

76

In [9]:
# Names of the columns that contain at least one missing value, in frame order,
# followed by how many such columns there are.
features_with_missing = [
    column for column in data.columns if data[column].isna().any()
]
len(features_with_missing)

15

In [10]:
# Print, for every column with gaps, the percentage of rows that are missing.
n_rows = len(data)
for column in features_with_missing:
    missing_pct = data[column].isna().sum() / n_rows * 100
    print(column, ' ', missing_pct)

LotFrontage   17.73972602739726
MasVnrType   0.547945205479452
MasVnrArea   0.547945205479452
BsmtQual   2.5342465753424657
BsmtCond   2.5342465753424657
BsmtExposure   2.6027397260273974
BsmtFinType1   2.5342465753424657
BsmtFinType2   2.6027397260273974
Electrical   0.0684931506849315
FireplaceQu   47.26027397260274
GarageType   5.5479452054794525
GarageYrBlt   5.5479452054794525
GarageFinish   5.5479452054794525
GarageQual   5.5479452054794525
GarageCond   5.5479452054794525


In [11]:
# Keep only columns with at least half of their values present, i.e. drop any
# feature where more than 50% of the rows are missing.
min_present = 0.5 * len(data)
data.dropna(axis="columns", thresh=min_present, inplace=True)

In [12]:
# Look at LotFrontage, one of the columns flagged as having missing values.
data['LotFrontage']

0       65.0
1       80.0
2       68.0
3       60.0
4       84.0
        ... 
1455    62.0
1456    85.0
1457    66.0
1458    68.0
1459    75.0
Name: LotFrontage, Length: 1460, dtype: float64

In [13]:
# Look at MasVnrType, another column flagged as having missing values.
data['MasVnrType']

0       BrkFace
1          None
2       BrkFace
3          None
4       BrkFace
         ...   
1455       None
1456      Stone
1457       None
1458       None
1459       None
Name: MasVnrType, Length: 1460, dtype: object

In [14]:
# Imputation plan: which strategy each incomplete column receives.
mean_fill = ['LotFrontage', 'MasVnrArea']  # filled with the column mean
bfill = ['FireplaceQu']                    # filled with the next valid value
# Every other incomplete column is forward-filled. A list comprehension is used
# instead of set arithmetic so the order is stable across kernel restarts
# (iteration order of a set of strings depends on hash randomisation). The list
# is also limited to columns still present in `data`, because
# features_with_missing was computed before sparse columns were dropped.
_already_assigned = set(mean_fill) | set(bfill)
ffill = [
    feature
    for feature in features_with_missing
    if feature not in _already_assigned and feature in data.columns
]

In [15]:
# Show which columns ended up in the forward-fill group.
ffill

['GarageCond',
 'Electrical',
 'BsmtFinType2',
 'GarageFinish',
 'MasVnrType',
 'GarageType',
 'BsmtExposure',
 'GarageYrBlt',
 'BsmtCond',
 'BsmtQual',
 'BsmtFinType1',
 'GarageQual']

In [16]:
# Replace gaps in the mean-imputed columns with each column's own mean.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on data[feature] is chained assignment, which acts on a temporary under
# pandas Copy-on-Write (and warns on pandas 2.x), leaving `data` unchanged.
for feature in mean_fill:
    data[feature] = data[feature].fillna(data[feature].mean())

In [17]:
# Forward-fill: each gap takes the last valid value above it in the column.
# Uses Series.ffill() (fillna(method=...) is deprecated) and assigns the result
# back, because an in-place call on data[feature] is chained assignment and
# does not reliably write through to `data`.
# NOTE(review): a gap in the very first row has nothing to copy from and would
# stay missing — confirm with the isna() check that follows.
for feature in ffill:
    data[feature] = data[feature].ffill()

In [18]:
# Backward-fill: each gap takes the next valid value below it in the column.
# Uses Series.bfill() (fillna(method=...) is deprecated) and assigns the result
# back, because an in-place call on data[feature] is chained assignment and
# does not reliably write through to `data`.
# NOTE(review): a gap in the very last row has nothing to copy from and would
# stay missing — confirm with the isna() check that follows.
for feature in bfill:
    data[feature] = data[feature].bfill()

In [19]:
# Re-check every column for remaining missing values after imputation.
data.isna().any()

MSSubClass       False
MSZoning         False
LotFrontage      False
LotArea          False
Street           False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 76, dtype: bool

### Encode the Dataset

In [20]:
# Pull the object-dtype (string) columns into their own frame; .copy() makes
# it independent of `data`.
cat_data = data.select_dtypes(include='object').copy()

In [21]:
# Preview the categorical subset.
cat_data.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal


In [22]:
# Names of the categorical columns, in frame order; displayed for inspection.
cat_features = cat_data.columns.tolist()
cat_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [27]:
# Split the columns into the groups used for encoding.
nominal = ['MSZoning', 'LandContour', 'LotConfig', 'Neighborhood']  # one-hot encoded below
target = ['SalePrice']

# Order-preserving comprehensions replace the set differences: iterating a set
# of strings yields a different order after every kernel restart, which made
# the column order of the encoded matrix (and thus feature indices) change
# from run to run.
_nominal_lookup = set(nominal)
_categorical_lookup = set(cat_features)

# Every categorical column not listed as nominal is treated as ordinal.
ordinal = [feature for feature in cat_features if feature not in _nominal_lookup]

# Non-categorical columns. Restricted to columns still present in `data`,
# since original_features was captured before sparse columns were dropped.
# This list still contains the target 'SalePrice'; it is removed when X is built.
numerical = [
    feature
    for feature in original_features
    if feature not in _categorical_lookup and feature in data.columns
]

In [None]:
# View the columns classified as numerical (everything that is not categorical).
data[numerical]

In [None]:
# Integer-encode every column in `ordinal`: convert it to pandas' category
# dtype and keep only the integer category codes.
# NOTE(review): codes are assigned from the sorted category labels, so they
# may not reflect the real quality ranking of each feature — confirm this is
# acceptable or supply explicit orderings.
for column in ordinal:
    as_category = data[column].astype('category')
    data[column] = as_category.cat.codes

In [None]:
# One-hot encode the nominal columns (one indicator column per category).
# dtype=int is explicit because recent pandas versions emit boolean indicator
# columns by default; mixing bool with numeric columns turns the later
# np.array(X) conversion into an object-dtype array instead of a numeric one.
df_nominal = pd.get_dummies(data[nominal], dtype=int)

In [None]:
# Columns listed in `ordinal`, taken from `data`.
df_ordinal = data[ordinal]

In [None]:
# Columns listed in `numerical`, taken from `data`.
df_numerical = data[numerical]

In [None]:
# Stitch the three column groups back together side by side:
# numerical first, then the one-hot indicators, then the integer-coded columns.
encoded_data = pd.concat(
    (df_numerical, df_nominal, df_ordinal),
    axis="columns",
)

In [None]:
# Preview the first three rows of the combined, encoded frame.
encoded_data.head(3)

In [None]:
# (rows, columns) of the combined, encoded frame.
encoded_data.shape

### Data Normalization (Standardization)

In [None]:
# Feature matrix: every encoded column except the target.
X = encoded_data.drop(columns='SalePrice')

In [None]:
# Target values. `target` is a list, so this selection yields a one-column
# DataFrame rather than a Series.
y = data[target]
y

In [None]:
# Convert the feature frame to a NumPy array and display it.
X = np.array(X) #Convert to numpy array
X

In [None]:
# Convert the target frame to a NumPy array and display it.
y = np.array(y)
y

In [None]:
# Dimensions of the feature array.
X.shape

In [None]:
# Dimensions of the target array.
y.shape

In [None]:
# Standardise features and target to zero mean and unit variance.
# The fitted scalers are kept in named variables (previously they were thrown
# away) so that predictions made on the scaled target can be mapped back to
# the original units with y_scaler.inverse_transform(...), and new inputs can
# be scaled consistently with x_scaler.transform(...).
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X = x_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)

### PCA Dimensionality Reduction

In [None]:
# Project the standardised features onto their first two principal components
# and display the projected coordinates.
pca = PCA(n_components=2) # Specify number of principal components
p_components = pca.fit_transform(X)
p_components

In [None]:
# Wrap the two component columns in a DataFrame with readable names.
df_p_components = pd.DataFrame(p_components, columns=['PC1','PC2'])
df_p_components

In [None]:
# Fraction of the total variance captured by each of the two components.
pca.explained_variance_ratio_ # Information from principal components

In [None]:
# Second PCA: a float between 0 and 1 asks for as many components as are
# needed to explain that share (here 90%) of the variance.
pca_var = PCA(0.9) # Specifying variance
new_pcs = pca_var.fit_transform(X)

In [None]:
# Number of components the variance-based PCA ended up keeping.
pca_var.n_components_

In [None]:
# Wrap the standardised target in a DataFrame with a single column named 'y'.
y = pd.DataFrame(data=y, columns=['y'])

In [None]:
# Place the two principal components next to the target and preview three rows.
new_features = pd.concat([df_p_components, y], axis=1)
new_features.head(3)

### Training a Single Model

In [28]:
# Lasso regression with the regularisation strength chosen by cross-validation;
# all settings are left at the library defaults.
regressor = LassoCV()

In [None]:
# Fit the cross-validated Lasso on the standardised features.
# At this point `y` is a one-column DataFrame (shape (n, 1)); the estimator
# expects a 1-D target for single-output regression and otherwise emits a
# DataConversionWarning, so the column is flattened explicitly.
# NOTE(review): the model is trained on the full standardised X, not on the
# PCA components computed earlier — confirm that is intended.
regressor.fit(X, np.ravel(y))