<a href="https://colab.research.google.com/github/LCherop/ML_Tasks/blob/main/Machine_Learning_Lab_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import the required libraries

In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split

### Import the csv and read the csv file using Pandas

In [2]:
housing_df = pd.read_csv(r'/content/modified_data.csv')
housing_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [3]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

### Preparing for cleaning

In [4]:
#Drop the ID column because it will not be useful in training
housing_df.drop('Id',inplace=True,axis=1)

In [5]:
housing_df.head(4)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000


### Fill in missing values

In [6]:
#find out how many columns have missing values
features_with_null=list(housing_df.columns[housing_df.isna().any()])
len(features_with_null)

15

In [7]:
#Calculate percentages of missing values for each column
for feature in features_with_null:
  print(feature,' ',housing_df[feature].isna().sum()/len(housing_df)*100,' ',housing_df[feature].dtype)


LotFrontage   17.73972602739726   float64
MasVnrType   0.547945205479452   object
MasVnrArea   0.547945205479452   float64
BsmtQual   2.5342465753424657   object
BsmtCond   2.5342465753424657   object
BsmtExposure   2.6027397260273974   object
BsmtFinType1   2.5342465753424657   object
BsmtFinType2   2.6027397260273974   object
Electrical   0.0684931506849315   object
FireplaceQu   47.26027397260274   object
GarageType   5.5479452054794525   object
GarageYrBlt   5.5479452054794525   float64
GarageFinish   5.5479452054794525   object
GarageQual   5.5479452054794525   object
GarageCond   5.5479452054794525   object


In [8]:
#Fill in missing values in features with float64 dtype using mean fill
for i in features_with_null:
  if housing_df[i].dtype == 'float64':
    housing_df[i].fillna(housing_df[i].mean(),inplace=True)

In [9]:
#Fill in missing values in features with object dtype using backward fill
for i in features_with_null:
  if housing_df[i].dtype == 'object':
    housing_df[i].fillna(method='bfill',inplace=True)

In [10]:
#Check if any missing values are still present
housing_df.columns[housing_df.isna().any()]

Index(['FireplaceQu'], dtype='object')

In [11]:
#Fill in missing value in FireplaceQu using ffil
housing_df['FireplaceQu'].fillna(method='ffill',inplace=True)

### Encode the Dataset

In [12]:
#Have all features in a list

all_features=list(housing_df.columns)
len(all_features)

76

In [13]:
#Create list of all categorical features
categorical_features=list(housing_df.select_dtypes(include=['object']).columns)
len(categorical_features)

39

In [14]:
#Create list of all non-categorical features
num_features = list(set(all_features)-set(categorical_features))
len(num_features)

37

In [15]:
target = ['SalePrice']

In [16]:
#Split categorical features into nominal and ordinal
nominal=['MSZoning','LandContour','Neighborhood']
ordinal=list(set(categorical_features)-set(nominal))

In [39]:
#Encode ordinal features
for j in ordinal:
  housing_df[j]=housing_df[j].astype('category').cat.codes

#Assign encoded ordinal data to a data frame
df_ordinal=housing_df[ordinal]

In [18]:
#Encode nominal features and assign to data frame
df_nominal=pd.get_dummies(housing_df[nominal])

In [19]:
#Assign numerical data to a data frame
housing_df[num_features]

Unnamed: 0,BsmtFullBath,MoSold,BedroomAbvGr,Fireplaces,MiscVal,LotArea,EnclosedPorch,FullBath,BsmtHalfBath,BsmtUnfSF,...,MasVnrArea,ScreenPorch,BsmtFinSF1,GarageCars,KitchenAbvGr,GarageArea,3SsnPorch,WoodDeckSF,YearBuilt,OverallCond
0,1,2,3,0,0,8450,0,2,0,150,...,196.0,0,706,2,1,548,0,0,2003,5
1,0,5,3,1,0,9600,0,2,1,284,...,0.0,0,978,2,1,460,0,298,1976,8
2,1,9,3,1,0,11250,0,2,0,434,...,162.0,0,486,2,1,608,0,0,2001,5
3,1,2,3,1,0,9550,272,1,0,540,...,0.0,0,216,3,1,642,0,0,1915,5
4,1,12,4,1,0,14260,0,2,0,490,...,350.0,0,655,3,1,836,0,192,2000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,8,3,1,0,7917,0,2,0,953,...,0.0,0,0,2,1,460,0,0,1999,5
1456,1,2,3,2,0,13175,0,2,0,589,...,119.0,0,790,2,1,500,0,349,1978,6
1457,0,5,4,2,2500,9042,0,2,0,877,...,0.0,0,275,1,1,252,0,0,1941,9
1458,1,4,2,0,0,9717,112,1,0,0,...,0.0,0,49,1,1,240,0,366,1950,6


In [40]:
df_ordinal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   BsmtFinType2   1460 non-null   int8 
 1   CentralAir     1460 non-null   int8 
 2   SaleCondition  1460 non-null   int8 
 3   HouseStyle     1460 non-null   int8 
 4   LotConfig      1460 non-null   int8 
 5   SaleType       1460 non-null   int8 
 6   RoofMatl       1460 non-null   int8 
 7   RoofStyle      1460 non-null   int8 
 8   HeatingQC      1460 non-null   int8 
 9   BsmtCond       1460 non-null   int8 
 10  Utilities      1460 non-null   int8 
 11  MasVnrType     1460 non-null   int8 
 12  BsmtQual       1460 non-null   int8 
 13  Heating        1460 non-null   int8 
 14  KitchenQual    1460 non-null   int8 
 15  LotShape       1460 non-null   int8 
 16  ExterCond      1460 non-null   int8 
 17  BsmtExposure   1460 non-null   int8 
 18  GarageQual     1460 non-null   int8 
 19  Founda

In [41]:
#Concantenate all encoded data to a new dataframe
encoded_data = pd.concat([df_nominal,df_ordinal,housing_df[num_features]],axis=1)

In [21]:
encoded_data.shape

(1460, 107)

### Standardize the dataset

In [42]:
#drop the target column
encoded_data.drop('SalePrice',axis = 1, inplace = True)

In [43]:
encoded_data.shape

(1460, 106)

In [44]:
encoded_data.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,Neighborhood_Blmngtn,...,MasVnrArea,ScreenPorch,BsmtFinSF1,GarageCars,KitchenAbvGr,GarageArea,3SsnPorch,WoodDeckSF,YearBuilt,OverallCond
0,0,0,0,1,0,0,0,0,1,0,...,196.0,0,706,2,1,548,0,0,2003,5
1,0,0,0,1,0,0,0,0,1,0,...,0.0,0,978,2,1,460,0,298,1976,8
2,0,0,0,1,0,0,0,0,1,0,...,162.0,0,486,2,1,608,0,0,2001,5
3,0,0,0,1,0,0,0,0,1,0,...,0.0,0,216,3,1,642,0,0,1915,5
4,0,0,0,1,0,0,0,0,1,0,...,350.0,0,655,3,1,836,0,192,2000,5


In [45]:
#define features(x) and target(y)
x = encoded_data.to_numpy()
y = housing_df[target]
y = y.to_numpy()

In [46]:
x.shape

(1460, 106)

In [48]:
X = StandardScaler().fit_transform(x)

In [49]:
y = StandardScaler().fit_transform(y)


### Feature Extraction using PCA

In [59]:
pca = PCA(n_components=2)

In [60]:
p_components=pca.fit_transform(x)

In [61]:
pca.explained_variance_ratio_

array([0.98479253, 0.00495046])

### Feature Extraction using L1

### Training the Regression Model

In [62]:
reg=LassoCV()

In [63]:
reg.fit(p_components,y)

  y = column_or_1d(y, warn=True)


LassoCV()

In [64]:
reg.coef_

array([2.64258294e-05, 1.02857201e-03])