## Prerequisite

In [None]:
# helper packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
%matplotlib inline
import sklearn 
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Cali dataset

In [None]:
# Read the California housing dataset from the project-relative CSV into `cali`.
cali = pd.read_csv("../data/housing.csv")
# Show dimensions as the cell output: (number of rows, number of columns).
cali.shape

In [None]:
# Print column names, non-null counts and dtypes for a first schema check.
cali.info()

In [None]:
# Peek at 20 random rows; the fixed seed keeps the displayed rows identical
# across "Restart Kernel -> Run All" executions.
cali.sample(20, random_state=42)

In [None]:
# Number of missing values in each column.
cali.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly.
cali.duplicated(keep="first").sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
cali.describe()

In [None]:
# Pairwise correlations of the numeric columns only. `cali` still contains the
# string column `ocean_proximity`, which DataFrame.corr() refuses to handle on
# pandas >= 2.0, so it is excluded explicitly.
cali.select_dtypes(include="number").corr()

In [None]:
# The figure must be created in the same cell as the plot: with the inline
# backend a figure opened in its own cell is rendered (empty) and closed there,
# so the heatmap would otherwise be drawn on a new default-sized figure.
plt.figure(figsize=(15, 10))
# Correlate numeric columns only; the string column `ocean_proximity` would
# make DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(
    cali.select_dtypes(include="number").corr(),
    cmap='mako',
    annot=True,
    vmin=-1,
    vmax=1,
);

In [None]:
# Features: every column except the target.
X = cali.drop(columns="median_house_value")
X.head()

In [None]:
# Target variable: the median house value column as a Series.
Y = cali.loc[:, "median_house_value"]
Y.head()

## Check for non-informative predictors
### Feature variance
After testing for feature variance, we could see that there were no constant features, so none were removed.

In [None]:
# Zero-variance check on the predictors. The categorical column
# `ocean_proximity` is left out because the filter works on numeric data.
numeric_predictors = X.drop(columns=["ocean_proximity"])
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(numeric_predictors)
# Boolean mask over the fitted columns: True marks a non-constant column.
var_thres.get_support()

In [None]:
# Lump the rare coastal levels into a single 'OTHER' level (this modifies `cali`).
cali["ocean_proximity"] = cali["ocean_proximity"].replace(['NEAR OCEAN', 'NEAR BAY', 'ISLAND'], 'OTHER')
# Dummy-encode for the regression below. dtype=float is required because
# pandas >= 2.0 emits boolean dummies by default; a frame mixing bool and float
# columns is converted to an object array, which statsmodels OLS rejects.
cali2 = pd.get_dummies(data=cali, drop_first=True, dtype=float)
# Drop rows with missing values (OLS cannot be fitted on NaN).
cali2 = cali2.dropna()

In [None]:
# Split the encoded frame into predictors and response for the baseline model.
target = cali2.loc[:, 'median_house_value']
features = cali2.drop(columns=['median_house_value'])

In [None]:
# Baseline ordinary least squares fit on the untransformed data.
# The design matrix is passed as-is, so no intercept column is added here.
model = sm.OLS(endog=target, exog=features)
fit = model.fit()
fit.summary()

## Check for missingness

In [None]:
# Visualise the missing-value pattern of `cali` with missingno's nullity matrix.
msno.matrix(
    cali,
    labels=True,
    filter="bottom",
    sort="ascending",
    n=50,
)

In [None]:
# Inspect 20 of the rows whose `total_bedrooms` is missing; the fixed seed keeps
# the displayed rows the same on every run.
cali[cali['total_bedrooms'].isna()].sample(20, random_state=42)

Why is it missing?

- From the mask, the values appear to be missing at random, and only 207 rows are affected.

In [None]:
# Remove every row that has at least one missing value.
cali = cali.dropna(axis=0, how="any")

## Check for extreme values
### outlier detection
Note that extreme value detection and removal only applies to numerical features. We first draw a boxplot of a feature; if it shows outliers, we run the outlier removal step on it.

In [None]:

# Boxplots of the four count-like features to look for extreme values.
count_columns = ["total_rooms", "total_bedrooms", "population", "households"]
cali.boxplot(column=count_columns)

In [None]:
# Horizontal boxplot of median income.
sns.boxplot(data=cali, x="median_income")

### outlier removal

In [None]:
def mask_iqr_outliers(frame, columns, whisker=1.5):
    """Return a copy of ``frame`` with IQR outliers in ``columns`` set to NaN.

    For each column the 25th and 75th percentiles are computed and any value
    outside ``[q25 - whisker * IQR, q75 + whisker * IQR]`` is replaced by NaN.
    Rows are not dropped here, so every column's quartiles are computed over
    the same set of rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Names of the numeric columns to process, in order.
    whisker : float, default 1.5
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with out-of-fence values replaced by NaN.
    """
    masked = frame.copy()
    for col in columns:
        q25, q75 = np.percentile(masked[col], [25, 75])
        iqr = q75 - q25
        # Named `lower`/`upper` so the builtins `min` and `max` are not rebound
        # for the rest of the notebook.
        lower = q25 - whisker * iqr
        upper = q75 + whisker * iqr
        outside = (masked[col] < lower) | (masked[col] > upper)
        masked.loc[outside, col] = np.nan
    return masked


# Same columns, same order and same 1.5 * IQR rule as the previous
# one-cell-per-column version.
cali = mask_iqr_outliers(
    cali,
    ["median_income", "total_rooms", "total_bedrooms", "population", "households"],
)

In [None]:
# Missing values per column after the outliers were replaced by NaN.
cali.isna().sum()

In [None]:
# Drop the rows in which an outlier was masked.
cali = cali.dropna()

In [None]:
# Re-draw the boxplots of the count-like features after outlier removal.
count_columns = ["total_rooms", "total_bedrooms", "population", "households"]
cali.boxplot(column=count_columns)

## Check for skewness

In [None]:
# Pairwise scatter plots and per-feature histograms; the trailing semicolon
# suppresses the textual repr of the returned grid.
sns.pairplot(cali, plot_kws={});

From this plot we can see that total_rooms, total_bedrooms, population, households and median_income are right-skewed, so we apply a log transformation to them.

In [None]:
# Add a natural-log version of each skewed column as `log_<column>`.
for column in (
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
):
    cali[f"log_{column}"] = np.log(cali[column])

In [None]:
# Lump the rare coastal levels into a single 'OTHER' level.
cali["ocean_proximity"] = cali["ocean_proximity"].replace(['NEAR OCEAN', 'NEAR BAY', 'ISLAND'], 'OTHER')
# Dummy-encode and overwrite `cali` with the encoded frame. dtype=float is
# required because pandas >= 2.0 emits boolean dummies by default; a frame
# mixing bool and float columns is converted to an object array, which
# statsmodels OLS rejects.
cali = pd.get_dummies(data=cali, drop_first=True, dtype=float)

In [None]:

# Predictors for the log model: remove both forms of the target, the raw count
# columns (their log versions stay in) and the log of median income (the raw
# median income stays in).
excluded = [
    'log_median_house_value',
    'median_house_value',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'log_median_income',
]
features = cali.drop(columns=excluded)
target = cali['log_median_house_value']

In [None]:
# Ordinary least squares fit of the log target on the transformed predictors.
# The design matrix is passed as-is, so the model has no intercept column.
model = sm.OLS(endog=target, exog=features)
fit = model.fit()
fit.summary()

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 1.46e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

In [None]:
# One-column heatmap: correlation of every column with the log target,
# sorted from most positive to most negative.
plt.figure(figsize=(8, 12))
target_corr = (
    cali.corr()[['log_median_house_value']]
    .sort_values(by='log_median_house_value', ascending=False)
)
palette = sns.diverging_palette(250, 30, l=65, as_cmap=True)
heatmap = sns.heatmap(target_corr, vmin=-1, vmax=1, annot=True, cmap=palette)


## Data Standardization

In [None]:
# Exclude BOTH forms of the target from the predictors: `cali` also carries
# `log_median_house_value`, which is exactly `y` below, so leaving it in `X`
# would leak the target into the features.
X = cali.drop(columns=['median_house_value', 'log_median_house_value'])
y = np.log(cali.median_house_value)  # Applying log transformation

In [None]:
# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(X)
# The scaler returns a bare array, so wrap it back into a DataFrame that keeps
# the original row index and column names.
X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

In [None]:
# Display the feature matrix produced by the scaling cell.
X

## Check for imbalanced distributions
Check for factor variables where some levels are very common while others are very rare. After running value counts on the ocean proximity feature, we can see that 'NEAR OCEAN', 'NEAR BAY' and 'ISLAND' have much lower counts; in this case we can use lumping to group them together into one category called 'OTHER'.

In [None]:
# Frequency of each `ocean_proximity` level.
# NOTE(review): an earlier cell reassigns `cali` to the output of
# pd.get_dummies(...), so on a top-to-bottom run the `ocean_proximity` column
# presumably no longer exists here and this lookup raises KeyError. Consider
# moving this section above the encoding (and above the earlier lumping) so it
# shows the raw level counts.
cali["ocean_proximity"].value_counts()

In [None]:
# Replace the levels 'NEAR OCEAN', 'NEAR BAY' and 'ISLAND' with 'OTHER'.
# NOTE(review): the same replacement already runs in earlier cells, and `cali`
# has been dummy-encoded by then, so on a top-to-bottom run `ocean_proximity`
# is presumably missing and this line raises KeyError. Confirm the intended
# cell order.
cali["ocean_proximity"] = cali["ocean_proximity"].replace(['NEAR OCEAN', 'NEAR BAY', 'ISLAND'], 'OTHER')

In [None]:
# Level frequencies after lumping.
# NOTE(review): depends on `cali` still having an `ocean_proximity` column,
# which an earlier pd.get_dummies(...) reassignment presumably removed.
cali["ocean_proximity"].value_counts()

## Check for redundant features

## Dimension reduction

## One-hot encoding / Dummy encoding

In [None]:
# encode categorical variables as numeric using dummy encoding
data_encoded=pd.get_dummies(data=cali,drop_first=True)
data_encoded

# Target Transformation
## Check for skewness