# Data Preprocessing & Feature Engineering for Machine Learning (Housing Dataset)

## Data Import and first Inspection

1. __Import__ the housing dataset (housing.csv) and __inspect__!

In [None]:
#inspect housing for more than 20000 districts
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats as stats

In [None]:
# Load the housing data; the path is relative to the working directory.
# Each row of the table describes one California district.
df = pd.read_csv(filepath_or_buffer='housing.csv')
df

__Features__:

* **longitude:**  geographic coordinate (district's east-west position)
* **latitude:**  geographic coordinate (district's north-south position)
* **housing_median_age:** median age of houses in district
* **total_rooms:** sum of all rooms in district
* **total_bedrooms:** sum of all bedrooms in district
* **population:** total population in district
* **households:** total households in district
* **median_income:** median household income in district 
* **median_house_value:** median house value in district
* **ocean_proximity:** district's proximity to the ocean

In [None]:
# Column overview: dtypes and non-null counts per column.
# total_bedrooms has missing values, so that column is inspected next.
# The object dtype marks text / categorical columns.
df.info()

In [None]:
# Show the rows whose total_bedrooms entry is missing.
df.loc[df["total_bedrooms"].isnull()]

In [None]:
# List rows that repeat an earlier row; the original run found none.
df.loc[df.duplicated(keep='first')]


In [None]:
# Summary statistics for the numerical columns.
df.describe()

In [None]:
# Passing include="O" (object dtype) summarises the text column(s) instead of the numerical ones.
df.describe(include="O")

In [None]:
# Number of districts per ocean_proximity category.
# The original run showed only 5 districts located on an island.
df["ocean_proximity"].value_counts()

In [None]:
# Frequency of each distinct total_bedrooms value.
df["total_bedrooms"].value_counts()

In [None]:
# Frequency of each distinct total_rooms value.
# The original run showed 18 districts with 1527 rooms.
df["total_rooms"].value_counts()

In [None]:
# Draw one histogram per numerical column.
df.hist(figsize=(20, 14), bins=50)
plt.show()

## Data Cleaning and Creating additional Features

2. __Drop__ all rows with (at least one) missing value(s).

3. __Add__ the additional Feature __"rooms_per_household"__ (should be self-explanatory)

4. __Add__ the additional Feature __"population_per_household"__ (should be self-explanatory)

5. __Add__ the additional Feature __"bedrooms_per_household"__ (should be self-explanatory)

In [None]:
# Drop every row that contains at least one missing value.
# According to the earlier inspection only total_bedrooms has gaps.
df.dropna(inplace=True) #mutates df in place

In [None]:
# Re-check dtypes and non-null counts after dropping the incomplete rows.
df.info()

In [None]:
# New feature: number of rooms per household in each district.
df["rooms_per_household"] = df["total_rooms"] / df["households"]

In [None]:
# The ten largest rooms-per-household ratios.
df["rooms_per_household"].nlargest(n=10)

In [None]:
# The ten smallest rooms-per-household ratios.
# Original note: data points with extreme values should be removed
# (this cell only lists them, it does not remove anything).
df["rooms_per_household"].nsmallest(n=10)

In [None]:
# Look at three specific districts by row label.
# NOTE(review): the labels are hard-coded; presumably they were read off the
# extreme rooms_per_household values listed above — confirm.
df.loc[[1914,5916,8219]]

In [None]:
# New feature: number of people per household in each district.
df["pop_per_household"] = df["population"] / df["households"]

In [None]:
# New feature: number of bedrooms per household in each district.
df["bedrooms_per_household"] = df["total_bedrooms"] / df["households"]

In [None]:
# Summary statistics again, now including the three per-household features.
df.describe()

## Which Factors influence House Prices?

6. __Calculate__ the __Correlation__ between "median_house_value" and all features. Which factors seem to influence house prices/values?

7. __Create__ a Seaborn Regression plot (__jointplot__) with income on the x-axis and house value on the y-axis.

8. __Create__ the following __scatterplot__ (df.plot(kind = "scatter")) with
- longitude on x-axis
- latitude on y-axis
- size (s) of data points determined by population
- color (c) of data points determined by median_house_value

9. Does this look familiar to you? It's California. Let's __add the map__ of California saved in __california.png__.

In [None]:
# Distribution of the target variable median_house_value.
df["median_house_value"].hist(figsize=(12, 8), bins=100)
plt.show()

In [None]:
# Pairwise correlation matrix of the numerical features.
# numeric_only=True is needed because df still contains the text column
# ocean_proximity; pandas >= 2.0 raises a ValueError on it otherwise.
df.corr(numeric_only=True)

In [None]:
# Correlation between median_house_value and every numerical feature.
# numeric_only=True skips the text column ocean_proximity, which would make
# DataFrame.corr raise a ValueError on pandas >= 2.0.
df.corr(numeric_only=True)["median_house_value"]

In [None]:
# Correlation of every numerical feature with median_house_value, largest first.
# numeric_only=True skips the text column ocean_proximity, which would make
# DataFrame.corr raise a ValueError on pandas >= 2.0.
df.corr(numeric_only=True)["median_house_value"].sort_values(ascending=False)
# How to read the coefficients:
# - values close to -1 indicate a strong negative linear relationship
# - values close to +1 indicate a strong positive linear relationship
# - values close to 0 indicate no linear relationship (a non-linear one may still exist)
# Observations from the original run:
# - median_income correlates positively with median_house_value: higher incomes can pay higher prices
# - fewer bedrooms went along with higher prices

In [None]:
# Distribution of median_income.
df["median_income"].hist(figsize=(12, 8), bins=100)
plt.show()

In [None]:
# Regression jointplot: scatter of income vs. house value with a fitted
# linear regression line, plus the marginal distributions.
sns.set(font_scale=1.5)
sns.jointplot(
    x='median_income',
    y='median_house_value',
    data=df,
    kind='reg',
    height=10,
)
plt.show()
# Observation from the original run: a strong positive relationship;
# the higher the income, the higher the house prices.

In [None]:
# Same pair of variables, shown as a kernel density estimate (kde).
sns.set(font_scale=1.5)
sns.jointplot(
    x='median_income',
    y='median_house_value',
    data=df,
    kind='kde',
    height=10,
)
plt.show()

In [None]:
# Geographic scatter plot of all districts:
#   x = longitude, y = latitude
#   marker size  (s) follows the district population
#   marker colour (c) follows median_house_value
df.plot(
    x='longitude',
    y='latitude',
    kind='scatter',
    s=df['population'] / 100,
    c='median_house_value',
    cmap='coolwarm',
    colorbar=True,
    alpha=0.4,
    label='Population',
    figsize=(15, 10),
    fontsize=15,
    sharex=False,
)
plt.xlabel('Longitude', fontsize=14)
plt.ylabel('Latitude', fontsize=14)
plt.legend(fontsize=16)
plt.show()

# The larger a district's population, the larger its marker.

In [None]:
# The scatter plot resembles California, so load a map image to draw underneath it.
import matplotlib.image as mpimage
# imread reads the image file into an array.
california_img=mpimage.imread('california.png')


In [None]:
california_img

In [None]:
# Show the raw map image; without an extent the axes are in pixel coordinates.
plt.figure(figsize=(15,10))
plt.imshow(california_img)
plt.show()

In [None]:
# Show the map with explicit axis limits: extent=[left, right, bottom, top].
# NOTE(review): the four values are presumably the longitude/latitude bounds
# covered by california.png — confirm against the image source.
plt.figure(figsize=(15,10))
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05]) #bounds were looked up by hand
plt.show()

In [None]:
# All districts drawn on top of the California map.
df.plot(
    x='longitude',
    y='latitude',
    kind='scatter',
    s=df['population'] / 100,
    c='median_house_value',
    cmap='coolwarm',
    colorbar=True,
    alpha=0.4,
    label='Population',
    figsize=(15, 10),
    fontsize=15,
    sharex=False,
)
plt.imshow(
    california_img,
    extent=[-124.55, -113.80, 32.45, 42.05],
    alpha=0.5,
    cmap=plt.get_cmap('jet'),
)
plt.xlabel('Longitude', fontsize=14)
plt.ylabel('Latitude', fontsize=14)
plt.legend(fontsize=16)
plt.show()


In [None]:
# Distinct ocean_proximity categories, in order of first appearance.
prox = df["ocean_proximity"].unique()
prox

In [None]:
# Districts in the 'NEAR OCEAN' category, copied so later changes do not touch df.
# The category is selected by its label rather than by its position in
# df.ocean_proximity.unique(), whose order depends on the row order of the data.
df_prox = df[df["ocean_proximity"] == 'NEAR OCEAN'].copy()
df_prox

In [None]:
# Only the districts selected into df_prox (near the ocean), drawn on the map.
df_prox.plot(
    x='longitude',
    y='latitude',
    kind='scatter',
    s=df_prox['population'] / 100,
    c='median_house_value',
    cmap='coolwarm',
    colorbar=True,
    alpha=0.4,
    label='Population',
    figsize=(15, 10),
    fontsize=20,
    sharex=False,
)
plt.imshow(
    california_img,
    extent=[-124.55, -113.80, 32.45, 42.05],
    alpha=0.5,
    cmap=plt.get_cmap('jet'),
)
plt.xlabel('longitude', fontsize=14)
plt.ylabel('latitude', fontsize=14)
plt.legend(fontsize=16)
plt.show()


In [None]:
# Districts in the 'INLAND' category, copied so later changes do not touch df.
# The category is selected by its label rather than by its position in
# df.ocean_proximity.unique(), whose order depends on the row order of the data.
df_prox1 = df[df["ocean_proximity"] == 'INLAND'].copy()
df_prox1

In [None]:
# Only the districts in df_prox1 (inland), drawn on the California map.
df_prox1.plot(kind='scatter',x='longitude',y='latitude',
              s=df_prox1.population/100,label='Population',figsize=(15,10),
              c='median_house_value',cmap='coolwarm',
              colorbar=True,alpha=0.4,fontsize=20,sharex=False)
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap('jet'))

# The axis labels must match the plotted columns: x is longitude, y is latitude.
plt.xlabel('longitude',fontsize=14)
plt.ylabel('latitude',fontsize=14)
plt.legend(fontsize=16)
plt.show()
# Observation from the original run: location (latitude/longitude) is related
# to house prices, but the relationship is not linear.

## Advanced Exploratory Data Analysis with Seaborn

10. __Add__ the additional column __"income_cat"__ with the following income categories:
- lowest 25% -> "Low"
- 25th to 50th percentile -> "Below_Average"
- 50th to 75th percentile -> "Above_Average"
- 75th to 95th percentile -> "High"
- Above 95th percentile -> "Very High"

11. __Create__ (and interpret) the following Seaborn __Countplots__:

12. __Create__ (and interpret) the following Seaborn __Heatmap__ with mean house values for all combinations of income_cat & ocean_proximity:

In [None]:
# Distribution of median_income, used to choose the income bins below.
df["median_income"].hist(figsize=(15, 10), bins=50)
plt.title('Median Income')
plt.show()


In [None]:
# Discretisation (binning): turn the numerical median_income into categories.
# qcut cuts at quantiles; q lists the bin edges as quantiles:
#   0-0.25 holds the lowest 25% of districts, 0.25-0.50 the next 25%, and so on.
# The result assigns each district to one of 5 intervals.
pd.qcut(df["median_income"], q=[0, 0.25, 0.50, 0.75, 0.95, 1])

In [None]:
# Store the income bins as a labelled categorical column.
df["income_cut"] = pd.qcut(
    df["median_income"],
    q=[0, 0.25, 0.50, 0.75, 0.95, 1],
    labels=['low', 'Below Average', 'Above Average', 'High', 'Very High'],
)
df["income_cut"]

In [None]:
# normalize=True returns relative shares instead of absolute counts.
# The quantile edges give 25% each for the three lowest bins, 20% for 'High'
# and 5% for 'Very High'.
df["income_cut"].value_counts(normalize=True)

In [None]:
# Number of districts per income category, split by ocean proximity.
plt.figure(figsize=(12, 8))
sns.set(palette='viridis', font_scale=1.5)
sns.countplot(x='income_cut', hue='ocean_proximity', data=df)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Average median_house_value per income category.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.5)
sns.barplot(x='income_cut', y='median_house_value', data=df, dodge=True)
plt.show()
# Observation from the original run: the higher the median income in a
# district, the higher the house value.

In [None]:
# Average median_house_value per ocean_proximity category.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.5)
sns.barplot(x='ocean_proximity', y='median_house_value', data=df, dodge=True)
plt.show()
# Observation from the original run: house values of inland districts are clearly lower.
# Bar height is the point estimate (seaborn's default estimator is the mean)
# of median_house_value; the small black lines show the 95% confidence interval.

In [None]:
# Mean house value for every combination of income category and ocean proximity.
# income_cut is categorical, so observed is passed explicitly: this keeps the
# current behaviour (all categories kept), avoids the FutureWarning raised by
# pandas 2.1+ and stays stable when the default changes.
matrix = df.groupby(["income_cut", "ocean_proximity"], observed=False).median_house_value.mean()
matrix

In [None]:
# Mean house value per income category (rows) and ocean proximity (columns).
# observed=False is passed explicitly because income_cut is categorical: it
# keeps the current behaviour and avoids the pandas 2.1+ FutureWarning.
# The ISLAND column is dropped because it has very few observations.
matrix = (
    df.groupby(["income_cut", "ocean_proximity"], observed=False)
    .median_house_value.mean()
    .unstack()
    .drop(columns=['ISLAND'])
)


In [None]:
matrix.astype('int') #cast the float means to integers so the table is easier to read

In [None]:
# Heatmap of the mean house values per income category and ocean proximity.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.4)
sns.heatmap(
    matrix.astype('int'),
    annot=True,   # write the value into every cell
    fmt='d',      # plain integers (e.g. 450000) instead of 4.5e+05
    cmap='Reds',
    vmin=90000,   # lower end of the colour scale
    vmax=470000,  # upper end of the colour scale
)
plt.show()
# Light red marks house values near the lower end of the scale (about 100000),
# dark red values near the upper end (about 450000).

## Feature Engineering

In [None]:
# The target to forecast: the median house value of each district.
label = df["median_house_value"].copy()
label

In [None]:
# Everything except the target is a candidate feature.
features = df.drop(columns='median_house_value')
features
# The numerical features should not be used as they are: many ML algorithms
# perform poorly when features have very different scales.
# There are several ways to scale / normalise features.

In [None]:
# Column overview of the feature frame.
features.info()

In [None]:
# Keep only the float columns.
features.select_dtypes(include='float')

In [None]:
# Standardise every float column with scipy.stats.zscore, applied column by column.
# NOTE(review): the z-scores use the mean/std of all rows, including the rows
# that are later sampled into the test set.
feat1 = features.select_dtypes(include='float').apply(stats.zscore)
feat1

In [None]:
# Display floats with only two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
feat1

In [None]:
# Check the standardisation: mean and standard deviation of every column.
feat1.agg(func=["mean", "std"])
# Every column has mean 0 and a standard deviation of (about) 1, which is
# what standardising means; all columns now share the same scale.
# The std is only approximately 1 because stats.zscore defaults to ddof=0
# while pandas' std defaults to ddof=1.

In [None]:
# Column overview of df, to locate the categorical columns.
df.info()

In [None]:
# ML models cannot work with text / categorical data directly; it has to be
# converted into numbers first.
# ocean_proximity and income_cut are the two categorical features; only
# ocean_proximity is used as a model feature.
features["ocean_proximity"]

In [None]:
# Number of districts per ocean_proximity category in the feature frame.
features["ocean_proximity"].value_counts()

In [None]:
# One-hot encoding: one indicator column per ocean_proximity category.
dummies = pd.get_dummies(features["ocean_proximity"])
dummies
# The k indicator columns are redundant (any one follows from the others),
# which causes multicollinearity.
# Some algorithms, such as linear regression, can be harmed by it: there use
# k-1 columns, i.e. 4 of the 5 dummy columns here.
# For random forest regression keep all k dummy columns.

In [None]:
# Join the three pieces side by side (axis=1 concatenates horizontally):
# the standardised numerical columns, the dummy columns and income_cut.
features = pd.concat(objs=[feat1, dummies, df["income_cut"]], axis=1)
features

## Machine Learning - Predicting House Values 

Based on the Exploratory Data Analysis, <br>
- select an appropriate ML Model (e.g. Linear Regression or RandomForest Regression)
- split into Train and Test Set
- prepare/engineer Features
- Fit the Model (Train Set)
- Test the Model (Test Set)

## Splitting the Data into Train and Test set

In [None]:
features

In [None]:
# The model is judged on data points it has not seen before,
# so 20 % of the rows are set aside as the test set.
test_size=0.2

In [None]:
# Draw the test set with DataFrame.sample (X = features, y = labels).
X_test = features.sample(random_state=123, frac=test_size)
X_test
# Plain random sampling risks sampling bias: the sample may not represent the
# whole dataset, e.g. mostly low median_income in the training set and high
# median_income in the test set, which leads to a poor model.
# Stratified sampling keeps the category ratios equal in both sets.

In [None]:
# The larger the dataset, the closer simple random sampling gets to stratified sampling.
# To check how close this sample is, compare the income_cut ratios of the test set
# with those of the full feature set.
# If the ratios are nearly equal, the random sample is acceptable.

In [None]:
# Income-category shares in the test set.
X_test["income_cut"].value_counts(normalize=True)

In [None]:
# Income-category shares in the full feature set, for comparison.
features["income_cut"].value_counts(normalize=True)

In [None]:
X_test.index #row labels of the test set

In [None]:
# The training set is every row of features whose row label is not in the test set.
X_train = features.drop(index=X_test.index).copy()
X_train

In [None]:
# Income-category shares in the training set.
X_train["income_cut"].value_counts(normalize=True)

In [None]:
# Shares in the full feature set again, to compare with the training set.
features["income_cut"].value_counts(normalize=True)

In [None]:
# Shuffle the training set: sampling with frac=1 draws all rows in random
# order, and the result overwrites X_train.
X_train = X_train.sample(random_state=123, frac=1)
X_train

In [None]:
# income_cut was only used to compare category ratios; remove it before training.
# Reassigning with errors='ignore' (instead of an inplace drop) keeps this
# cell re-runnable: a second execution no longer raises a KeyError.
X_train = X_train.drop(columns=['income_cut'], errors='ignore')


In [None]:
# income_cut was only used to compare category ratios; remove it before evaluation.
# Reassigning with errors='ignore' (instead of an inplace drop) keeps this
# cell re-runnable: a second execution no longer raises a KeyError.
X_test = X_test.drop(columns=['income_cut'], errors='ignore')

In [None]:
X_train


In [None]:
X_test

In [None]:
# label holds median_house_value; X_train.index holds the row labels of the
# training set, so this looks up the targets of the training rows.
label.loc[X_train.index]

In [None]:
# Targets of the training rows, in the same (shuffled) order as X_train.
y_train=label.loc[X_train.index]

In [None]:
# Targets of the test rows, in the same order as X_test.
y_test=label.loc[X_test.index]

In [None]:
y_train

In [None]:
y_test

## Training the ML model (Random Forest Regressor)

In [None]:
#house_prices are influenced in a non linear way
#location of house & prices have no linear relationship
#hence, use model that capture non-linear relationship. RandomForestRegressor is such a model 
#RandomForestRegressor can capture non-linear relationship
from sklearn.ensemble import RandomForestRegressor 

In [None]:
# Random forest with hand-picked hyperparameters, chosen to optimise the
# model and reduce overfitting.
forest_reg = RandomForestRegressor(
    n_estimators=500,
    max_depth=75,
    max_features='sqrt',
    min_samples_split=2,
    random_state=42,
)

In [None]:
# Train the forest on the training features and targets.
forest_reg.fit(X_train,y_train)

In [None]:
# How well does the model fit the training set (score of the predictions
# against the known house values)?
forest_reg.score(X=X_train, y=y_train)
# A fit that is too good can indicate overfitting.
# The original run scored about 97 %, which suggests the regressor overfits.

In [None]:
#lower the mean_squre_error closer to 0, the better the fit
from sklearn.metrics import mean_squared_error

In [None]:
# Let the model predict the labels for X_train and keep them in pred.
# These are predicted house values, not the actual ones.
pred = forest_reg.predict(X=X_train)
pred

In [None]:
# Mean squared error between the real and the predicted training labels,
# then its square root (RMSE).
forest_mse = mean_squared_error(y_true=y_train, y_pred=pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
# The original run gave a score of 0.975 and an RMSE of about 17974, so the
# model fits the training data well.
# That alone does not show that it generalises to new, unseen data.

## Evaluating the model on the test set 

In [None]:
forest_reg

In [None]:
# Score on the held-out test set; the original run gave about 0.82.
forest_reg.score(X=X_test, y=y_test)

In [None]:
# Predicted labels for the test set.
pred_y_test = forest_reg.predict(X=X_test)
pred_y_test

In [None]:
# RMSE on the test set; the original run gave about 47360.
forest_pred_mse = mean_squared_error(y_true=y_test, y_pred=pred_y_test)
forest_pred_rmse = np.sqrt(forest_pred_mse)
forest_pred_rmse

In [None]:
# Put the true and the predicted test values side by side.
comp = pd.DataFrame({'True_Values': y_test, 'Predicated_Values': pred_y_test})
comp

In [None]:
# Absolute prediction error per test district.
abs_diff = (comp["True_Values"] - comp["Predicated_Values"]).abs()
abs_diff

In [None]:
# Mean absolute error over the test set.
mean_abs_diff = abs_diff.mean()
mean_abs_diff
# The original run gave about 31664: on average the prediction is off by
# more than 30000 from the actual value.

## Feature importance

In [None]:
# The exploratory analysis concluded that income is the most important factor, followed by location.
# RandomForestRegressor exposes the relative importance of each feature for the fitted model.
forest_reg.feature_importances_

In [None]:
# Label the importances with the feature names taken from X_train and sort
# them from most to least important.
feature_imp = pd.Series(forest_reg.feature_importances_, index=X_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)
feature_imp

In [None]:
# Horizontal bar plot of the importances, sorted in ascending order.
feature_imp.sort_values().plot(kind='barh', figsize=(12, 8))
plt.show()
# Observation from the original run: median_income has the highest importance.
# Unimportant features could be dropped and the model rerun.

In [None]:
print('The End')