In [50]:
import os

import numpy as np
import pandas as pd

In [51]:
# Location of the housing CSV. The original absolute path is kept as the
# default so existing runs behave exactly as before; set the HOUSING_CSV
# environment variable to point at the file on any other machine.
HOUSING_CSV = os.environ.get(
    'HOUSING_CSV',
    r'C:\Users\DELL\Documents\Data Analytics\Datasets\housing\housing.csv',
)
housing = pd.read_csv(HOUSING_CSV)

### Splitting the dataset

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed random_state makes
# the partition reproducible across runs.
train, test = train_test_split(housing, random_state=42, test_size=0.2)

In [53]:
# Row and column counts of the held-out test split.
test.shape

(4128, 10)

### Cleaning the dataset

In [54]:
# Per-column dtypes and non-null counts for the test split; a non-null count
# below the row count marks a column with missing values.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 20046 to 3665
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   float64
 2   housing_median_age  4128 non-null   float64
 3   total_rooms         4128 non-null   float64
 4   total_bedrooms      3921 non-null   float64
 5   population          4128 non-null   float64
 6   households          4128 non-null   float64
 7   median_income       4128 non-null   float64
 8   median_house_value  4128 non-null   float64
 9   ocean_proximity     4128 non-null   object 
dtypes: float64(9), object(1)
memory usage: 354.8+ KB


### Feature Engineering 

In [55]:
# Add per-household / per-room ratio features.
#
# `test` is a frame selected out of `housing` by train_test_split, so writing
# new columns onto it with `test[...] = ...` can raise SettingWithCopyWarning.
# `assign` builds a new frame instead, and `test` is rebound to it. Because
# every ratio is computed from the original columns, re-running the cell
# yields the same result.
test = test.assign(
    rooms_per_household=test['total_rooms'] / test['households'],
    population_per_household=test['population'] / test['households'],
    # Rows where total_bedrooms is missing produce NaN here as well.
    bedrooms_per_rooms=test['total_bedrooms'] / test['total_rooms'],
)

In [56]:
# Everything except the 'ocean_proximity' column, which is handled
# separately from the numeric attributes further down.
test_num = test.drop(columns=['ocean_proximity'])
test_num

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_rooms
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,4.192201,3.877437,
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,5.039384,2.679795,
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,3.977155,1.360332,
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,6.163636,3.444444,
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,5.492991,2.483645,
...,...,...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,7.002212,2.988938,0.152291
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,6.131915,2.340426,0.204950
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,7.237676,2.790493,0.130868
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,5.289030,2.588608,0.185880


### Feature Scaling and Final Preparation

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer



In [72]:

# Build the numeric feature matrix straight from `test` so that this cell is
# idempotent. Dropping 'median_house_value' from `test_num` itself raises
# KeyError the second time the cell is run, because the column is already gone.
test_num = test.drop(columns=['ocean_proximity', 'median_house_value'])

# Fill missing values with the column median, then standardize each column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# NOTE(review): the imputer medians and scaler statistics are learned from the
# test split here. If a pipeline fitted on the training split exists elsewhere,
# it should be reused with `.transform(test_num)` instead -- confirm.
test_num_tr = num_pipeline.fit_transform(test_num)
pd.DataFrame(test_num_tr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.255417,0.221941,-0.300740,-0.500566,-0.241970,-0.026548,-0.357860,-1.142376,-0.433677,0.520224,-0.160427
1,0.029766,-0.209477,0.098724,0.150380,-0.241970,0.128764,0.222450,-0.690505,-0.130503,-0.162932,-0.160427
2,-1.464546,1.037884,1.856363,0.551903,-0.241970,-0.100164,1.199951,-0.186169,-0.510634,-0.915576,-0.160427
3,0.400837,-0.612759,-0.939881,0.199269,-0.241970,0.254450,-0.007095,1.013807,0.271824,0.273237,-0.160427
4,-1.208808,0.484544,0.418295,-0.117603,-0.241970,-0.321910,-0.179898,-0.055993,0.031825,-0.274819,-0.160427
...,...,...,...,...,...,...,...,...,...,...,...
4123,1.153008,-1.044178,-1.019774,0.250874,-0.113687,-0.063356,-0.117998,0.411772,0.571918,0.013409,-1.080386
4124,-0.657216,-0.106312,-0.061061,0.775071,0.846060,0.205073,0.534528,-0.586693,0.260472,-0.356514,-0.148198
4125,-1.268982,0.808107,-0.300740,0.679104,0.019347,0.146719,0.181184,2.870089,0.656181,-0.099788,-1.459606
4126,-0.120668,0.554884,0.578080,-0.046986,-0.151697,-0.174678,-0.061257,-0.555650,-0.041164,-0.214947,-0.485791


In [73]:

# Column groups routed to each branch of the transformer.
cat_attribs = ['ocean_proximity']
num_attribs = test_num.columns.tolist()

# Numeric columns go through the impute-and-scale pipeline; the categorical
# column is one-hot encoded. Columns named in neither list are dropped
# (ColumnTransformer's default `remainder`).
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ]
)

# NOTE(review): this fits the transformers on the test split itself; if a
# transformer fitted on the training split is available, prefer
# `.transform(test)` -- confirm against the training notebook.
test_prepared = full_pipeline.fit_transform(test)

In [74]:
# Wrap the transformed array in a DataFrame for display only. Columns are
# positional: the scaled numeric attributes come first, followed by the
# one-hot indicators for 'ocean_proximity'.
pd.DataFrame(test_prepared)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.255417,0.221941,-0.300740,-0.500566,-0.241970,-0.026548,-0.357860,-1.142376,-0.433677,0.520224,-0.160427,0.0,1.0,0.0,0.0,0.0
1,0.029766,-0.209477,0.098724,0.150380,-0.241970,0.128764,0.222450,-0.690505,-0.130503,-0.162932,-0.160427,0.0,1.0,0.0,0.0,0.0
2,-1.464546,1.037884,1.856363,0.551903,-0.241970,-0.100164,1.199951,-0.186169,-0.510634,-0.915576,-0.160427,0.0,0.0,0.0,1.0,0.0
3,0.400837,-0.612759,-0.939881,0.199269,-0.241970,0.254450,-0.007095,1.013807,0.271824,0.273237,-0.160427,1.0,0.0,0.0,0.0,0.0
4,-1.208808,0.484544,0.418295,-0.117603,-0.241970,-0.321910,-0.179898,-0.055993,0.031825,-0.274819,-0.160427,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4123,1.153008,-1.044178,-1.019774,0.250874,-0.113687,-0.063356,-0.117998,0.411772,0.571918,0.013409,-1.080386,1.0,0.0,0.0,0.0,0.0
4124,-0.657216,-0.106312,-0.061061,0.775071,0.846060,0.205073,0.534528,-0.586693,0.260472,-0.356514,-0.148198,0.0,0.0,0.0,0.0,1.0
4125,-1.268982,0.808107,-0.300740,0.679104,0.019347,0.146719,0.181184,2.870089,0.656181,-0.099788,-1.459606,1.0,0.0,0.0,0.0,0.0
4126,-0.120668,0.554884,0.578080,-0.046986,-0.151697,-0.174678,-0.061257,-0.555650,-0.041164,-0.214947,-0.485791,0.0,1.0,0.0,0.0,0.0
