In [3]:
import numpy as np
import pandas as pd

from scipy.stats import mode

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Locations of the CSV files, relative to the notebook's working directory.
DATASET_PATH = "./csv/housing.csv"  # input table read below
PREPARED_DATASET_PATH = "./csv/housing_prepared.csv"  # second path; not used in the cells shown here

In [6]:
# Load the housing table into a DataFrame; a comma is read_csv's default
# separator, so it does not need to be passed explicitly.
df = pd.read_csv(DATASET_PATH)

In [13]:
# Table dimensions as a (row count, column count) tuple: 20640 rows, 11 columns.
len(df.index), len(df.columns)

(20640, 11)

In [14]:
# Column labels of the frame (for a DataFrame, keys() returns the columns Index).
df.keys()

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'id'],
      dtype='object')

In [15]:
# Row labels of the frame; axes[0] is the row axis, i.e. the index.
df.axes[0]

RangeIndex(start=0, stop=20640, step=1)

In [17]:
# First four values of the total_rooms column, selected positionally.
df.loc[:, 'total_rooms'].iloc[:4]

0     880.0
1    7099.0
2    1467.0
3    1274.0
Name: total_rooms, dtype: float64

In [19]:
# Sub-frame holding only the two room-count columns, all rows kept.
df.loc[:, ['total_rooms', 'total_bedrooms']]

Unnamed: 0,total_rooms,total_bedrooms
0,880.0,129.0
1,7099.0,1106.0
2,1467.0,190.0
3,1274.0,235.0
4,1627.0,280.0
...,...,...
20635,1665.0,374.0
20636,697.0,150.0
20637,2254.0,485.0
20638,1860.0,409.0


In [29]:
# Boolean Series: True where a row's total_rooms is strictly greater than 8300.
df['total_rooms'].gt(8300.0)

0        False
1        False
2        False
3        False
4        False
         ...  
20635    False
20636    False
20637    False
20638    False
20639    False
Name: total_rooms, Length: 20640, dtype: bool

In [37]:
# Keep only the rows whose total_rooms exceeds 8300. This rebinds `df`, so
# every cell that runs afterwards sees the filtered subset, not the full table.
df = df.loc[df['total_rooms'].gt(8300.0)]
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
283,-122.16,37.79,22.0,12842.0,2048.0,4985.0,1967.0,5.9849,371000.0,NEAR BAY,283
570,-122.24,37.72,5.0,18634.0,2885.0,7427.0,2718.0,7.6110,350700.0,NEAR BAY,570
576,-122.06,37.77,12.0,14316.0,2045.0,5781.0,2007.0,7.2634,341600.0,NEAR BAY,576
780,-122.10,37.63,18.0,9963.0,2031.0,5613.0,1946.0,3.8171,187200.0,NEAR BAY,780
864,-122.01,37.57,14.0,16199.0,2993.0,,2847.0,5.8322,281800.0,NEAR BAY,864
...,...,...,...,...,...,...,...,...,...,...,...
20530,-121.76,38.57,11.0,15018.0,3008.0,7984.0,2962.0,3.1371,201800.0,INLAND,20530
20539,-121.71,38.56,20.0,8627.0,1516.0,4071.0,1466.0,4.2198,164100.0,INLAND,20539
20544,-121.76,38.55,23.0,8800.0,1857.0,6330.0,1832.0,2.0650,219400.0,INLAND,20544
20563,-121.75,38.67,9.0,12139.0,2640.0,6837.0,2358.0,3.1250,132500.0,INLAND,20563


In [31]:
# Per-column minimum, maximum and mean over the rows with total_rooms > 8300.
df.loc[df['total_rooms'].gt(8300.0)].agg([np.min, np.max, np.mean])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
amin,-122.62,32.65,2.0,8301.0,1074.0,701.0,254.0,1.3811,22500.0,<1H OCEAN,283.0
amax,121.29,41.61,46.0,39320.0,6445.0,35682.0,6082.0,13.947,500001.0,NEAR OCEAN,20629.0
mean,-118.186025,35.174079,12.459695,12322.6853,2230.772917,5686.595289,1991.144928,4.958602,234050.784679,,10987.360248


In [33]:
# Per-column minimum, maximum and mean over the rows that do NOT satisfy
# total_rooms > 8300 (the complement of the subset summarised above).
#
# `df` was already narrowed to total_rooms > 8300 by an earlier cell, so
# negating the mask on `df` itself selects no rows when the notebook is run
# top to bottom. The complement is therefore taken from a fresh read of the
# unfiltered CSV, using the same read_csv arguments as the initial load.
(
    pd.read_csv(DATASET_PATH, sep=',')
    .loc[lambda raw: ~(raw['total_rooms'] > 8300.0)]  # invert result
    .agg([np.min, np.max, np.mean])
)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
amin,-124.35,-13534.03,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,-,0.0
amax,122.03,1327.13,52.0,8299.0,3179.0,8793.0,2902.0,15.0001,500001.0,NEAR OCEAN,20639.0
mean,-119.502038,35.033647,29.035613,2403.646029,497.145191,1323.754317,463.797986,3.844602,206204.173835,,10303.4968


In [38]:
# Label-based lookup: the row whose index label is 283, returned as a Series.
df.loc[283, :]

longitude              -122.16
latitude                 37.79
housing_median_age        22.0
total_rooms            12842.0
total_bedrooms          2048.0
population              4985.0
households              1967.0
median_income           5.9849
median_house_value    371000.0
ocean_proximity       NEAR BAY
id                         283
Name: 283, dtype: object

In [39]:
# Label-based slice: every row from the start up to and including label 576
# (.loc slices are inclusive of the end label).
df.loc[:576, :]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
283,-122.16,37.79,22.0,12842.0,2048.0,4985.0,1967.0,5.9849,371000.0,NEAR BAY,283
570,-122.24,37.72,5.0,18634.0,2885.0,7427.0,2718.0,7.611,350700.0,NEAR BAY,570
576,-122.06,37.77,12.0,14316.0,2045.0,5781.0,2007.0,7.2634,341600.0,NEAR BAY,576


In [40]:
# Position-based lookup: the first row of the frame, whatever its index label.
df.iloc[0, :]

longitude              -122.16
latitude                 37.79
housing_median_age        22.0
total_rooms            12842.0
total_bedrooms          2048.0
population              4985.0
households              1967.0
median_income           5.9849
median_house_value    371000.0
ocean_proximity       NEAR BAY
id                         283
Name: 283, dtype: object

In [41]:
# Position-based slice: the five rows that precede the last one
# (the end position -1 is exclusive, so the final row is not included).
df.iloc[-6:-1, :]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
20529,-121.78,38.55,12.0,10509.0,2186.0,5633.0,2138.0,2.9605,204300.0,INLAND,20529
20530,-121.76,38.57,11.0,15018.0,3008.0,7984.0,2962.0,3.1371,201800.0,INLAND,20530
20539,-121.71,38.56,20.0,8627.0,1516.0,4071.0,1466.0,4.2198,164100.0,INLAND,20539
20544,-121.76,38.55,23.0,8800.0,1857.0,6330.0,1832.0,2.065,219400.0,INLAND,20544
20563,-121.75,38.67,9.0,12139.0,2640.0,6837.0,2358.0,3.125,132500.0,INLAND,20563
