In [1]:
import numpy as np
import pandas as pd

from scipy.stats import mode

import warnings
warnings.filterwarnings('ignore')

In [2]:
DATASET_PATH = './csv/housing_emissions.csv'

In [3]:
df = pd.read_csv(DATASET_PATH, sep=',')
df.head(4)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id,housing_median_age_nan,ocean_proximity_nan,longitude_outlier,latitude_outlier
0,0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0,0,0,0,0
1,1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1,0,0,0,0
2,2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2,0,0,0,0
3,3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3,0,0,0,0


In [4]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'longitude', 'latitude',
       'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
       'households', 'median_income', 'median_house_value', 'ocean_proximity',
       'id', 'housing_median_age_nan', 'ocean_proximity_nan',
       'longitude_outlier', 'latitude_outlier'],
      dtype='object')

In [5]:
df.drop(columns='id', inplace=True)

In [6]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'longitude', 'latitude',
       'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
       'households', 'median_income', 'median_house_value', 'ocean_proximity',
       'housing_median_age_nan', 'ocean_proximity_nan', 'longitude_outlier',
       'latitude_outlier'],
      dtype='object')

#### Quantitative variables

In [7]:
# Доля спален в общем количестве комнат
df['bedroom_share'] = df['total_bedrooms'] / df['total_rooms'] * 100

In [8]:
# Сколько человек в среднем живет в одной комнате
df['population_per_room'] = df['population'] / df['total_rooms']

In [9]:
df[['bedroom_share', 'population_per_room']].head()

Unnamed: 0,bedroom_share,population_per_room
0,14.659091,0.365909
1,15.579659,0.338217
2,12.951602,0.338105
3,18.44584,0.437991
4,17.209588,0.347265


#### Categorical variables

### Variant 1

Все строковые значения нужно перевести в числовые - значения понятные для машины.

In [10]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9150
INLAND        6542
NEAR OCEAN    2655
NEAR BAY      2288
ISLAND           5
Name: ocean_proximity, dtype: int64

In [11]:
# Создаем столбцы с названиями значений и единице при существовании нового признака
pd.get_dummies(df['ocean_proximity'])

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
20635,0,1,0,0,0
20636,0,1,0,0,0
20637,0,1,0,0,0
20638,0,1,0,0,0


In [12]:
df = pd.concat([df, pd.get_dummies(df['ocean_proximity'])], axis=1)
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,...,ocean_proximity_nan,longitude_outlier,latitude_outlier,bedroom_share,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,...,0,0,0,14.659091,0.365909,0,0,0,1,0
1,1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,...,0,0,0,15.579659,0.338217,0,0,0,1,0
2,2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,...,0,0,0,12.951602,0.338105,0,0,0,1,0
3,3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,...,0,0,0,18.445840,0.437991,0,0,0,1,0
4,4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,...,0,0,0,17.209588,0.347265,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,20635,20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,...,0,0,0,22.462462,0.507508,0,1,0,0,0
20636,20636,20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,...,0,0,0,21.520803,0.510760,0,1,0,0,0
20637,20637,20637,-121.22,39.43,17.0,2254.0,485.0,1165.0,433.0,1.7000,...,0,0,0,21.517303,0.516859,0,1,0,0,0
20638,20638,20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,...,0,0,0,21.989247,0.398387,0,1,0,0,0


### Variant 2

#### Feature encoding

Описание категорий каким-то числом

In [13]:
df_cat = df.groupby('ocean_proximity').median()['total_bedrooms']
df_cat

ocean_proximity
<1H OCEAN     435.0
INLAND        426.0
ISLAND        512.0
NEAR BAY      425.5
NEAR OCEAN    461.0
Name: total_bedrooms, dtype: float64

In [14]:
df_cat = pd.DataFrame(df_cat)
df_cat

Unnamed: 0_level_0,total_bedrooms
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,435.0
INLAND,426.0
ISLAND,512.0
NEAR BAY,425.5
NEAR OCEAN,461.0


In [15]:
df_cat = df.groupby('ocean_proximity', as_index=False).median()[['ocean_proximity', 'total_bedrooms']]
df_cat = pd.DataFrame(df_cat)
df_cat

Unnamed: 0,ocean_proximity,total_bedrooms
0,<1H OCEAN,435.0
1,INLAND,426.0
2,ISLAND,512.0
3,NEAR BAY,425.5
4,NEAR OCEAN,461.0


In [16]:
df_cat.rename(columns={'total_bedrooms': 'median_rooms'}, inplace=True)
df_cat.sort_values(by='median_rooms')

Unnamed: 0,ocean_proximity,median_rooms
3,NEAR BAY,425.5
1,INLAND,426.0
0,<1H OCEAN,435.0
4,NEAR OCEAN,461.0
2,ISLAND,512.0


In [17]:
df['total_bedrooms'].median()

435.0

In [18]:
df = df.merge(df_cat, on=['ocean_proximity'])
df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,...,longitude_outlier,latitude_outlier,bedroom_share,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,median_rooms
0,0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,...,0,0,14.659091,0.365909,0,0,0,1,0,425.5
1,1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,...,0,0,15.579659,0.338217,0,0,0,1,0,425.5
2,2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,...,0,0,12.951602,0.338105,0,0,0,1,0,425.5


### Real variables

#### A) Feature descritization

возраст дома от 0 до 100 лет

1 категория - новые дома до 5 лет \
2 категория - дома от 5 до 10 лет \
... 

In [19]:
df['housing_median_age'].describe()

count    20640.000000
mean        28.665746
std         12.355019
min          1.000000
25%         19.000000
50%         29.000000
75%         37.000000
max         52.000000
Name: housing_median_age, dtype: float64

In [20]:
def age_to_cat(X):
    X['age_cat'] = 0
    
    X.loc[X['housing_median_age'] <= 5, 'age_cat'] = 1
    X.loc[(X['housing_median_age'] > 5) & (X['housing_median_age'] <= 10), 'age_cat'] = 2
    X.loc[(X['housing_median_age'] > 10) & (X['housing_median_age'] <= 25), 'age_cat'] = 3
    X.loc[X['housing_median_age'] > 25, 'age_cat'] = 4
    return X

In [21]:
df = age_to_cat(df)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,...,latitude_outlier,bedroom_share,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,median_rooms,age_cat
0,0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,...,0,14.659091,0.365909,0,0,0,1,0,425.5,4
1,1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,...,0,15.579659,0.338217,0,0,0,1,0,425.5,3
2,2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,...,0,12.951602,0.338105,0,0,0,1,0,425.5,4
3,3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,...,0,18.44584,0.437991,0,0,0,1,0,425.5,4
4,4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,...,0,17.209588,0.347265,0,0,0,1,0,425.5,4


#### B) Feature binarization

Только 1 признак \
1 - новый дом \
2 - старый дом

In [23]:
def age_to_binary_cat(X):
    X['age_binary_cat'] = 0
    X.loc[X['housing_median_age'] <= 5, 'age_binary_cat'] = 1
    
    return X

In [31]:
df = age_to_binary_cat(df)
df[df['housing_median_age'] <= 10]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,...,bedroom_share,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,median_rooms,age_cat,age_binary_cat
59,59,59,-122.29,37.82,2.0,2127.0,43.0,94.0,57.0,2.5625,...,2.021627,0.044194,0,0,0,1,0,425.5,1,1
87,87,87,-122.27,37.81,10.0,875.0,348.0,546.0,330.0,0.7600,...,39.771429,0.624000,0,0,0,1,0,425.5,2,0
88,88,88,-122.27,37.80,10.0,2127.0,42.0,125.0,39.0,0.9722,...,1.974612,0.058768,0,0,0,1,0,425.5,2,0
437,437,437,-122.30,37.87,10.0,503.0,118.0,228.0,100.0,2.1705,...,23.459245,0.453280,0,0,0,1,0,425.5,2,0
570,570,570,-122.24,37.72,5.0,2127.0,2885.0,7427.0,2718.0,7.6110,...,135.637047,3.491772,0,0,0,1,0,425.5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20569,20270,20270,-119.18,34.19,5.0,384.0,131.0,410.0,149.0,1.5625,...,34.114583,1.067708,0,0,0,0,1,461.0,1,1
20621,20322,20322,-119.14,34.23,8.0,2127.0,75.0,102.0,80.0,2.5714,...,3.526093,0.047955,0,0,0,0,1,461.0,2,0
20626,20352,20352,-119.09,34.22,8.0,2127.0,10.0,309.0,16.0,4.0208,...,0.470146,0.145275,0,0,0,0,1,461.0,2,0
20628,20378,20378,-118.82,34.15,9.0,655.0,110.0,222.0,109.0,7.8528,...,16.793893,0.338931,0,0,0,0,1,461.0,2,0


In [32]:
df.to_csv('./csv/housing_signs.csv', sep=',')