### Вариант №2.
### Для заданного набора данных проведите обработку пропусков в данных для одного категориального и одного количественного признака. Какие способы обработки пропусков в данных для категориальных и количественных признаков Вы использовали? Какие признаки Вы будете использовать для дальнейшего построения моделей машинного обучения и почему?

In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
%matplotlib inline 
sns.set(style="ticks")

In [4]:
# Load the restaurant inspection dataset from a comma-separated CSV file.
# NOTE(review): the path is an absolute, machine-specific Windows path;
# the notebook will not run elsewhere without adjusting it.
data = pd.read_csv(r'C:\Users\Masha\virtualenvs\tensorflow\data\restaurant-scores-lives-standard.csv', sep=",")

In [5]:
# Dataset size as (rows, columns)
data.shape

(54477, 23)

In [6]:
# Data type of every column
data.dtypes

business_id                    int64
business_name                 object
business_address              object
business_city                 object
business_state                object
business_postal_code          object
business_latitude            float64
business_longitude           float64
business_location             object
business_phone_number        float64
inspection_id                 object
inspection_date               object
inspection_score             float64
inspection_type               object
violation_id                  object
violation_description         object
risk_category                 object
Neighborhoods                float64
Police Districts             float64
Supervisor Districts         float64
Fire Prevention Districts    float64
Zip Codes                    float64
Analysis Neighborhoods       float64
dtype: object

In [7]:
# Check for missing values: number of nulls in each column
data.isnull().sum()

business_id                      0
business_name                    0
business_address                 0
business_city                    0
business_state                   0
business_postal_code          1373
business_latitude            26332
business_longitude           26332
business_location            26332
business_phone_number        37095
inspection_id                    0
inspection_date                  0
inspection_score             14344
inspection_type                  0
violation_id                 13431
violation_description        13431
risk_category                13431
Neighborhoods                26365
Police Districts             26365
Supervisor Districts         26365
Fire Prevention Districts    26411
Zip Codes                    26352
Analysis Neighborhoods       26365
dtype: int64

In [8]:
# First 5 rows of the dataset
data.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,...,inspection_type,violation_id,violation_description,risk_category,Neighborhoods,Police Districts,Supervisor Districts,Fire Prevention Districts,Zip Codes,Analysis Neighborhoods
0,97164,Golden Waffle,244 Clement St,San Francisco,CA,94118,,,,,...,New Ownership,,,,,,,,,
1,69487,Hakkasan San Francisco,1 Kearny St,San Francisco,CA,94108,,,,,...,Routine - Unscheduled,69487_20180418_103119,Inadequate and inaccessible handwashing facili...,Moderate Risk,,,,,,
2,91044,Chopsticks Restaurant,4615 Mission St,San Francisco,CA,94112,,,,,...,Non-inspection site visit,,,,,,,,,
3,85987,Tselogs,552 Jones St,San Francisco,CA,94102,,,,,...,Routine - Unscheduled,85987_20180412_103132,Improper thawing methods,Moderate Risk,,,,,,
4,96024,Fig & Thistle Market,691 14th St,San Francisco,CA,94114,,,,,...,New Ownership - Followup,,,,,,,,,


In [9]:
# Total number of rows; used later as the denominator for missing-value percentages
total_count = len(data.index)
print('Всего строк: {}'.format(total_count))

Всего строк: 54477


# Обработка пропусков в данных
## Обработка пропусков в числовых данных - импьютация

In [10]:
# Select the numeric (float64 / int64) columns that contain missing values.
# For every such column, report its dtype and the count and percentage of
# missing values; the column names are collected in num_cols.
num_cols = []
for col in data.columns:
    dt = str(data[col].dtype)
    # Count nulls directly on the column. The previous form filtered the whole
    # DataFrame (copying all matching rows) only to read the row count.
    temp_null_count = int(data[col].isnull().sum())
    if temp_null_count > 0 and dt in ('float64', 'int64'):
        num_cols.append(col)
        temp_perc = round((temp_null_count / total_count) * 100.0, 2)
        print('Колонка {}. Тип данных {}. Количество пустых значений {}, {}%.'.format(col, dt, temp_null_count, temp_perc))

Колонка business_latitude. Тип данных float64. Количество пустых значений 26332, 48.34%.
Колонка business_longitude. Тип данных float64. Количество пустых значений 26332, 48.34%.
Колонка business_phone_number. Тип данных float64. Количество пустых значений 37095, 68.09%.
Колонка inspection_score. Тип данных float64. Количество пустых значений 14344, 26.33%.
Колонка Neighborhoods. Тип данных float64. Количество пустых значений 26365, 48.4%.
Колонка Police Districts. Тип данных float64. Количество пустых значений 26365, 48.4%.
Колонка Supervisor Districts. Тип данных float64. Количество пустых значений 26365, 48.4%.
Колонка Fire Prevention Districts. Тип данных float64. Количество пустых значений 26411, 48.48%.
Колонка Zip Codes. Тип данных float64. Количество пустых значений 26352, 48.37%.
Колонка Analysis Neighborhoods. Тип данных float64. Количество пустых значений 26365, 48.4%.


In [11]:
# Keep only the numeric columns that contain missing values
data_num = data[num_cols]
data_num

Unnamed: 0,business_latitude,business_longitude,business_phone_number,inspection_score,Neighborhoods,Police Districts,Supervisor Districts,Fire Prevention Districts,Zip Codes,Analysis Neighborhoods
0,,,,,,,,,,
1,,,,88.0,,,,,,
2,,,,,,,,,,
3,,,,94.0,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,1.415046e+10,,,,,,,
7,,,,86.0,,,,,,
8,,,1.415554e+10,96.0,,,,,,
9,,,1.415571e+10,,,,,,,


In [12]:
# Filter rows where the inspection_score field is missing
data[data['inspection_score'].isnull()]

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,...,inspection_type,violation_id,violation_description,risk_category,Neighborhoods,Police Districts,Supervisor Districts,Fire Prevention Districts,Zip Codes,Analysis Neighborhoods
0,97164,Golden Waffle,244 Clement St,San Francisco,CA,94118,,,,,...,New Ownership,,,,,,,,,
2,91044,Chopsticks Restaurant,4615 Mission St,San Francisco,CA,94112,,,,,...,Non-inspection site visit,,,,,,,,,
4,96024,Fig & Thistle Market,691 14th St,San Francisco,CA,94114,,,,,...,New Ownership - Followup,,,,,,,,,
5,97503,Moscone South Main Kitchen,747 Howard St,San Francisco,CA,94103,,,,,...,New Ownership,,,,,,,,,
6,97748,FISTFUL OF TACOS,201 Harrison St Unit C-2,San Francisco,CA,94105,,,,1.415046e+10,...,Reinspection/Followup,,,,,,,,,
9,77442,Gashead Tavern,2351 Mission St,San Francisco,CA,94110,,,,1.415571e+10,...,Reinspection/Followup,,,,,,,,,
10,87020,Cafasquared,2748 Hyde St,San Francisco,CA,94109,,,,,...,Routine - Unscheduled,87020_20160712_103139,Improper food storage,Low Risk,,,,,,
12,94432,Braised + Bread,50 Post St #65A,San Francisco,CA,94104,,,,,...,New Ownership - Followup,,,,,,,,,
13,88090,Hwaro,4516 Mission St,San Francisco,CA,94112,,,,1.415521e+10,...,New Construction,,,,,,,,,
14,94936,94936 Murph's Pub,24 Willie Mays Pl View Section 302,San Francisco,CA,94107,,,,,...,Reinspection/Followup,,,,,,,,,


In [13]:
# Remember the index labels of the rows with a missing inspection_score
flt_index = data[data['inspection_score'].isnull()].index
flt_index

Int64Index([    0,     2,     4,     5,     6,     9,    10,    12,    13,
               14,
            ...
            54454, 54456, 54457, 54463, 54465, 54467, 54469, 54470, 54473,
            54474],
           dtype='int64', length=14344)

In [14]:
# Check that selecting by the saved index returns the expected rows
data[data.index.isin(flt_index)]

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,...,inspection_type,violation_id,violation_description,risk_category,Neighborhoods,Police Districts,Supervisor Districts,Fire Prevention Districts,Zip Codes,Analysis Neighborhoods
0,97164,Golden Waffle,244 Clement St,San Francisco,CA,94118,,,,,...,New Ownership,,,,,,,,,
2,91044,Chopsticks Restaurant,4615 Mission St,San Francisco,CA,94112,,,,,...,Non-inspection site visit,,,,,,,,,
4,96024,Fig & Thistle Market,691 14th St,San Francisco,CA,94114,,,,,...,New Ownership - Followup,,,,,,,,,
5,97503,Moscone South Main Kitchen,747 Howard St,San Francisco,CA,94103,,,,,...,New Ownership,,,,,,,,,
6,97748,FISTFUL OF TACOS,201 Harrison St Unit C-2,San Francisco,CA,94105,,,,1.415046e+10,...,Reinspection/Followup,,,,,,,,,
9,77442,Gashead Tavern,2351 Mission St,San Francisco,CA,94110,,,,1.415571e+10,...,Reinspection/Followup,,,,,,,,,
10,87020,Cafasquared,2748 Hyde St,San Francisco,CA,94109,,,,,...,Routine - Unscheduled,87020_20160712_103139,Improper food storage,Low Risk,,,,,,
12,94432,Braised + Bread,50 Post St #65A,San Francisco,CA,94104,,,,,...,New Ownership - Followup,,,,,,,,,
13,88090,Hwaro,4516 Mission St,San Francisco,CA,94112,,,,1.415521e+10,...,New Construction,,,,,,,,,
14,94936,94936 Murph's Pub,24 Willie Mays Pl View Section 302,San Francisco,CA,94107,,,,,...,Reinspection/Followup,,,,,,,,,


In [15]:
# Same row selection, restricted to the inspection_score column
data_num[data_num.index.isin(flt_index)]['inspection_score']

0       NaN
2       NaN
4       NaN
5       NaN
6       NaN
9       NaN
10      NaN
12      NaN
13      NaN
14      NaN
16      NaN
17      NaN
19      NaN
20      NaN
22      NaN
23      NaN
24      NaN
25      NaN
26      NaN
27      NaN
28      NaN
32      NaN
34      NaN
35      NaN
36      NaN
37      NaN
38      NaN
39      NaN
40      NaN
41      NaN
         ..
54418   NaN
54419   NaN
54420   NaN
54422   NaN
54423   NaN
54427   NaN
54428   NaN
54430   NaN
54431   NaN
54432   NaN
54433   NaN
54434   NaN
54435   NaN
54436   NaN
54437   NaN
54439   NaN
54441   NaN
54442   NaN
54444   NaN
54451   NaN
54454   NaN
54456   NaN
54457   NaN
54463   NaN
54465   NaN
54467   NaN
54469   NaN
54470   NaN
54473   NaN
54474   NaN
Name: inspection_score, Length: 14344, dtype: float64

In [16]:
# One-column DataFrame (note the double brackets) holding inspection_score;
# this is the input given to MissingIndicator and SimpleImputer.
data_num_ins_sc = data_num[['inspection_score']]
data_num_ins_sc.head()

Unnamed: 0,inspection_score
0,
1,88.0
2,
3,94.0
4,


In [18]:
# Mask used to check how the missing values get filled:
# True where inspection_score was missing in the original data.
indicator = MissingIndicator()
mask_missing_values_only = indicator.fit_transform(data_num_ins_sc)
mask_missing_values_only

array([[ True],
       [False],
       [ True],
       ...,
       [ True],
       [False],
       [False]])

In [19]:
# SimpleImputer strategies to compare for the numeric feature
strategies=['mean', 'median','most_frequent']

In [21]:
def test_num_impute(strategy_param):
    """Impute ``data_num_ins_sc`` and return only the values that were filled in.

    Relies on the notebook-level names ``data_num_ins_sc`` (the data to
    impute) and ``mask_missing_values_only`` (boolean mask of the originally
    missing entries).

    Args:
        strategy_param: value passed to ``SimpleImputer(strategy=...)``.

    Returns:
        The imputed values at the positions selected by
        ``mask_missing_values_only``.
    """
    return (
        SimpleImputer(strategy=strategy_param)
        .fit_transform(data_num_ins_sc)[mask_missing_values_only]
    )

In [22]:
# Values filled in by the 'mean' strategy
strategies[0], test_num_impute(strategies[0])

('mean', array([86.03558169, 86.03558169, 86.03558169, ..., 86.03558169,
        86.03558169, 86.03558169]))

In [23]:
# Values filled in by the 'median' strategy
strategies[1], test_num_impute(strategies[1])

('median', array([87., 87., 87., ..., 87., 87., 87.]))

In [24]:
# Values filled in by the 'most_frequent' strategy
strategies[2], test_num_impute(strategies[2])

('most_frequent', array([90., 90., 90., ..., 90., 90., 90.]))

## Обработка пропусков в категориальных данных

In [25]:
# Select the categorical (object dtype) columns that contain missing values.
# For every such column, report its dtype and the count and percentage of
# missing values; the column names are collected in cat_cols.
cat_cols = []
for col in data.columns:
    dt = str(data[col].dtype)
    # Count nulls directly on the column. The previous form filtered the whole
    # DataFrame (copying all matching rows) only to read the row count.
    temp_null_count = int(data[col].isnull().sum())
    if temp_null_count > 0 and dt == 'object':
        cat_cols.append(col)
        temp_perc = round((temp_null_count / total_count) * 100.0, 2)
        print('Колонка {}. Тип данных {}. Количество пустых значений {}, {}%.'.format(col, dt, temp_null_count, temp_perc))

Колонка business_postal_code. Тип данных object. Количество пустых значений 1373, 2.52%.
Колонка business_location. Тип данных object. Количество пустых значений 26332, 48.34%.
Колонка violation_id. Тип данных object. Количество пустых значений 13431, 24.65%.
Колонка violation_description. Тип данных object. Количество пустых значений 13431, 24.65%.
Колонка risk_category. Тип данных object. Количество пустых значений 13431, 24.65%.


In [26]:
# One-column DataFrame with the categorical feature chosen for imputation
cat_temp_data = data[['business_postal_code']]
cat_temp_data.head()

Unnamed: 0,business_postal_code
0,94118
1,94108
2,94112
3,94102
4,94114


In [28]:
# Distinct values of the column (NaN included)
# NOTE(review): the output contains entries that are not plain 5-digit codes
# ('CA', 'Ca', '00000', '941', ZIP+4 forms) — consider normalising them.
cat_temp_data['business_postal_code'].unique()

array(['94118', '94108', '94112', '94102', '94114', '94103', '94105',
       '94107', '94110', '94109', '94104', nan, '94115', '94133', '94121',
       '94124', '94122', '94117', '94111', '94127', '94131', '95133',
       '94132', '94102-5917', '94123', '94134', '95109', '94116', '94158',
       '94901', '94602', '94130', '94080', '94143', '94188', '64110',
       '94402', '94301', '00000', '94101', '95122', '94123-3106', '94621',
       '94124-1917', '94129', '941033148', '95112', '94544', '94013',
       '95132', '92672', '941', '941102019', 'CA', '94120', 'Ca', '94014',
       '95105', '95117'], dtype=object)

In [29]:
# Shape of the subset of rows where business_postal_code is missing
cat_temp_data[cat_temp_data['business_postal_code'].isnull()].shape

(1373, 1)

In [30]:
# Impute missing values with the most frequent value of the column
imp2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_imp2 = imp2.fit_transform(cat_temp_data)
data_imp2

array([['94118'],
       ['94108'],
       ['94112'],
       ...,
       ['94110'],
       ['94123'],
       ['94109']], dtype=object)

In [31]:
# No missing values remain after imputation
np.unique(data_imp2)

array(['00000', '64110', '92672', '94013', '94014', '94080', '941',
       '94101', '94102', '94102-5917', '94103', '941033148', '94104',
       '94105', '94107', '94108', '94109', '94110', '941102019', '94111',
       '94112', '94114', '94115', '94116', '94117', '94118', '94120',
       '94121', '94122', '94123', '94123-3106', '94124', '94124-1917',
       '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158', '94188', '94301', '94402', '94544', '94602',
       '94621', '94901', '95105', '95109', '95112', '95117', '95122',
       '95132', '95133', 'CA', 'Ca'], dtype=object)