# Базовые понятия статистики

Задание 1. Базовое изучение
Изучить представленный набор данных на основе описания его столбцов и выбрать 8 столбцов для дальнейшего изучения (среди них должны быть как числовые, так и категориальные). Провести расчет базовых метрик для них, кратко описать результаты.

Задание 2. Работа с выбросами
В выбранных числовых столбцах найти выбросы, выдвинуть гипотезы об их причинах и проинтерпретировать результаты. Принять и обосновать решение о дальнейшей работе с ними.

Задание 3. Работа с пропусками
Рассчитать количество пропусков для всех выбранных столбцов. Принять и обосновать решение о методе работы с пропусками по каждому столбцу, сформировать датафрейм, в котором пропуски будут отсутствовать.

In [46]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.options.mode.chained_assignment = None  # default='warn'

df = pd.read_csv("horse_data.csv", 
            na_values='?',
            names = [
                "surgery", 
                "age", 
                "hospital_number", 
                "rectal_temperature", 
                "pulse",
                "respiratory_rate",
                "temperature_of_extremities", 
                "peripheral_pulse",
                "mucous_membranes",
                "capillary_refill_time",
                "pain",
                "peristalsis",
                "abdominal_distension",
                "nasogastric_tube",
                "nasogastric_reflux",
                "nasogastric_reflux_PH",
                "rectal_examination_feces",
                "abdomen",
                "packed_cell_volume",
                "total_protein",
                "abdominocentesis_appearance",
                "abdomcentesis_total_protein",
                "outcome",
                "surgical_lesion",
                "lesion_1",
                "lesion_2",
                "lesion_3",
                "cp_data",
            ])

df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [47]:
# пропуски - %
for column in df.columns:
    res = df[column].isnull().mean()
    print(f'{column} - {res : 0.1%}')

surgery -  0.3%
age -  0.0%
hospital_number -  0.0%
rectal_temperature -  20.0%
pulse -  8.0%
respiratory_rate -  19.3%
temperature_of_extremities -  18.7%
peripheral_pulse -  23.0%
mucous_membranes -  15.7%
capillary_refill_time -  10.7%
pain -  18.3%
peristalsis -  14.7%
abdominal_distension -  18.7%
nasogastric_tube -  34.7%
nasogastric_reflux -  35.3%
nasogastric_reflux_PH -  82.3%
rectal_examination_feces -  34.0%
abdomen -  39.3%
packed_cell_volume -  9.7%
total_protein -  11.0%
abdominocentesis_appearance -  55.0%
abdomcentesis_total_protein -  66.0%
outcome -  0.3%
surgical_lesion -  0.0%
lesion_1 -  0.0%
lesion_2 -  0.0%
lesion_3 -  0.0%
cp_data -  0.0%


In [48]:
#базовые метрики для количественных значений
df[['rectal_temperature', 'pulse', 'respiratory_rate']].describe()

Unnamed: 0,rectal_temperature,pulse,respiratory_rate
count,240.0,276.0,242.0
mean,38.167917,71.913043,30.417355
std,0.732289,28.630557,17.642231
min,35.4,30.0,8.0
25%,37.8,48.0,18.5
50%,38.2,64.0,24.5
75%,38.5,88.0,36.0
max,40.8,184.0,96.0


In [49]:
#Посмотрим выбросы по ключу rectal_temperature

quantile1 = df.rectal_temperature.quantile(0.25)
quantile3 = df.rectal_temperature.quantile(0.75)
iqr = quantile3 - quantile1
lower = quantile1 - (1.5 * iqr) 
upper = quantile3 + (1.5 * iqr)

df[(df.rectal_temperature < lower) | (df.rectal_temperature > upper)].sort_values('rectal_temperature')

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
44,1.0,1,535407,35.4,140.0,24.0,3.0,3.0,4.0,2.0,...,57.0,69.0,3.0,2.0,3.0,1,3205,0,0,2
141,2.0,1,522979,36.0,42.0,30.0,,,5.0,1.0,...,64.0,6.8,,,2.0,2,1400,0,0,2
238,2.0,1,528702,36.1,88.0,,3.0,3.0,3.0,1.0,...,45.0,7.0,3.0,4.8,3.0,1,2209,0,0,1
80,1.0,1,527518,36.4,98.0,35.0,3.0,3.0,4.0,1.0,...,47.0,6.4,3.0,3.6,2.0,1,2205,0,0,1
118,1.0,1,533983,36.5,78.0,30.0,1.0,,1.0,1.0,...,34.0,75.0,2.0,1.0,1.0,1,3112,6112,0,2
298,1.0,1,530612,36.5,100.0,24.0,3.0,3.0,3.0,1.0,...,50.0,6.0,3.0,3.4,1.0,1,2208,0,0,1
251,2.0,1,527940,36.6,42.0,18.0,3.0,3.0,2.0,1.0,...,52.0,7.1,,,2.0,1,5111,0,0,2
99,2.0,1,530002,39.6,108.0,51.0,3.0,3.0,6.0,2.0,...,59.0,8.0,2.0,2.6,1.0,2,4300,0,0,1
75,1.0,9,534092,39.7,100.0,,3.0,3.0,5.0,2.0,...,48.0,57.0,2.0,2.0,3.0,1,1400,0,0,2
20,1.0,1,530157,39.9,72.0,60.0,1.0,1.0,5.0,2.0,...,46.0,6.1,2.0,,1.0,1,2111,0,0,2


In [50]:
#Посмотрим выбросы по ключу pulse

quantile1 = df.pulse.quantile(0.25)
quantile3 = df.pulse.quantile(0.75)
iqr = quantile3 - quantile1
lower = quantile1 - (1.5 * iqr) 
upper = quantile3 + (1.5 * iqr)

df[df.pulse.between(lower, upper, inclusive=True)]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2
6,1.0,1,526802,37.9,48.0,16.0,1.0,1.0,1.0,1.0,...,37.0,7.0,,,1.0,1,3124,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1.0,1,533886,,120.0,70.0,4.0,,4.0,2.0,...,55.0,65.0,,,3.0,2,3205,0,0,2
296,2.0,1,527702,37.2,72.0,24.0,3.0,2.0,4.0,2.0,...,44.0,,3.0,3.3,3.0,1,2208,0,0,1
297,1.0,1,529386,37.5,72.0,30.0,4.0,3.0,4.0,1.0,...,60.0,6.8,,,2.0,1,3205,0,0,2
298,1.0,1,530612,36.5,100.0,24.0,3.0,3.0,3.0,1.0,...,50.0,6.0,3.0,3.4,1.0,1,2208,0,0,1


In [51]:
#surgery?
#1 = Yes, it had surgery
#2 = It was treated without surgery

In [52]:
# заменим на категорийное значение
df.loc[df['surgery'] == 1, 'surgery'] = 'yes'
df.loc[df['surgery'] == 2, 'surgery'] = 'no'


df[df['surgery'].isna()]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
132,,1,534572,38.0,48.0,20.0,3.0,1.0,3.0,1.0,...,43.0,73.0,2.0,1.0,,1,3111,0,0,2


In [53]:
# судя по lesion_1 операция была, проставим пропущенное значение
df['surgery'].iloc[132] = 'yes'

df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,yes,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,no,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,yes,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,no,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [54]:
#Age
#1 = Adult horse
#2 = Young( < 6 months)

#Вероятно, число 9 было указано вместо 2, при замене на категорийное значение учтем это
df.loc[df['age'] == 1, 'age'] = 'adult'
df.loc[df['age'] == 9, 'age'] = 'young'

df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,yes,adult,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,no,adult,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,yes,young,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,no,adult,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [55]:
#пропуски отсутствуют
df[df['age'].isna()]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data


In [56]:
#hospital_number
# - numeric id
# - the case number assigned to the horse(may not be unique if the horse is treated > 1 time)

#категориальная величина, номер дела в госпитале
#пропуски отсутствуют
df[df['hospital_number'].isna()]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data


In [57]:
#rectal_temperature
# - linear
# - in degrees celsius.
# - An elevated temp may occur due to infection.
# - temperature may be reduced when the animal is in late shock
# - normal temp is 37.8
# - this parameter will usually change as the problem progresses eg.may start out normal, then become elevated because of the lesion, passing back through the normal range as the horse goes into shock

# процент пропусков
print(round(len(df.loc[df['rectal_temperature'].isna()]) / len(df.index) * 100, 1), "%")

20.0 %


In [58]:
#неприрывная величина, температура тела лошади
df[df['rectal_temperature'].isna()]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
5,no,adult,528355,,,,2.0,1.0,3.0,1.0,...,,,,,1.0,2,0,0,0,2
7,yes,adult,529607,,60.0,,3.0,,,1.0,...,44.0,8.3,,,2.0,1,2208,0,0,2
8,no,adult,530051,,80.0,36.0,3.0,4.0,3.0,1.0,...,38.0,6.2,,,3.0,1,3205,0,0,2
16,yes,young,5301219,,128.0,36.0,3.0,3.0,4.0,2.0,...,53.0,7.8,3.0,4.7,2.0,2,1400,0,0,1
28,yes,adult,5279442,,,,,,,,...,,,,,2.0,2,4300,0,0,2
34,yes,adult,529796,,100.0,30.0,3.0,3.0,4.0,2.0,...,52.0,6.6,,,1.0,1,1124,0,0,2
35,no,adult,528812,,104.0,24.0,4.0,3.0,3.0,2.0,...,73.0,8.4,,,3.0,1,7111,0,0,2
40,yes,adult,529498,,88.0,,3.0,3.0,6.0,2.0,...,63.0,6.5,3.0,,2.0,1,4205,0,0,2
43,yes,adult,534069,,120.0,,3.0,4.0,4.0,1.0,...,52.0,67.0,2.0,2.0,3.0,1,3205,0,0,2
45,no,adult,529827,,120.0,,4.0,3.0,4.0,2.0,...,60.0,6.5,3.0,,2.0,1,3205,0,0,2


In [59]:
#пропуски заполним медианой
df['rectal_temperature'].fillna(df.groupby(['age', 'surgery'])['rectal_temperature'].transform('median'), inplace = True)
df.head(20)

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,yes,adult,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,no,adult,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,yes,young,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,no,adult,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2
5,no,adult,528355,38.2,,,2.0,1.0,3.0,1.0,...,,,,,1.0,2,0,0,0,2
6,yes,adult,526802,37.9,48.0,16.0,1.0,1.0,1.0,1.0,...,37.0,7.0,,,1.0,1,3124,0,0,2
7,yes,adult,529607,38.1,60.0,,3.0,,,1.0,...,44.0,8.3,,,2.0,1,2208,0,0,2
8,no,adult,530051,38.2,80.0,36.0,3.0,4.0,3.0,1.0,...,38.0,6.2,,,3.0,1,3205,0,0,2
9,no,young,5299629,38.3,90.0,,1.0,,1.0,1.0,...,40.0,6.2,1.0,2.2,1.0,2,0,0,0,1


In [60]:
#Посмотрим выбросы
quantile1 = df.rectal_temperature.quantile(0.25)
quantile3 = df.rectal_temperature.quantile(0.75)
iqr = quantile3 - quantile1
lower = quantile1 - (1.5 * iqr) 
upper = quantile3 + (1.5 * iqr)

df[(df.rectal_temperature < lower) | (df.rectal_temperature > upper)].sort_values('rectal_temperature')

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
44,yes,adult,535407,35.4,140.0,24.0,3.0,3.0,4.0,2.0,...,57.0,69.0,3.0,2.0,3.0,1,3205,0,0,2
141,no,adult,522979,36.0,42.0,30.0,,,5.0,1.0,...,64.0,6.8,,,2.0,2,1400,0,0,2
238,no,adult,528702,36.1,88.0,,3.0,3.0,3.0,1.0,...,45.0,7.0,3.0,4.8,3.0,1,2209,0,0,1
80,yes,adult,527518,36.4,98.0,35.0,3.0,3.0,4.0,1.0,...,47.0,6.4,3.0,3.6,2.0,1,2205,0,0,1
118,yes,adult,533983,36.5,78.0,30.0,1.0,,1.0,1.0,...,34.0,75.0,2.0,1.0,1.0,1,3112,6112,0,2
298,yes,adult,530612,36.5,100.0,24.0,3.0,3.0,3.0,1.0,...,50.0,6.0,3.0,3.4,1.0,1,2208,0,0,1
251,no,adult,527940,36.6,42.0,18.0,3.0,3.0,2.0,1.0,...,52.0,7.1,,,2.0,1,5111,0,0,2
113,yes,adult,527933,36.8,60.0,28.0,,,,,...,,,,10.0,2.0,1,3205,0,0,1
277,no,adult,528620,36.9,50.0,40.0,2.0,3.0,3.0,1.0,...,37.5,6.5,,,1.0,2,3111,0,0,2
117,no,adult,5290482,39.5,,,3.0,3.0,4.0,2.0,...,,6.7,1.0,,3.0,1,4205,0,0,2


In [61]:
#pulse
# - linear
# - the heart rate in beats per minute
# - is a reflection of the heart condition: 30 - 40 is normal for adults
# - rare to have a lower than normal rate although athletic horses may have a rate of 20 - 25
# - animals with painful lesions or suffering from circulatory shock may have an elevated heart rate

#неприрывная величина, значение пульса лошади

# процент пропусков
print(round(len(df.loc[df['pulse'].isna()]) / len(df.index) * 100, 1), "%")

8.0 %


In [62]:
df.loc[df['pulse'].isna()]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
5,no,adult,528355,38.2,,,2.0,1.0,3.0,1.0,...,,,,,1.0,2,0,0,0,2
28,yes,adult,5279442,38.1,,,,,,,...,,,,,2.0,2,4300,0,0,2
52,no,adult,529483,38.2,,,1.0,1.0,3.0,1.0,...,43.0,7.7,,,1.0,2,3111,0,0,2
56,yes,adult,528872,38.1,,,,,,,...,24.0,6.7,,,1.0,1,3112,0,0,2
58,yes,adult,528298,38.1,,20.0,4.0,3.0,3.0,,...,53.0,5.9,3.0,,2.0,1,3205,0,0,1
74,yes,young,5292929,38.6,,,,,,,...,37.0,4.9,,,2.0,1,11124,0,0,2
78,yes,adult,530693,38.1,,,3.0,3.0,3.0,3.0,...,46.0,5.9,,,2.0,1,3025,0,0,2
83,yes,adult,5279822,38.0,,24.0,3.0,3.0,6.0,2.0,...,68.0,7.8,,,2.0,1,3205,0,0,2
93,no,adult,530310,38.2,,,3.0,3.0,1.0,1.0,...,38.0,6.5,,,2.0,1,3205,0,0,2
115,no,adult,533723,38.2,,40.0,3.0,1.0,1.0,1.0,...,45.0,70.0,,,1.0,2,0,0,0,2


In [63]:
df['pulse'].fillna(df.groupby(['age', 'surgery'])['pulse'].transform('median'), inplace = True)
df['pulse']

0       66.0
1       88.0
2       40.0
3      164.0
4      104.0
       ...  
295    120.0
296     72.0
297     72.0
298    100.0
299     40.0
Name: pulse, Length: 300, dtype: float64

In [64]:
#respiratory_rate
# - linear
# - normal rate is 8 to 10
# - usefulness is doubtful due to the great fluctuations

#неприрывная величина, частота дыхания лошади
df.loc[df['respiratory_rate'].isna()]

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
5,no,adult,528355,38.2,54.0,,2.0,1.0,3.0,1.0,...,,,,,1.0,2,0,0,0,2
7,yes,adult,529607,38.1,60.0,,3.0,,,1.0,...,44.0,8.3,,,2.0,1,2208,0,0,2
9,no,young,5299629,38.3,90.0,,1.0,,1.0,1.0,...,40.0,6.2,1.0,2.2,1.0,2,0,0,0,1
28,yes,adult,5279442,38.1,66.0,,,,,,...,,,,,2.0,2,4300,0,0,2
29,no,adult,535415,37.7,48.0,,2.0,1.0,1.0,1.0,...,45.0,76.0,,,1.0,2,0,0,0,2
32,yes,adult,529427,37.2,60.0,,2.0,1.0,1.0,1.0,...,43.0,6.6,,,1.0,1,2209,0,0,2
37,yes,adult,533847,37.8,72.0,,,3.0,,1.0,...,56.0,80.0,1.0,2.0,1.0,1,3111,0,0,2
38,no,adult,528996,38.6,52.0,,1.0,1.0,1.0,1.0,...,32.0,6.6,1.0,5.0,1.0,2,0,0,0,1
40,yes,adult,529498,38.1,88.0,,3.0,3.0,6.0,2.0,...,63.0,6.5,3.0,,2.0,1,4205,0,0,2
43,yes,adult,534069,38.1,120.0,,3.0,4.0,4.0,1.0,...,52.0,67.0,2.0,2.0,3.0,1,3205,0,0,2


In [65]:
# процент пропусков
print(round(len(df.loc[df['respiratory_rate'].isna()]) / len(df.index) * 100, 1), "%")

19.3 %


In [66]:
df['respiratory_rate'].fillna(df.groupby(['age', 'surgery'])['respiratory_rate'].transform('median'), inplace = True)
df['respiratory_rate']

0      28.0
1      20.0
2      24.0
3      84.0
4      35.0
       ... 
295    70.0
296    24.0
297    30.0
298    24.0
299    20.0
Name: respiratory_rate, Length: 300, dtype: float64

In [67]:
#temperature_of_extremities
# - a subjective indication of peripheral circulation
# - possible values:
#   1 = Normal 
#   2 = Warm 
#   3 = Cool 
#   4 = Cold

#cool to cold extremities indicate possible shock
#hot extremities should correlate with an elevated rectal temp.


# процент пропусков
print(round(len(df.loc[df['temperature_of_extremities'].isna()]) / len(df.index) * 100, 1), "%")

18.7 %


In [68]:
#заменим пропуски на самое частое значение
df['temperature_of_extremities'].fillna(df.groupby(['age', 'surgery'])['temperature_of_extremities'].transform('max'), inplace = True)
df['temperature_of_extremities']

0      3.0
1      4.0
2      1.0
3      4.0
4      4.0
      ... 
295    4.0
296    3.0
297    4.0
298    3.0
299    4.0
Name: temperature_of_extremities, Length: 300, dtype: float64

In [69]:
# заменим на категорийное значение
df.loc[df['temperature_of_extremities'] == 1, 'temperature_of_extremities'] = 'Normal'
df.loc[df['temperature_of_extremities'] == 2, 'temperature_of_extremities'] = 'Warm'
df.loc[df['temperature_of_extremities'] == 3, 'temperature_of_extremities'] = 'Cool'
df.loc[df['temperature_of_extremities'] == 4, 'temperature_of_extremities'] = 'Cold'
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,Cool,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,yes,adult,534817,39.2,88.0,20.0,Cold,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,no,adult,530334,38.3,40.0,24.0,Normal,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,yes,young,5290409,39.1,164.0,84.0,Cold,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,no,adult,530255,37.3,104.0,35.0,Cold,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [70]:
#peripheral_pulse
# - subjective
# - possible values are:
#   1 = normal 
#   2 = increased 
#   3 = reduced 
#   4 = absent
#normal or increased p.p.are indicative of adequate circulation while reduced or absent indicate poor perfusion

# процент пропусков
print(round(len(df.loc[df['peripheral_pulse'].isna()]) / len(df.index) * 100, 1), "%")

23.0 %


In [71]:
#заменим пропуски на самое частое значение
df['peripheral_pulse'].fillna(df.groupby(['age', 'surgery'])['peripheral_pulse'].transform('max'), inplace = True)
df['peripheral_pulse']

0      3.0
1      4.0
2      1.0
3      1.0
4      4.0
      ... 
295    4.0
296    2.0
297    3.0
298    3.0
299    4.0
Name: peripheral_pulse, Length: 300, dtype: float64

In [72]:
# заменим на категорийное значение
df.loc[df['peripheral_pulse'] == 1, 'peripheral_pulse'] = 'normal'
df.loc[df['peripheral_pulse'] == 2, 'peripheral_pulse'] = 'increased'
df.loc[df['peripheral_pulse'] == 3, 'peripheral_pulse'] = 'reduced'
df.loc[df['peripheral_pulse'] == 4, 'peripheral_pulse'] = 'absent'

df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,Cool,reduced,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,yes,adult,534817,39.2,88.0,20.0,Cold,absent,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,no,adult,530334,38.3,40.0,24.0,Normal,normal,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,yes,young,5290409,39.1,164.0,84.0,Cold,normal,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,no,adult,530255,37.3,104.0,35.0,Cold,absent,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [73]:
# по результату обработки первых 8ми столбцов, пропуски отсутствуют
for col in df.columns:
    res = df[col].isnull().mean()
    print(f'{col} - {res : 0.1%}')

surgery -  0.0%
age -  0.0%
hospital_number -  0.0%
rectal_temperature -  0.0%
pulse -  0.0%
respiratory_rate -  0.0%
temperature_of_extremities -  0.0%
peripheral_pulse -  0.0%
mucous_membranes -  15.7%
capillary_refill_time -  10.7%
pain -  18.3%
peristalsis -  14.7%
abdominal_distension -  18.7%
nasogastric_tube -  34.7%
nasogastric_reflux -  35.3%
nasogastric_reflux_PH -  82.3%
rectal_examination_feces -  34.0%
abdomen -  39.3%
packed_cell_volume -  9.7%
total_protein -  11.0%
abdominocentesis_appearance -  55.0%
abdomcentesis_total_protein -  66.0%
outcome -  0.3%
surgical_lesion -  0.0%
lesion_1 -  0.0%
lesion_2 -  0.0%
lesion_3 -  0.0%
cp_data -  0.0%
