In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
ds = pd.read_csv('auto_edited.csv', index_col=0) # загружаем данные

In [3]:
ds

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [4]:
ds.dtypes # смотрим на типы данных

symboling              int64
normalized_losses     object
make                  object
fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
 engine_location      object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
bore                  object
stroke                object
compression_ratio    float64
horsepower            object
peak_rpm              object
city_mpg               int64
highway_mpg            int64
price                 object
dtype: object

Очевидно, что некоторые признаки вроде 'horsepower' или 'stroke' должны быть численными, а не категориальными. Скорее всего в этих данных существуют ошибки в заполнении или неверные символы, поэтому приведем эти признаки к числовым, а неверные значения заменим NaN:

In [5]:
ds['normalized_losses'] = pd.to_numeric(ds['normalized_losses'], errors='coerce')
ds['bore'] = pd.to_numeric(ds['bore'], errors='coerce')
ds['stroke'] = pd.to_numeric(ds['stroke'], errors='coerce')
ds['horsepower'] = pd.to_numeric(ds['horsepower'], errors='coerce')
ds['peak_rpm'] = pd.to_numeric(ds['peak_rpm'], errors='coerce')
ds['price'] = pd.to_numeric(ds['price'], errors='coerce')

In [6]:
ds.isna().sum() # пропущенные элементы

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_of_doors          0
body_style            0
drive_wheels          0
 engine_location      0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [20]:
# Заполним пропущенные данные - медианными значениями признака
ds['normalized_losses'] = ds['normalized_losses'].fillna(ds['normalized_losses'].median())
ds['bore'] = ds['bore'].fillna(ds['bore'].median())
ds['stroke'] = ds['stroke'].fillna(ds['stroke'].median())
ds['horsepower'] = ds['horsepower'].fillna(ds['horsepower'].median())
ds['peak_rpm'] = ds['peak_rpm'].fillna(ds['peak_rpm'].median())

ds = ds.drop(np.where(ds['price'].isna())[0]) # удалим образцы с нулевой ценой
ds = ds.reset_index(drop=True) # обновим индексы

In [22]:
print('Кол-во пропущенных значений в датасете: {}'.format(ds.isna().sum().sum()))

Кол-во пропущенных значений: 0


In [23]:
cat_cols = list(ds.select_dtypes(include='O').columns)
print('Категориальные признаки в датасете: \n{}'.format(cat_cols))

Категориальные признаки в датасете: 
['make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style', 'drive_wheels', ' engine_location', 'engine_type', 'num_of_cylinders', 'fuel_system']


Многие алгоритмы машинного обучения (за исключением деревоподобных) могут работать только с численными данными, поэтому часто категориальные входные данные необходимо каким-либо образом перекодировать в численные. Рассмотрим 2 популярных подхода: LabelEncoding и One-Hot-Encoding и их реализацию из библиотеки sklearn.

### LabelEncoding

In [24]:
df = ds.copy()

le = LabelEncoder() 
for col in cat_cols:
    df[col] = le.fit_transform(df[col]) # кодируем категориальные признаки

In [25]:
df # посмотрим как выглядит датафрейм

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,115.0,0,1,0,2,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,115.0,0,1,0,2,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,115.0,0,1,0,2,2,2,0,94.5,...,152,5,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,1,1,0,1,3,1,0,99.8,...,109,5,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,1,1,0,1,3,0,0,99.4,...,136,5,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95.0,21,1,0,1,3,2,0,109.1,...,141,5,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
197,-1,95.0,21,1,1,1,3,2,0,109.1,...,141,5,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
198,-1,95.0,21,1,0,1,3,2,0,109.1,...,173,5,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
199,-1,95.0,21,0,1,1,3,2,0,109.1,...,145,3,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


Для ohe-hot кодирования существуют 2 подхода - класс OneHotEncoder из sklearn и метод get_dummies() из библиотеки pandas:

### OneHotEncoding

In [28]:
df = ds.copy()
ohe = OneHotEncoder()
df_ohe = ohe.fit_transform(df[cat_cols]) # по умолчанию возвращает разреженную матрицу
df_ohe = pd.DataFrame(df_ohe.todense(), columns=ohe.get_feature_names()) # приведем матрицу к плотному виду и создадим датафрейм

df_encoded = pd.concat([df.drop(cat_cols, axis=1), df_ohe], axis=1) # объединяем изначальное числовые и закодированные колонки

In [29]:
df_encoded # посмотрим на закодированный датафрейм

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,x8_twelve,x8_two,x9_1bbl,x9_2bbl,x9_4bbl,x9_idi,x9_mfi,x9_mpfi,x9_spdi,x9_spfi
0,3,115.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,3,115.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,115.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95.0,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
197,-1,95.0,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
198,-1,95.0,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
199,-1,95.0,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Dummy variables

In [33]:
df = ds.copy()
df_dummy = pd.get_dummies(df, dummy_na=False, sparse=False) # применяем dummy-кодирование

In [34]:
df_dummy # посмотрим на преобразованный датасет

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,num_of_cylinders_twelve,num_of_cylinders_two,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,3,115.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
1,3,115.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
2,1,115.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0,0,0,0,0,0,0,1,0,0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95.0,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,...,0,0,0,0,0,0,0,1,0,0
197,-1,95.0,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,...,0,0,0,0,0,0,0,1,0,0
198,-1,95.0,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,...,0,0,0,0,0,0,0,1,0,0
199,-1,95.0,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,...,0,0,0,0,0,1,0,0,0,0
