In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('aac_shelter_outcomes.csv')

In [None]:
data.head()

Обработаем 3 столбца с данными о дате, выделив из них год, месяц и день

In [3]:
data_date_of_birth = data['date_of_birth'].str.extract(r'(\d{4})-(\d{2})-(\d{2})').astype(int)
data_datetime = data['datetime'].str.extract(r'(\d{4})-(\d{2})-(\d{2})').astype(int)
data_monthyear = data['monthyear'].str.extract(r'(\d{4})-(\d{2})-(\d{2})').astype(int)

Есть подозрение, что datetime и monthyear заполнены одинаковыми данными. Проверим это

In [4]:
(data_datetime - data_monthyear).sum()

0    0
1    0
2    0
dtype: int64

Значит один из столбцов, например, monthyear, можно будет удалить

In [5]:
data_date_of_birth.columns = ['Year_of_birth', 'Month_of_birth', 'Day_of_birth']
data_datetime.columns = ['Year_of_outc', 'Month_of_outc', 'Day_of_outc']
result_data_date = pd.concat([data_date_of_birth, data_datetime], axis=1)

для каждой строки определим возвраст в неделях

In [6]:
data['age_in_weeks'] = (result_data_date.Year_of_outc * 52 + result_data_date.Month_of_outc * 4 + result_data_date.Day_of_outc // 7) - (result_data_date.Year_of_birth * 52 + result_data_date.Month_of_birth * 4 + result_data_date.Day_of_birth // 7) 

Удалим столбцы, которые уже использовали и которые нам не нужны

In [7]:
data.drop(['age_upon_outcome', 'date_of_birth', 'animal_id',
          'datetime', 'monthyear'], axis=1, inplace=True)

Отработаем данные Имени питомца

In [8]:
#Заполним NaN значением None 
data.name.fillna('None', inplace = True)
#Закодируем наличие  имени через 1 и отсутствие имени через 0
data['name_is_present'] = np.where(data.name == 'None', 0, 1)
del data['name']

Закодируем animal_type через OneHotEncoder

In [9]:
ohe = OneHotEncoder()

ohe_at = ohe.fit_transform(data[['animal_type']]).toarray()
a_type = pd.DataFrame(ohe_at, index=data[['animal_type']].index, columns=[str('atype_') + str(x) for x in ohe.categories_[0]])

In [10]:
data = pd.concat([data, a_type], axis=1)
del data['animal_type']

Заполним NaN sex_upon_outcome через Моду

In [11]:
data.sex_upon_outcome.unique()

array(['Intact Male', 'Spayed Female', 'Neutered Male', 'Unknown',
       'Intact Female', nan], dtype=object)

In [12]:
data.sex_upon_outcome.fillna(data.sex_upon_outcome.mode()[0], inplace = True)

Закодируем sex_upon_outcome через OneHotEncoder

In [13]:
ohe_sex = ohe.fit_transform(data[['sex_upon_outcome']]).toarray()
sex_type = pd.DataFrame(ohe_sex, index=data[['sex_upon_outcome']].index, columns=[str('sex_type_') + str(x) for x in ohe.categories_[0]])

In [14]:
data = pd.concat([data, sex_type], axis=1)
del data['sex_upon_outcome']

Отработаем цвет

Посмотрим количество уникальных значений

In [15]:
data.color.nunique()

525

Попробуем уменьшить количество варинатов для OneHotEncode путем выделения основного и вторичного цветов, а таже выделения признака "полосатости" питомца

In [16]:
#Выделим основной цвет, наличие полосатости и вторичный цвет
colors = data['color'].str.extract(r'(\w*)[ ]?(\w*)[/]?(\w*)')
colors = colors.rename(columns = {colors.columns[0] : 'Main', colors.columns[1] : 'Tabby', colors.columns[2] : 'Second'})
#При отсутствии вторичного цвета поставим None
colors['Second'] = np.where(colors['Second'] == '', 'None', colors['Second'])
#Закодируем наличие полосатости через 1
colors['Tabby'] = np.where(colors['Tabby'] == '', 0, 1)
#Закодируем первичный цвет через OneHotEncoder
ohe_main = ohe.fit_transform(colors[['Main']]).toarray()
main_out = pd.DataFrame(ohe_main, index=colors[['Main']].index, columns=[str('Mcolor_') + str(x) for x in ohe.categories_[0]])
#Закодируем вторичный цвет через OneHotEncoder
ohe_second = ohe.fit_transform(colors[['Second']]).toarray()
second_out = pd.DataFrame(ohe_second, index=colors[['Second']].index, columns=[str('Scolor_') + str(x) for x in ohe.categories_[0]])
#объединим все в один датафрейм и удалим лишниые столбцы
colors_full = pd.concat([colors, main_out, second_out], axis=1)
del colors_full['Main']
del colors_full['Second']

размерность матрицы, кодирующей цвет, значительно снизилась

In [17]:
colors_full.shape

(78256, 59)

Добавим данные о цвете в общий датасет

In [18]:
data = pd.concat([data, colors_full], axis=1)
del data['color']

Отработаем породу.
Попробуем уменьшить количество варинатов для OneHotEncode путем 
выделения факта смешенной породы и пород, участвующих в смешении

Сначала выделим признак Mix и смешения пророды через указание  "/"

In [19]:
data['breed_mix2'] = np.where(data.breed.str.contains('/'), 1, 0)
data['breed_mix3'] = np.where(data.breed.str.contains('Mix'), 1, 0)
data['breed_mix'] = np.abs(data['breed_mix2'] - data['breed_mix3'])
data.drop(['breed_mix2', 'breed_mix3'], axis=1, inplace=True)

In [20]:
data['breed1'] = data.breed.str.replace(' Mix', '')

Выделим основную  и вторичную породу

In [21]:
breeds  = data['breed1'].str.extract(r'(\w*[ ]?\w*[ ]?\w*)[/]?(\w*[ ]?\w*)')
breeds = breeds.rename(columns = {breeds.columns[0] : 'Main', breeds.columns[1] : 'Second'})
breeds['Second'] = np.where(breeds['Second'] == '', 'None', breeds['Second'])

Закодируем обе породы через OneHotEncoder

In [22]:
ohe_mbreed = ohe.fit_transform(breeds[['Main']]).toarray()
mbreed = pd.DataFrame(ohe_mbreed, index=breeds[['Main']].index, columns=[str('Mbreed_') + str(x) for x in ohe.categories_[0]])

ohe_sbreed = ohe.fit_transform(breeds[['Second']]).toarray()
sbreed = pd.DataFrame(ohe_sbreed, index=breeds[['Second']].index, columns=[str('Sbreed_') + str(x) for x in ohe.categories_[0]])

Добавим данные о породах в общий датасет

In [23]:
data = pd.concat([data, mbreed, sbreed], axis=1)

Удалим столбцы, данные которых уже отработаны и больше не требуются

In [24]:
data.drop(['breed', 'breed1'], axis=1, inplace=True)

Отработаем outcome_subtype

In [25]:
data.outcome_subtype.isna().sum() / len(data)

0.540444183193621

более половины данных по этому столбцу отсутствует, данных для заполнения нет.

Рассмотрим распредление outcome_subtype по outcome_type

In [26]:
data.groupby('outcome_type').outcome_subtype.value_counts()

outcome_type  outcome_subtype    
Adoption      Foster                  5558
              Offsite                  367
              Barn                       1
Died          In Kennel                335
              In Foster                164
              Enroute                   45
              At Vet                    33
              In Surgery                16
Euthanasia    Suffering               2514
              Rabies Risk             2417
              Aggressive               506
              Medical                  259
              Behavior                 142
              Underage                  28
              At Vet                    26
              Court/Investigation       18
Missing       In Foster                 18
              Possible Theft             9
              In Kennel                  8
Transfer      Partner                19660
              SCRP                    3211
              Snr                      626
              Barn  

In [27]:
data.outcome_type.value_counts()

Adoption           33112
Transfer           23499
Return to Owner    14354
Euthanasia          6080
Died                 680
Disposal             307
Rto-Adopt            150
Missing               46
Relocate              16
Name: outcome_type, dtype: int64

Большинство отсутствующих данных в outcome_subtype относятся к интереующему нас классу Adoption в outcome_type
Из 33К присутствует менее 6К данных
Поэтому есть предлодение удалить этот столбец и не использовать его при обучении модели.

In [28]:
del data['outcome_subtype']

In [29]:
data.shape

(78256, 583)

Обработаем пропуски в целевой переменной. Их 12. В "масштабах" общего количества строк эти строки можно удалить

In [30]:
data[(data.outcome_type.isnull())]

Unnamed: 0,outcome_type,age_in_weeks,name_is_present,atype_Bird,atype_Cat,atype_Dog,atype_Livestock,atype_Other,sex_type_Intact Female,sex_type_Intact Male,...,Sbreed_Treeing Walker,Sbreed_Unknown,Sbreed_Vizsla,Sbreed_Weimaraner,Sbreed_Welsh Terrier,Sbreed_West Highland,Sbreed_Whippet,Sbreed_Wire Hair,Sbreed_Yorkshire,Sbreed_Yorkshire Terrier
10753,,52,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13270,,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13302,,4,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16951,,52,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29214,,52,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64626,,7,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68246,,158,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69847,,7,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72037,,7,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76709,,537,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
data = data[~(data.outcome_type.isnull())]

In [32]:
data.shape

(78244, 583)

In [33]:
data.outcome_type.unique()

array(['Transfer', 'Adoption', 'Euthanasia', 'Return to Owner', 'Died',
       'Disposal', 'Relocate', 'Missing', 'Rto-Adopt'], dtype=object)

Закодиреуем искомую переменную

In [35]:
data.outcome_type = data.outcome_type.map({'Adoption': 0,
                                           'Transfer': 1,
                                           'Return to Owner': 2,
                                           'Euthanasia': 3,
                                           'Died': 4,
                                           'Disposal': 5,
                                           'Rto-Adopt': 6,
                                           'Missing': 7,
                                           'Relocate': 8                                         
                                          })

In [None]:
le = LabelEncoder()
le.fit(data['outcome_type'])
data['outcome_type'] = le.transform(data['outcome_type'])

In [36]:
data.head()

Unnamed: 0,outcome_type,age_in_weeks,name_is_present,atype_Bird,atype_Cat,atype_Dog,atype_Livestock,atype_Other,sex_type_Intact Female,sex_type_Intact Male,...,Sbreed_Treeing Walker,Sbreed_Unknown,Sbreed_Vizsla,Sbreed_Weimaraner,Sbreed_Welsh Terrier,Sbreed_West Highland,Sbreed_Whippet,Sbreed_Wire Hair,Sbreed_Yorkshire,Sbreed_Yorkshire Terrier
0,1,2,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,53,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,60,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,470,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,24,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Подготовим данные для обучения модели

In [37]:
x = data.copy()
y = data['outcome_type']
del x['outcome_type']

In [None]:
x.shape

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [39]:
model_RFC = RandomForestClassifier(n_estimators=10, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)#(n_estimators=10, max_depth=5, n_jobs=-1)
model_RFC.fit(x_train, y_train)
model_RFC.score(x_test, y_test)

0.6670784697963704

In [40]:
y_pred = model_RFC.predict(x_test)

In [42]:
confusion_matrix(y_test, y_pred)

array([[8586,  432,  924,    4,    0,    0,    0,    0,    0],
       [2023, 4179,  756,   33,    0,    0,    0,    0,    0],
       [2064,  302, 1957,    5,    0,    0,    0,    0,    0],
       [ 171,  536,  200,  937,    0,    0,    0,    0,    0],
       [  22,  138,   10,   30,    0,    0,    0,    0,    0],
       [  13,    9,    0,   80,    0,    0,    0,    0,    0],
       [  31,    0,   17,    0,    0,    0,    0,    0,    0],
       [   3,    7,    1,    0,    0,    0,    0,    0,    0],
       [   2,    0,    0,    2,    0,    0,    0,    0,    0]],
      dtype=int64)

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75      9946
           1       0.75      0.60      0.66      6991
           2       0.51      0.45      0.48      4328
           3       0.86      0.51      0.64      1844
           4       0.00      0.00      0.00       200
           5       0.00      0.00      0.00       102
           6       0.00      0.00      0.00        48
           7       0.00      0.00      0.00        11
           8       0.00      0.00      0.00         4

    accuracy                           0.67     23474
   macro avg       0.31      0.27      0.28     23474
weighted avg       0.66      0.67      0.65     23474



  _warn_prf(average, modifier, msg_start, len(result))


F-мера по нужным нам классам составляет 0.75 и 0.66 соответственно, что больше, чем по другим классам. 