In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
%matplotlib inline

In [30]:
df = pd.read_csv("train.csv")
df = df.drop(columns=["OutcomeSubtype","AnimalID"])
target = "OutcomeType"

In [31]:
target_dict = {
    "Adoption": 0,
"Transfer": 1,
"Return_to_owner": 2, 
"Euthanasia":3,
"Died":4
}

In [32]:
df[target] = df[target].map(target_dict)

## EDA

In [33]:
df.head()

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,2,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,3,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,0,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,1,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,1,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [34]:
df.shape

(26729, 8)

In [35]:
(df.isna().sum()/df.shape[0]).sort_values()

DateTime          0.000000
OutcomeType       0.000000
AnimalType        0.000000
Breed             0.000000
Color             0.000000
SexuponOutcome    0.000037
AgeuponOutcome    0.000673
Name              0.287740
dtype: float64

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 8 columns):
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null int64
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: int64(1), object(7)
memory usage: 1.6+ MB


In [37]:
df.nunique()

Name               6374
DateTime          22918
OutcomeType           5
AnimalType            2
SexuponOutcome        5
AgeuponOutcome       44
Breed              1380
Color               366
dtype: int64

In [38]:
for cl in df.columns:
    print(cl)
    print(df[cl].value_counts())
    print("\n")

Name
Max         136
Bella       135
Charlie     107
Daisy       106
Lucy         94
           ... 
Teeny         1
Teeger        1
Patti         1
Saverina      1
Becca         1
Name: Name, Length: 6374, dtype: int64


DateTime
2015-08-11 00:00:00    19
2015-11-17 00:00:00    17
2015-07-02 00:00:00    13
2015-04-02 00:00:00    11
2014-08-31 09:00:00    10
                       ..
2016-01-29 14:53:00     1
2014-08-07 18:50:00     1
2015-12-18 14:44:00     1
2015-02-08 18:16:00     1
2015-07-19 17:30:00     1
Name: DateTime, Length: 22918, dtype: int64


OutcomeType
0    10769
1     9422
2     4786
3     1555
4      197
Name: OutcomeType, dtype: int64


AnimalType
Dog    15595
Cat    11134
Name: AnimalType, dtype: int64


SexuponOutcome
Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: SexuponOutcome, dtype: int64


AgeuponOutcome
1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281

## preprocessing

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, y_train, y_test = \
train_test_split(df.drop(columns=target),df[target], 
                 test_size=0.3, random_state=42, stratify=df[target])

In [41]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import TransformerMixin

In [42]:
class SimpleImputerDf(SimpleImputer):
    """DataFrame Wrapper around SimpleImputer"""
    def __init__(self, strategy):
        super(SimpleImputerDf, self).__init__(strategy=strategy)

    def fit(self, X, y=None):
        if len(X.values.shape)==1:
            vals = X.values.reshape(-1,1)
        else:
            vals = X.values    
        return super(SimpleImputerDf, self).fit(vals)
    
    def transform(self, X, y=None):
        if len(X.values.shape)==1:
            vals = X.values.reshape(-1,1)
            names = [X.name]
        else:
            vals = X.values
            names = X.columns
        z = super(SimpleImputerDf, self).transform(vals)
        return pd.DataFrame(z, index=X.index, columns=names)
    
class OrdinalEncoderDf(OrdinalEncoder):
    """DataFrame Wrapper around OrdinalEncoderDf"""
    def __init__(self):
        super(OrdinalEncoderDf, self).__init__()

    def transform(self, X, y=None):
        z = super(OrdinalEncoderDf, self).transform(X.values)
        return pd.DataFrame(z, index=X.index, columns=X.columns)

class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [43]:
# преобразуем поля пола и типа животного
# нет необходимости использовать one hot encoder, так как дерево за нас сделает необходимые разбивки
features_1 = Pipeline([
    ("get features", FunctionTransformer(lambda x: x[["AnimalType", "SexuponOutcome"]])),
    ("impute", SimpleImputerDf(strategy="most_frequent")),
    ("encode", OrdinalEncoderDf())
])

In [44]:
# преобразуем дату в набор год/месяц/день недели
def create_time_features(df):
    dt = pd.to_datetime(df.iloc[:,0])
    return pd.DataFrame([dt.dt.year.rename("yr"), dt.dt.month.rename("mn"), dt.dt.weekday.rename("weekday")]).T

features_2 = Pipeline([
    ("get features", FunctionTransformer(lambda x: x["DateTime"])),
    ("impute", SimpleImputerDf(strategy="most_frequent")),
    ("create_time_features", FunctionTransformer(create_time_features))
])

In [45]:
# преобразуем AgeuponOutcome в числовую переменную
# преобразуем дату в набор год/месяц/день недели
def preprocess_ageuponoutcome(df):
    sr = df.iloc[:,0]
    dct = {
        'weeks':7,
        'years':365, 
        'months':30, 
        'year':365, 
        'days':1, 
        'month':30, 
        'week':7, 
        'day':1        
    }    
    return (sr.apply(lambda x:str(x).split()[1]).map(dct) * sr.apply(lambda x: int(str(x).split()[0])) ).values.reshape(-1,1)

features_3 = Pipeline([
    ("get features", FunctionTransformer(lambda x: x["AgeuponOutcome"])),
    ("impute", SimpleImputerDf(strategy="most_frequent")),
    ("calculate", FunctionTransformer(preprocess_ageuponoutcome)),
])

In [46]:
# прочие переменные преобразуем с помощью Count Encoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from category_encoders import TargetEncoder, CountEncoder
features_4 = Pipeline([
    ("get features", FunctionTransformer(lambda x: x.loc[:,["Breed", "Name", "Color"]])),
    ("impute", SimpleImputerDf(strategy="most_frequent")),
#     ("vectorize", TfidfVectorizer(ngram_range = (1,2), max_features=40)),
    ("encode", CountEncoder(handle_unknown=0, min_group_size=0.05)),
#     ("to_dense", FunctionTransformer(lambda x: x.todense())),
#     ("dense", DenseTransformer())
])

In [47]:
preprocessing_pipeline = FeatureUnion([
    ("1", features_1),
    ("2", features_2),
    ("3", features_3),
    ("4", features_4),    

])

column_names = ["AnimalType", "SexuponOutcome",
             "yr","mn", "weekday",
             "AgeuponOutcome",
             "Breed", "Name", "Color"
            ]

pd.DataFrame(preprocessing_pipeline.fit_transform(X_train), columns=column_names).head()

Unnamed: 0,AnimalType,SexuponOutcome,yr,mn,weekday,AgeuponOutcome,Breed,Name,Color
0,0.0,2.0,2013.0,11.0,5.0,60.0,6186.0,1.0,1561.0
1,1.0,1.0,2015.0,7.0,3.0,30.0,1342.0,5452.0,485.0
2,0.0,1.0,2015.0,8.0,1.0,21.0,18.0,5452.0,25.0
3,0.0,2.0,2013.0,11.0,3.0,365.0,272.0,7.0,60.0
4,0.0,2.0,2015.0,12.0,3.0,1825.0,6186.0,4.0,310.0


## Моделирование

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report


In [49]:
model = RandomForestClassifier()
baseline = DummyClassifier(strategy ="stratified")

In [50]:
pl = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("model", model) 
])
pl.fit(X_train, y_train)
y_predicted = pl.predict(X_test)

In [51]:
baseline.fit(X_train, y_train)
y_predicted_baseline = baseline.predict(X_test)

In [52]:
print("baseline")
print(classification_report(y_test, y_predicted_baseline, target_names =target_dict.keys()))
print("model")
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

baseline
                 precision    recall  f1-score   support

       Adoption       0.42      0.42      0.42      3231
       Transfer       0.35      0.36      0.36      2827
Return_to_owner       0.18      0.17      0.18      1436
     Euthanasia       0.06      0.06      0.06       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.33      8019
      macro avg       0.20      0.20      0.20      8019
   weighted avg       0.33      0.33      0.33      8019

model
                 precision    recall  f1-score   support

       Adoption       0.68      0.81      0.74      3231
       Transfer       0.71      0.66      0.69      2827
Return_to_owner       0.45      0.43      0.44      1436
     Euthanasia       0.57      0.20      0.30       466
           Died       0.75      0.05      0.10        59

       accuracy                           0.65      8019
      macro avg       0.63      0.43      0.45      8019
   weighted

## Видим, что модель гораздо круче базовой линии. Подберем параметры классификатотра и еще раз посмотрим на результат

In [53]:
%%time
param_grid = {
    'model__max_depth': [None, 2,3,5,10,15,20,30,40],
#     'model__min_samples_leaf': [1,2,5,10,0.01,0.05,0.1,0.2],
    'model__n_estimators': np.logspace(0.5,4,20).astype(int)
}
search = RandomizedSearchCV(pl, param_grid, n_jobs=-1, cv=5, verbose=1,n_iter=20)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 24.2min finished


Wall time: 24min 31s


{'model__n_estimators': 784, 'model__max_depth': 10}

In [54]:
pd.DataFrame(search.cv_results_).sort_values(by="rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_estimators,param_model__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,18.205363,0.583672,1.278603,0.187941,784,10.0,"{'model__n_estimators': 784, 'model__max_depth...",0.648316,0.657135,0.654997,0.657135,0.661678,0.655852,0.004353,1
0,3.04333,0.170009,0.449508,0.05918,17,10.0,"{'model__n_estimators': 17, 'model__max_depth'...",0.644041,0.657937,0.653394,0.653394,0.656601,0.653073,0.004856,2
10,10.478196,0.1369,0.895011,0.048441,219,20.0,"{'model__n_estimators': 219, 'model__max_depth...",0.642972,0.64511,0.641101,0.645911,0.651523,0.645323,0.003527,3
8,250.710045,11.384021,65.548157,22.333533,6543,30.0,"{'model__n_estimators': 6543, 'model__max_dept...",0.639765,0.644842,0.639765,0.646713,0.651523,0.644522,0.004454,4
2,277.098594,25.560969,84.219596,35.836678,6543,,"{'model__n_estimators': 6543, 'model__max_dept...",0.640567,0.646446,0.639498,0.64511,0.650722,0.644468,0.004083,5


## на мой вззгляд самая приличная модель у нас на 4 месте - высокий уровень метрики при низком среднеквадратическом отклоннии. При этом глубина модельки маленькая, она быстро обучается. Поищем оптимальное решение в окрестности найденных параметров

In [55]:
%%time
param_grid = {
    'model__max_depth': [8,9,10,11,12],
#     'model__min_samples_leaf': [1,2,5,10,0.01,0.05,0.1,0.2],
    'model__n_estimators': [8,9,10,11,12]
}
search = GridSearchCV(pl, param_grid, n_jobs=-1, cv=5, verbose=1)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  1.8min finished


Wall time: 1min 49s


{'model__max_depth': 9, 'model__n_estimators': 11}

In [56]:
print("модель с подобранными параметрами")
y_predicted = search.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

модель с подобранными параметрами
                 precision    recall  f1-score   support

       Adoption       0.66      0.84      0.74      3231
       Transfer       0.72      0.64      0.68      2827
Return_to_owner       0.47      0.46      0.46      1436
     Euthanasia       0.59      0.09      0.16       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.65      8019
      macro avg       0.49      0.41      0.41      8019
   weighted avg       0.64      0.65      0.63      8019



  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
confusion_matrix(y_test, y_predicted)

array([[2704,  204,  321,    2,    0],
       [ 698, 1803,  307,   19,    0],
       [ 602,  172,  655,    7,    0],
       [  58,  262,  102,   44,    0],
       [   6,   51,    0,    2,    0]], dtype=int64)

In [58]:
### что-то качество не приросло

In [59]:
pd.DataFrame(search.cv_results_).sort_values(by="rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,2.71596,0.059849,0.444609,0.021156,9,11,"{'model__max_depth': 9, 'model__n_estimators':...",0.646446,0.65767,0.652859,0.659808,0.659006,0.655158,0.004978,1
0,2.762977,0.092352,0.453362,0.032222,8,8,"{'model__max_depth': 8, 'model__n_estimators': 8}",0.644575,0.664083,0.649118,0.653928,0.653661,0.653073,0.006483,2
6,2.908193,0.099984,0.471107,0.026341,9,9,"{'model__max_depth': 9, 'model__n_estimators': 9}",0.64511,0.650187,0.656601,0.656601,0.652058,0.652111,0.004314,3
13,2.796883,0.024254,0.472492,0.023761,10,11,"{'model__max_depth': 10, 'model__n_estimators'...",0.648049,0.656868,0.649385,0.64992,0.654196,0.651684,0.00331,4
1,2.91292,0.047846,0.527708,0.025753,8,9,"{'model__max_depth': 8, 'model__n_estimators': 9}",0.645644,0.652058,0.658204,0.653394,0.648584,0.651577,0.00428,5


### посмотрим на важность признаков

In [60]:
pd.DataFrame(
search.best_estimator_["model"].feature_importances_
, index=column_names
).sort_values(by=0)


Unnamed: 0,0
yr,0.015183
mn,0.030267
Color,0.034491
Breed,0.036637
weekday,0.048201
AnimalType,0.062395
Name,0.185142
AgeuponOutcome,0.265096
SexuponOutcome,0.322586


# Начало домашки по градиентному бустингу

In [None]:
## попробуем lightGBM

In [72]:
from lightgbm import LGBMClassifier

In [75]:
%%time
pl = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("model", LGBMClassifier()) 
])
pl.fit(X_train, y_train)
y_predicted = pl.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

                 precision    recall  f1-score   support

       Adoption       0.67      0.84      0.75      3231
       Transfer       0.74      0.65      0.69      2827
Return_to_owner       0.48      0.46      0.47      1436
     Euthanasia       0.61      0.20      0.30       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.66      8019
      macro avg       0.50      0.43      0.44      8019
   weighted avg       0.65      0.66      0.65      8019

Wall time: 3.09 s


### уже из коробки имеем качество выше чем у RandomForest с пододбором параметров.
Попробуем улучшить резутьтат, подобрав оптимальные параметры

In [82]:
%%time
param_grid = {
    'model__max_depth': [2,3,5,10,50,100,-1],
    'model__learning_rate': [0.01,0.1,0.5,1,10],
#     'model__min_samples_leaf': [1,2,5,10,0.01,0.05,0.1,0.2],
    'model__n_estimators': np.logspace(0.5,4,20).astype(int)
}
search = RandomizedSearchCV(pl, param_grid, n_jobs=-1, cv=5, verbose=1,n_iter=40)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 26.6min finished


Wall time: 27min 2s


{'model__n_estimators': 6543,
 'model__max_depth': 2,
 'model__learning_rate': 0.01}

In [83]:
print("модель с подобранными параметрами")
y_predicted = search.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

модель с подобранными параметрами
                 precision    recall  f1-score   support

       Adoption       0.67      0.83      0.75      3231
       Transfer       0.73      0.65      0.69      2827
Return_to_owner       0.48      0.46      0.47      1436
     Euthanasia       0.58      0.16      0.25       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.66      8019
      macro avg       0.49      0.42      0.43      8019
   weighted avg       0.65      0.66      0.64      8019



  _warn_prf(average, modifier, msg_start, len(result))


## прироста в качестве не получили
# Попробуем теперь Catboost

In [84]:
from catboost import CatBoostClassifier

In [86]:
%%time
pl = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("model", CatBoostClassifier(logging_level="Silent")) 
])
pl.fit(X_train, y_train)
y_predicted = pl.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

                 precision    recall  f1-score   support

       Adoption       0.67      0.81      0.73      3231
       Transfer       0.72      0.65      0.68      2827
Return_to_owner       0.46      0.45      0.45      1436
     Euthanasia       0.58      0.21      0.31       466
           Died       1.00      0.03      0.07        59

       accuracy                           0.65      8019
      macro avg       0.69      0.43      0.45      8019
   weighted avg       0.65      0.65      0.64      8019

Wall time: 17 s


## качество сопоставимо с lightgbm. Приятный бонус - наконец смогли предсказать исход "Died". Ни одна предыдущая модель его не находила.
# подберем параметры

In [88]:
%%time
param_grid = {
    'model__depth': [1,2,3,5],
    'model__learning_rate': [0.01,0.1,0.5,1,10],
    'model__l2_leaf_reg': [2,5,8,10],
#     'model__min_samples_leaf': [1,2,5,10,0.01,0.05,0.1,0.2],
#     'model__n_estimators': np.logspace(0.5,4,20).astype(int)
}
search = RandomizedSearchCV(pl, param_grid, n_jobs=-1, cv=3, verbose=1,n_iter=15)
search.fit(X_train, y_train)
search.best_params_

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  7.0min finished


Wall time: 7min 12s


{'model__learning_rate': 0.1, 'model__l2_leaf_reg': 8, 'model__depth': 3}

In [89]:
print("модель с подобранными параметрами")
y_predicted = search.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

модель с подобранными параметрами
                 precision    recall  f1-score   support

       Adoption       0.67      0.83      0.74      3231
       Transfer       0.72      0.65      0.69      2827
Return_to_owner       0.47      0.45      0.46      1436
     Euthanasia       0.59      0.17      0.26       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.65      8019
      macro avg       0.49      0.42      0.43      8019
   weighted avg       0.65      0.65      0.64      8019



  _warn_prf(average, modifier, msg_start, len(result))


## прироста в качестве не получили
# Попробуем теперь XGBOOST

In [92]:
from  xgboost import XGBClassifier

In [94]:
%%time
pl = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("model", XGBClassifier())
])
pl.fit(X_train, y_train)
y_predicted = pl.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

                 precision    recall  f1-score   support

       Adoption       0.66      0.84      0.74      3231
       Transfer       0.72      0.65      0.68      2827
Return_to_owner       0.49      0.43      0.46      1436
     Euthanasia       0.65      0.14      0.23       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.65      8019
      macro avg       0.50      0.41      0.42      8019
   weighted avg       0.65      0.65      0.64      8019

Wall time: 7.47 s


  _warn_prf(average, modifier, msg_start, len(result))


## качество похоже на lightgbm, а работает вдвое дольше. И категорию "died" тоже не предсказывает. Подберем параметры

In [99]:
%%time
param_grid = {
    'model__depth': [1,2,3,5,7,9],
    'model__learning_rate': [0.01,0.1,0.5,1,10],
#     'model__l2_leaf_reg': [2,5,8,10],
#     'model__min_samples_leaf': [1,2,5,10,0.01,0.05,0.1,0.2],
#     'model__n_estimators': np.logspace(0.5,4,20).astype(int)
}
search = RandomizedSearchCV(pl, param_grid, n_jobs=-1, cv=5, verbose=1,n_iter=20)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.6min finished


Wall time: 3min 40s


{'model__learning_rate': 0.5, 'model__depth': 7}

In [100]:
print("модель с подобранными параметрами")
y_predicted = search.predict(X_test)
print(classification_report(y_test, y_predicted, target_names =target_dict.keys()))

модель с подобранными параметрами
                 precision    recall  f1-score   support

       Adoption       0.67      0.82      0.74      3231
       Transfer       0.73      0.66      0.69      2827
Return_to_owner       0.48      0.45      0.46      1436
     Euthanasia       0.63      0.20      0.30       466
           Died       0.00      0.00      0.00        59

       accuracy                           0.66      8019
      macro avg       0.50      0.43      0.44      8019
   weighted avg       0.65      0.66      0.64      8019



  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
## опять не получили заметного прироста качества