In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
DATASET = "https://raw.githubusercontent.com/aiedu-courses/stepik_eda_and_dev_tools/main/datasets/abalone.csv"

In [3]:
df = pd.read_csv(DATASET)

In [4]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
max_value = df['Rings'].max()
print(max_value)

29


In [6]:
min_value = df['Rings'].min()
print(min_value)

1


In [7]:
mean_value = df['Rings'].mean()
print(mean_value)

9.933684462532918


In [8]:
df.isnull().mean() * 100

Sex               0.000000
Length            0.000000
Diameter          2.370122
Height            0.000000
Whole weight      2.370122
Shucked weight    0.000000
Viscera weight    0.000000
Shell weight      1.197031
Rings             0.000000
dtype: float64

In [9]:
median_Diameter = df["Diameter"].median() #поиск и заполнение пропусков медианным значением по столбцу 'Diameter'

df["Diameter"].fillna(median_Diameter, inplace=True)

In [10]:
median_Whole_weight = df["Whole weight"].median() #поиск и заполнение пропусков медианным значением по столбцу 'Whole weight'

df["Whole weight"].fillna(median_Whole_weight, inplace=True)

median_Shell_weight = df["Shell weight"].median() #поиск и заполнение пропусков медианным значением по столбцу 'Shell weight'

df["Shell weight"].fillna(median_Shell_weight, inplace=True)

In [11]:
df.isnull().mean() * 100

Sex               0.0
Length            0.0
Diameter          0.0
Height            0.0
Whole weight      0.0
Shucked weight    0.0
Viscera weight    0.0
Shell weight      0.0
Rings             0.0
dtype: float64

## Наивный Байесовский классификатор


In [12]:
X = df[['Length','Diameter','Height','Whole weight','Shucked weight', 'Viscera weight', 'Shell weight']]
y = df['Rings']

y_class = (y > 8).astype(int)

In [13]:
y_class.value_counts()

1    2770
0    1407
Name: Rings, dtype: int64

Если будем использовать y_class = (y > 9).astype(int) - Тогда разделение на кассы будет равнозначным, но при этом значение метрики будет ниже, поэтому оставил y > 8

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.25, random_state=42)

In [15]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7990430622009569

In [17]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[290,  66],
       [144, 545]])

Не могу точно сказать хорошо ли справилась модель, судя по значению метрики accuracy 0.8, но в общем случае данное значение можно считать хорошим, но недостаточно точным

## KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn_cl = KNeighborsClassifier()

knn_cl.fit(X_train, y_train)

pred_knn = knn_cl.predict(X_test)

In [19]:
accuracy_score(y_test, pred_knn)

0.8277511961722488

In [20]:
confusion_matrix(y_test, pred_knn)

array([[261,  95],
       [ 85, 604]])

Также, как и в случае с первым методом, не могу точно сказать, насколько хорошо сработала модель.
В общем случае скажу, как и с первой моделью, что модель справилась неплохо

## Подбор гиперпараметров

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

Гиперпараметры для модели
Байесовского классификатора

In [22]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.25, random_state=42)

model = GaussianNB()

param_grid = {'priors': [[0.3, 0.7], [0.4, 0.6], [0.5, 0.5]]}
gnb = GridSearchCV(model, param_grid, cv=5)

gnb.fit(X_train, y_train)

In [23]:
pred = gnb.best_estimator_.predict(X_test)

accuracy_score(y_test, pred)

0.8

Гиперпараметры для модели KNeighborsClassifier

In [24]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.25, random_state=42)

model = KNeighborsClassifier()

params = {'n_neighbors' : np.arange(2, 20, 2),
          'weights' : ['uniform', 'distance'],
          'p' : [1, 2]}

gs = GridSearchCV(model, params, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [25]:
gs.best_score_, gs.best_params_

(0.8365261813537676, {'n_neighbors': 12, 'p': 1, 'weights': 'distance'})

In [26]:
pred = gs.best_estimator_.predict(X_test)

accuracy_score(y_test, pred)

0.8325358851674641

In [27]:
confusion_matrix(y_test, pred)

array([[259,  97],
       [ 78, 611]])

Значение метрики acc стало лучше на 0,6% от первоначального

Основываясь на метрике accuracy_score выбираем для дальнейшего пути подель KNN, так как её значение больше на 0,03

## Добавление категориальных признаков

In [28]:
X_full = df.drop('Rings', axis=1)

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_class, test_size=0.25, random_state=42)

In [29]:
X_train_full.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
dtype: object

In [30]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

categorical = ['Sex']
numeric_features = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical),
    ('scaling', MinMaxScaler(), numeric_features)
])

X_train_transformed = ct.fit_transform(X_train_full)
X_test_transformed = ct.transform(X_test_full)

In [31]:
new_features = list(ct.named_transformers_['ohe'].get_feature_names_out())
new_features.extend(numeric_features)

new_features

['Sex_F',
 'Sex_I',
 'Sex_M',
 'Sex_f',
 'Length',
 'Diameter',
 'Height',
 'Whole weight',
 'Shucked weight',
 'Viscera weight',
 'Shell weight']

In [32]:
X_train_transformed = pd.DataFrame(X_train_transformed, columns=new_features)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=new_features)

X_train_transformed.head()

Unnamed: 0,Sex_F,Sex_I,Sex_M,Sex_f,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,1.0,0.0,0.0,0.0,0.72973,0.672269,0.119469,0.374358,0.317754,0.345622,0.27155
1,1.0,0.0,0.0,0.0,0.594595,0.571429,0.123894,0.242253,0.188299,0.164582,0.217738
2,0.0,0.0,1.0,0.0,0.790541,0.798319,0.154867,0.559943,0.496638,0.460829,0.402093
3,0.0,0.0,1.0,0.0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982
4,0.0,0.0,1.0,0.0,0.567568,0.579832,0.137168,0.285638,0.157028,0.151415,0.347285


In [33]:
model = KNeighborsClassifier()

params = {'n_neighbors' : np.arange(2, 20, 2),
          'weights' : ['uniform', 'distance'],
          'p' : [1, 2]}

gs = GridSearchCV(model, params, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)
gs.fit(X_train_transformed, y_train_full)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [34]:
pred = gs.best_estimator_.predict(X_test_transformed)

accuracy_score(y_test_full, pred)

0.8363636363636363

In [35]:
confusion_matrix(y_test_full, pred)

array([[257,  99],
       [ 72, 617]])

## Построение дашборда

In [36]:
!pip install explainerdashboard -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.2/287.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.2/221.2 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m100.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.8/91.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.4/145.4 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [37]:
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

In [38]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [39]:
explainer = ClassifierExplainer(gs.best_estimator_, X_test_transformed.iloc[:10], y_test_full.iloc[:10])

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')


In [40]:

db = ExplainerDashboard(explainer)

Building ExplainerDashboard..
Detected google colab environment, setting mode='external'
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...



JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



  0%|          | 0/10 [00:00<?, ?it/s]

Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating pred_percentiles...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...


In [41]:
db.run()

Starting ExplainerDashboard on http://172.28.0.12:8050
You can terminate the dashboard with ExplainerDashboard.terminate(8050)


<IPython.core.display.Javascript object>

Dash app running on:


<IPython.core.display.Javascript object>

1) самыми важными факторами оказалиь (Shell, Shucked, Viscera) - при методе SHAP
при методе Permutation - (Shell, Length, Viscera)

2)  Значение метрик:
 - accuracy = 0.7 (модель правильно предсказала класс в 70% случаев)
 - roc_auc_score, pr_auc_score - данные метрики показывают, что мы достаточно хорошо прогнозируем точность классов (их значения: 0.917 и 0.888 cоответственно)
 - precision = 1 очень точное значение (Чем выше значение precision, тем более точной считается модель в определении положительных примеров)
 - recall = 0.25 маленькое значение (Чем выше значение recall, тем более полной считается модель в поиске положительных примеров) модель плоха в поиске положительных примеров

3) Прогнозы
 - Возьмём индекс равный 7. Мы видем, что он на 71 % будет относиться к классу 0 параметра Rings, в большей степени на данное значение повлияло - Shell weight дав 16%, самое маленькое влияние оказывает Diameter = 1.88%
 - индекс равный 0. С вероятностью 94% он относится к 1ой категории. Основное влияние оказывает Shell weight = 10.65%, а самое незначительное Sex_M = 1.03%  