In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV

In [3]:
abalone = pd.read_csv('../data/abalone_preprocessed.csv', index_col=0)
abalone.columns


Index(['Female', 'Infant', 'Male', 'Length', 'Diameter', 'Height',
       'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
       'Rings'],
      dtype='object')

In [4]:
abalone.head()

Unnamed: 0,Female,Infant,Male,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.0,0.0,1.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.0,0.0,1.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1.0,0.0,0.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.0,0.0,1.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.0,1.0,0.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
abalone["Rings"].unique()

array([15,  7,  9, 10,  8, 20, 16, 19, 14, 11, 12, 18, 13,  5,  4,  6, 21,
       17, 22,  1,  3, 26, 23, 29,  2, 27, 25, 24])

In [None]:
for i in range(1,30):
  print(abalone[abalone.Rings == i].shape[0])

1
1
15
57
115
259
391
568
689
634
487
267
203
126
103
67
58
42
32
26
14
6
9
2
1
1
2
0
1


In [None]:
abalone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Female          4177 non-null   float64
 1   Infant          4177 non-null   float64
 2   Male            4177 non-null   float64
 3   Length          4177 non-null   float64
 4   Diameter        4177 non-null   float64
 5   Height          4177 non-null   float64
 6   Whole weight    4177 non-null   float64
 7   Shucked weight  4177 non-null   float64
 8   Viscera weight  4177 non-null   float64
 9   Shell weight    4177 non-null   float64
 10  Rings           4177 non-null   int64  
dtypes: float64(10), int64(1)
memory usage: 359.1 KB


Разобьём выборку на категориальные признаки, числовые признаки и на классы.

In [None]:
X_cat = abalone[['Female','Infant', 'Male']]
y = pd.DataFrame(abalone.iloc[:,-1])
X_real = abalone.iloc[:, 3:-1]

In [None]:
y.head()

Unnamed: 0,Rings
0,15
1,7
2,9
3,10
4,7


Заменим значения y на возрастной диапазон к которому относится экзэмпляр. С 1 до 7 колец - молодой, с 8 до 11 средний, с 12 до 29 старый.

In [None]:
for i in range(len(y)):
  if(y['Rings'][i] < 8):
    y['Rings'][i] = "young"
  elif(y['Rings'][i] < 12):
    y['Rings'][i] = "middle"
  else:
    y['Rings'][i] = "old"

In [None]:
y.head()

Unnamed: 0,Rings
0,old
1,young
2,middle
3,middle
4,young


In [None]:
(X_real_train, X_real_test, y_train, y_test) =  train_test_split(X_real, y, test_size = 0.2, random_state = 0)

(X_cat_train, X_cat_test) = train_test_split(X_cat, test_size=0.2, random_state = 0)

In [None]:
scaler = StandardScaler()
scaler.fit(X_real_train, y_train)
X_real_train_scaled = scaler.transform(X_real_train)
X_real_test_scaled = scaler.transform(X_real_test)

In [None]:
X_real_train_scaled

array([[-2.88863777, -2.77572159, -1.41092732, ..., -1.55762803,
        -1.58940432, -1.65503169],
       [-2.59633945, -2.62402222, -1.99755535, ..., -1.55539711,
        -1.57129213, -1.6371499 ],
       [ 1.12002486,  1.21902846,  0.70093358, ...,  1.01239041,
         0.26709517,  1.51362157],
       ...,
       [ 0.57718513,  0.40996516,  0.11430555, ...,  0.44350612,
         0.21728665,  0.0866547 ],
       [ 0.82772654,  0.81449681,  0.58360797, ...,  0.50597185,
         0.4889695 ,  0.55158125],
       [-0.96782026, -0.85419625, -0.70697369, ..., -1.0311312 ,
        -0.97358986, -1.00413452]])

In [None]:
X_cat_train = pd.DataFrame(X_cat_train)
X_cat_test = pd.DataFrame(X_cat_test)

X_real_train_scaled = pd.DataFrame(X_real_train_scaled)
X_real_test_scaled = pd.DataFrame(X_real_test_scaled)

X_cat_train.index = X_real_train_scaled.index
X_train = pd.concat([X_real_train_scaled, X_cat_train], axis=1)

X_cat_test.index = X_real_test_scaled.index
X_test = pd.concat([X_real_test_scaled, X_cat_test], axis=1)

In [None]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,Female,Infant,Male
0,-2.888638,-2.775722,-1.410927,-1.630852,-1.557628,-1.589404,-1.655032,0.0,1.0,0.0
1,-2.596339,-2.624022,-1.997555,-1.614648,-1.555397,-1.571292,-1.63715,0.0,1.0,0.0
2,1.120025,1.219028,0.700934,1.118712,1.01239,0.267095,1.513622,0.0,0.0,1.0
3,1.579351,1.168462,1.404887,2.13043,2.616421,2.29566,1.352685,0.0,0.0,1.0
4,0.577185,0.460532,0.348957,0.410813,0.2338,0.253511,0.884183,0.0,0.0,1.0


Обучим модель метода k-ближайших соседей.

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[362  44  36]
 [111  87   2]
 [ 61   0 133]]
              precision    recall  f1-score   support

      middle       0.68      0.82      0.74       442
         old       0.66      0.43      0.53       200
       young       0.78      0.69      0.73       194

    accuracy                           0.70       836
   macro avg       0.71      0.65      0.67       836
weighted avg       0.70      0.70      0.69       836



  


Попробуем подобрать параметры.

In [None]:
knn.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [None]:
knn_parameters_grid = {'n_neighbors': range(1, 10),
                       'weights': ['uniform', 'distance'],
                       'metric': ['minkowski', 'manhattan', 'euclidean', 'chebyshev'],
                       'algorithm': ['brute', 'ball_tree', 'KD_tree', 'auto']
                       }

In [None]:
knn_grid = GridSearchCV(knn, knn_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(X_train,y_train)
knn_grid.best_params_, knn_grid.best_score_

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 764 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:   34.7s finished
  self.best_estimator_.fit(X, y, **fit_params)


({'algorithm': 'brute',
  'metric': 'manhattan',
  'n_neighbors': 9,
  'weights': 'uniform'},
 0.7210395352792174)

Удалось немного повысить точность предсказания.

Теперь обучим классификатор дерева решений.

In [None]:
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_pred_2 = dtc.predict(X_test)
print(confusion_matrix(y_test, y_pred_2))
print(classification_report(y_test, y_pred_2))

[[326  78  38]
 [ 92 103   5]
 [ 57   3 134]]
              precision    recall  f1-score   support

      middle       0.69      0.74      0.71       442
         old       0.56      0.52      0.54       200
       young       0.76      0.69      0.72       194

    accuracy                           0.67       836
   macro avg       0.67      0.65      0.66       836
weighted avg       0.67      0.67      0.67       836



Попробуем подобрать гиперпараметры для этой модели.

In [None]:
dtc.get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [None]:
dtc_parameters_grid = {'max_depth': range(1,6),
                      'max_features': range(1,7),
                      'min_samples_leaf' : range(1,10)}

In [None]:
dtc_grid = GridSearchCV(dtc, dtc_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
dtc_grid.fit(X_train, y_train)
dtc_grid.best_params_, dtc_grid.best_score_

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed:    8.6s finished


({'max_depth': 5, 'max_features': 6, 'min_samples_leaf': 1}, 0.711167351395863)

Точность увеличилась сильнее чем у прошлой модели после подбора параметров, но всё же оказалась менее точной.

Попробуем сделать то же с наивным байесовским классификатором

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_3 = nb.predict(X_test)
print(confusion_matrix(y_test, y_pred_3))
print(classification_report(y_test, y_pred_3))

[[216 136  90]
 [ 92 100   8]
 [ 33   2 159]]
              precision    recall  f1-score   support

      middle       0.63      0.49      0.55       442
         old       0.42      0.50      0.46       200
       young       0.62      0.82      0.71       194

    accuracy                           0.57       836
   macro avg       0.56      0.60      0.57       836
weighted avg       0.58      0.57      0.56       836



  y = column_or_1d(y, warn=True)


Точность получилась маленькая.

Проверим метод опорных векторов.

In [None]:
svm = svm.SVC()
svm.fit(X_train, y_train)
y_pred_4 = svm.predict(X_test)
print(confusion_matrix(y_test, y_pred_4))
print(classification_report(y_test, y_pred_4))

  y = column_or_1d(y, warn=True)


[[393  23  26]
 [123  75   2]
 [ 56   0 138]]
              precision    recall  f1-score   support

      middle       0.69      0.89      0.78       442
         old       0.77      0.38      0.50       200
       young       0.83      0.71      0.77       194

    accuracy                           0.72       836
   macro avg       0.76      0.66      0.68       836
weighted avg       0.74      0.72      0.71       836



Точность изначально выше чем у моделей, стоящих раньше.

In [None]:
svm.get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [None]:
svm_parameters_grid = {'C': [0.1, 1, 10, 100, 1000], 

              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],

              'kernel': ['rbf']}

In [None]:
svm_grid = GridSearchCV(svm, svm_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
svm_grid.fit(X_train, y_train)
svm_grid.best_params_, svm_grid.best_score_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   44.2s finished
  y = column_or_1d(y, warn=True)


({'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}, 0.7530696454624384)

После подбора гиперпараметров эта модель показала самую высокую точность.

Проверим последнюю модель.

In [None]:
lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred_5 = lg.predict(X_test)
print(confusion_matrix(y_test, y_pred_5))
print(classification_report(y_test, y_pred_5))

[[386  27  29]
 [118  81   1]
 [ 57   0 137]]
              precision    recall  f1-score   support

      middle       0.69      0.87      0.77       442
         old       0.75      0.41      0.53       200
       young       0.82      0.71      0.76       194

    accuracy                           0.72       836
   macro avg       0.75      0.66      0.68       836
weighted avg       0.73      0.72      0.71       836



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
lg.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [None]:
lg_parameters_grid = {'penalty' : ['l1', 'l2'],
                       'C' : [1.0, 10, 100, 200],
                       'max_iter': [100,  200, 300],
                       }

In [None]:
lg_grid = GridSearchCV(lg, lg_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
lg_grid.fit(X_train, y_train)
lg_grid.best_params_, lg_grid.best_score_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    8.0s finished
  y = column_or_1d(y, warn=True)


({'C': 100, 'max_iter': 200, 'penalty': 'l2'}, 0.7446864119295042)

Результат немного меньше чем у прошлой модели, но выше чем у остальных.