In [2]:
import pandas as pd



In [3]:
# Загружаем данные
train_data = pd.read_csv('Train.csv')

In [4]:
# Выводим статистическую информацию
print(train_data.describe())

              Area  MajorAxisLenght  MinorAxisLenght  Eccentricity  \
count   215.000000       215.000000       215.000000    215.000000   
mean    832.144186        63.234226        15.305780      0.933431   
std     839.750421        41.410531         7.555325      0.105220   
min      32.000000         6.582800         3.712800      0.107400   
25%     308.000000        38.618400         9.822000      0.932750   
50%     477.000000        49.822900        14.790600      0.958400   
75%     797.000000        64.523750        17.637850      0.984050   
max    4086.000000       223.221800        55.172700      0.998300   

       Orientation   ConvexArea   FilledArea  EulerNumber  EquivDiameter  \
count   215.000000   215.000000   215.000000   215.000000     215.000000   
mean     -2.692709   901.037209   835.874419     0.069767      29.395838   
std      22.868972   939.131627   845.117057     2.535683      14.011307   
min     -88.452200    33.000000    32.000000   -23.000000       6

In [5]:
# Выводим названия столбцов
print("Columns:", train_data.columns)

# Выводим несколько строк данных
print("First few rows of data:")
print(train_data.head())

Columns: Index(['Area', 'MajorAxisLenght', 'MinorAxisLenght', 'Eccentricity',
       'Orientation', 'ConvexArea', 'FilledArea', 'EulerNumber',
       'EquivDiameter', 'Solidity', 'Extent', 'Perimeter', 'Class'],
      dtype='object')
First few rows of data:
     Area  MajorAxisLenght  MinorAxisLenght  Eccentricity  Orientation  \
0     NaN              NaN              NaN           NaN          NaN   
1   311.0          45.6957           8.7998        0.9813      -5.7814   
2     NaN              NaN              NaN           NaN          NaN   
3  1690.0          69.0800          32.8673        0.8796     -15.6711   
4     NaN              NaN              NaN           NaN          NaN   

   ConvexArea  FilledArea  EulerNumber  EquivDiameter  Solidity  Extent  \
0         NaN         NaN          NaN            NaN       NaN     NaN   
1       334.0       311.0          1.0        19.8992    0.9311  0.7068   
2         NaN         NaN          NaN            NaN       NaN     NaN 

In [13]:
# Проверяем наличие пропущенных значений
print(train_data.isnull().sum())

# Заполняем удаляем строки с NaN
train_data = train_data.dropna()

Area               0
MajorAxisLenght    0
MinorAxisLenght    0
Eccentricity       0
Orientation        0
ConvexArea         0
FilledArea         0
EulerNumber        0
EquivDiameter      0
Solidity           0
Extent             0
Perimeter          0
Class              0
dtype: int64


In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_data['Class'] = label_encoder.fit_transform(train_data['Class'])

In [15]:
# Выводим корреляционную матрицу
correlation_matrix = train_data.corr()
print(correlation_matrix['Class'].sort_values(ascending=False))

Class              1.000000
Solidity           0.217749
Area               0.170209
FilledArea         0.170035
ConvexArea         0.163072
MajorAxisLenght    0.153819
EquivDiameter      0.144184
Perimeter          0.136842
MinorAxisLenght    0.068945
Eccentricity       0.062760
EulerNumber        0.059093
Extent            -0.013353
Orientation       -0.016109
Name: Class, dtype: float64


In [16]:
# Распределение классов
print(train_data['Class'].value_counts())

Class
2    60
1    52
0    52
3    51
Name: count, dtype: int64


In [17]:
# Разделение данных
X = train_data.drop('Class', axis=1)
y = train_data['Class']

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Разделить данные на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Классификатор k-ближайших соседей
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)

print("KNN Accuracy:", knn_accuracy)

# Градиентный бустинг
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)

print("Gradient Boosting Accuracy:", gb_accuracy)

KNN Accuracy: 0.9302325581395349
Gradient Boosting Accuracy: 0.9069767441860465


In [19]:
from sklearn.model_selection import GridSearchCV

# Поиск гиперпараметров для KNN
knn_params = {'n_neighbors': range(1, 11)}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train, y_train)
print("Best KNN Params:", knn_grid.best_params_)

# Поиск гиперпараметров для Gradient Boosting
gb_params = {'n_estimators': [50, 100, 150]}
gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_params, cv=5)
gb_grid.fit(X_train, y_train)
print("Best Gradient Boosting Params:", gb_grid.best_params_)

Best KNN Params: {'n_neighbors': 1}
Best Gradient Boosting Params: {'n_estimators': 150}


In [20]:
best_knn = knn_grid.best_estimator_
best_gb = gb_grid.best_estimator_

# Оценка производительности
print("Best KNN Accuracy:", accuracy_score(y_test, best_knn.predict(X_test)))
print("Best Gradient Boosting Accuracy:", accuracy_score(y_test, best_gb.predict(X_test)))

Best KNN Accuracy: 0.9302325581395349
Best Gradient Boosting Accuracy: 0.9069767441860465


In [21]:
# Важность признаков для градиентного бустинга
feature_importances = pd.Series(best_gb.feature_importances_, index=X.columns)
print("Feature Importances:\n", feature_importances.sort_values(ascending=False))

Feature Importances:
 Perimeter          0.188735
Extent             0.188459
ConvexArea         0.148147
Area               0.109187
FilledArea         0.072224
MajorAxisLenght    0.061717
EquivDiameter      0.058448
Solidity           0.058158
EulerNumber        0.054766
Orientation        0.031972
Eccentricity       0.019811
MinorAxisLenght    0.008375
dtype: float64


In [22]:
# Описание каждого класса
class_counts = train_data['Class'].value_counts()
print("Class distribution:\n", class_counts)

Class distribution:
 Class
2    60
1    52
0    52
3    51
Name: count, dtype: int64


In [23]:
# Загрузка тестовых данных
test_data = pd.read_csv('Test.csv')

In [33]:
test_data = test_data.dropna()
# Убедимся, что столбцы совпадают
train_columns = X.columns
test_columns = test_data.columns

# Выведем разницу для диагностики
print("Columns in train data but not in test data:", set(train_columns) - set(test_columns))
print("Columns in test data but not in train data:", set(test_columns) - set(train_columns))

# Оставим только те столбцы в test_data, которые есть в train_data
test_data_aligned = test_data[train_columns]

# Выполним теперь преобразования и предсказание
test_data_imputed = imputer.transform(test_data_aligned)

# Предсказание классов после обработки NaN значений
test_predictions = best_gb.predict(test_data_imputed)
test_data['Predicted_Class'] = test_predictions

print(test_data['Predicted_Class'])


Columns in train data but not in test data: set()
Columns in test data but not in train data: {'Predicted_Class'}


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Predicted_Class


In [34]:
# Описание классов в тестовой выборке
test_class_counts = test_data['Predicted_Class'].value_counts()
print("Test Class Distribution:\n", test_class_counts)

Test Class Distribution:
 Predicted_Class
2    112
Name: count, dtype: int64


In [35]:
from sklearn.preprocessing import StandardScaler

# Стандартизация данных
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Разделим данные и заново выполним предыдущие шаги
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Строим классификаторы
best_knn.fit(X_train_scaled, y_train_scaled)
best_gb.fit(X_train_scaled, y_train_scaled)

# Оцениваем производительность
print("Best KNN (scaled) Accuracy:", accuracy_score(y_test_scaled, best_knn.predict(X_test_scaled)))
print("Best Gradient Boosting (scaled) Accuracy:", accuracy_score(y_test_scaled, best_gb.predict(X_test_scaled)))

# Стандартизация тестовых данных
test_data_scaled = scaler.transform(test_data.drop('Predicted_Class', axis=1))
test_predictions_scaled = best_gb.predict(test_data_scaled)
test_data['Predicted_Class_Scaled'] = test_predictions_scaled

Best KNN (scaled) Accuracy: 0.9069767441860465
Best Gradient Boosting (scaled) Accuracy: 0.9069767441860465


In [36]:
# Сравнение результатов до и после стандартизации
print("Original Best Accuracy:\n", max(knn_accuracy, gb_accuracy))
print("Scaled Best Accuracy:\n", max(accuracy_score(y_test_scaled, best_knn.predict(X_test_scaled)), 
                                     accuracy_score(y_test_scaled, best_gb.predict(X_test_scaled))))

Original Best Accuracy:
 0.9302325581395349
Scaled Best Accuracy:
 0.9069767441860465
