In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Baseline experiment: train KNN and a decision tree on the cleaned wine-quality
# data and report accuracy plus weighted precision / recall / F1 on a hold-out set.

# Load the data
df = pd.read_csv('cleaned_wine_quality_dataset.csv')

# Features are every column except the target; 'quality' is the class label
X = df.drop('quality', axis=1)
y = df['quality']

# Hold out 30% of the rows for testing (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialise and fit both models on the training split.
# NOTE(review): KNN is distance-based and the features are passed in as-is;
# consider standardising them — TODO confirm whether the cleaned CSV is already scaled.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test split
knn_pred = knn_model.predict(X_test)
dt_pred = dt_model.predict(X_test)

# Evaluate the models.
# Per-class scores are averaged with weights proportional to class support.
# zero_division=0 makes the handling of classes with an undefined score explicit:
# they still count as 0 (the same value the default "warn" mode uses), but the
# UndefinedMetricWarning is no longer emitted on every call.
WEIGHTED_KWARGS = dict(average='weighted', zero_division=0)

knn_accuracy = metrics.accuracy_score(y_test, knn_pred)
dt_accuracy = metrics.accuracy_score(y_test, dt_pred)

knn_precision = metrics.precision_score(y_test, knn_pred, **WEIGHTED_KWARGS)
dt_precision = metrics.precision_score(y_test, dt_pred, **WEIGHTED_KWARGS)

knn_recall = metrics.recall_score(y_test, knn_pred, **WEIGHTED_KWARGS)
dt_recall = metrics.recall_score(y_test, dt_pred, **WEIGHTED_KWARGS)

knn_f1 = metrics.f1_score(y_test, knn_pred, **WEIGHTED_KWARGS)
dt_f1 = metrics.f1_score(y_test, dt_pred, **WEIGHTED_KWARGS)

# ROC AUC is intentionally not reported here: for a multiclass target
# metrics.roc_auc_score cannot be computed from hard label predictions; it needs
# per-class probabilities (predict_proba) together with multi_class='ovr' or 'ovo'.

print('KNN Метрики:')
print('Accuracy:', knn_accuracy)
print('Precision:', knn_precision)
print('Recall:', knn_recall)
print('F1-мера:', knn_f1)

print('\nДерево решений Метрики:')
print('Accuracy:', dt_accuracy)
print('Precision:', dt_precision)
print('Recall:', dt_recall)
print('F1-мера:', dt_f1)

KNN Метрики:
Accuracy: 0.497893850042123
Precision: 0.48929555554080806
Recall: 0.497893850042123
F1-мера: 0.4889240863015983

Дерево решений Метрики:
Accuracy: 0.5939342881213142
Precision: 0.5944038605544898
Recall: 0.5939342881213142
F1-мера: 0.5937164628875736


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Experiment: run k-means on the whole dataset and check whether the resulting
# cluster label, used as an extra feature, improves the classifiers' accuracy.
# NOTE(review): the clusters are fitted on all rows, i.e. before any train/test
# split, so rows that are later used for testing also shape the cluster assignments.

# Number of k-means clusters used to build the extra feature
N_CLUSTERS = 150

# Select the features for clustering: everything except the target.
# A 'cluster' column left over from an earlier run of this cell (or picked up
# when `df` was reloaded from the clustered CSV) is excluded as well, so that
# re-running the cell never feeds old cluster labels back into k-means.
X = df.drop(columns=['quality', 'cluster'], errors='ignore')

# Initialize the KMeans model.
# n_init is passed explicitly (10 is the value the implicit default resolved to
# in the recorded run) to keep the result stable across scikit-learn versions
# and to silence the FutureWarning about the changing default.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0)

# Fit the model to the data
kmeans.fit(X)

# Add the predicted clusters to the dataframe and persist the result for the
# follow-up classification experiment
df['cluster'] = kmeans.labels_
df.to_csv('cleaned_and_clustered_wine_quality_dataset.csv', index=False)

  super()._check_params_vs_input(X, default_n_init=10)


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Follow-up experiment: same KNN / decision-tree evaluation as the baseline,
# but on the dataset that additionally carries the k-means 'cluster' column.

# Load the data
df = pd.read_csv('cleaned_and_clustered_wine_quality_dataset.csv')

# Features are every column except the target; 'quality' is the class label.
# NOTE(review): this keeps 'cluster' as an ordinary numeric feature although a
# k-means label is only an arbitrary id; distance-based KNN will treat label 3 as
# "closer" to 4 than to 100 — consider one-hot encoding it. TODO confirm intent.
X = df.drop('quality', axis=1)
y = df['quality']

# Hold out 30% of the rows for testing (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialise and fit both models on the training split
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test split
knn_pred = knn_model.predict(X_test)
dt_pred = dt_model.predict(X_test)

# Evaluate the models.
# Per-class scores are averaged with weights proportional to class support.
# zero_division=0 makes the handling of classes with an undefined score explicit:
# they still count as 0 (the same value the default "warn" mode uses), but the
# UndefinedMetricWarning is no longer emitted on every call.
WEIGHTED_KWARGS = dict(average='weighted', zero_division=0)

knn_accuracy = metrics.accuracy_score(y_test, knn_pred)
dt_accuracy = metrics.accuracy_score(y_test, dt_pred)

knn_precision = metrics.precision_score(y_test, knn_pred, **WEIGHTED_KWARGS)
dt_precision = metrics.precision_score(y_test, dt_pred, **WEIGHTED_KWARGS)

knn_recall = metrics.recall_score(y_test, knn_pred, **WEIGHTED_KWARGS)
dt_recall = metrics.recall_score(y_test, dt_pred, **WEIGHTED_KWARGS)

knn_f1 = metrics.f1_score(y_test, knn_pred, **WEIGHTED_KWARGS)
dt_f1 = metrics.f1_score(y_test, dt_pred, **WEIGHTED_KWARGS)

# ROC AUC is intentionally not reported here: for a multiclass target
# metrics.roc_auc_score cannot be computed from hard label predictions; it needs
# per-class probabilities (predict_proba) together with multi_class='ovr' or 'ovo'.

print('KNN Метрики:')
print('Accuracy:', knn_accuracy)
print('Precision:', knn_precision)
print('Recall:', knn_recall)
print('F1-мера:', knn_f1)

print('\nДерево решений Метрики:')
print('Accuracy:', dt_accuracy)
print('Precision:', dt_precision)
print('Recall:', dt_recall)
print('F1-мера:', dt_f1)

KNN Метрики:
Accuracy: 0.4785172704296546
Precision: 0.46844033325139894
Recall: 0.4785172704296546
F1-мера: 0.4697342678960042

Дерево решений Метрики:
Accuracy: 0.5998315080033698
Precision: 0.5989745312547512
Recall: 0.5998315080033698
F1-мера: 0.59921404951216


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
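# Conclusion: on this data kNN performs markedly worse than the decision tree
# (accuracy ≈ 0.50 vs ≈ 0.59). Adding the k-means cluster label as a feature did
# not help: kNN dropped to ≈ 0.48 and the decision tree stayed at ≈ 0.60.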

KeyError: 'quality'