In [3]:
import pandas as pd
import numpy as np

In [4]:
data = '/Users/leona/OneDrive/Documentos/GitHub/wine-classification/wine_prices/analisys/final_file.csv'

In [5]:
df = pd.read_csv(data)

In [6]:
x = df.drop(columns=['quality'])
y = df['quality']

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score, f1_score

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=15, stratify=y)

In [9]:
strat = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

RANDOM FOREST

In [10]:
# Param for RandomForestClassifier
param_ram = {
    'n_estimators': np.array([100, 110, 115]),
    'criterion': ["gini", "entropy"],
    'max_depth': np.array([18, 19]),
    'min_samples_split': np.array([2, 8])
}

In [11]:
ram = RandomForestClassifier()
grid_ram = GridSearchCV(estimator=ram, param_grid=param_ram, cv=strat, n_jobs=-1)
grid_ram.fit(x, y)

print(grid_ram.best_score_)
print(grid_ram.best_params_)

0.6910861609522116
{'criterion': 'entropy', 'max_depth': 18, 'min_samples_split': 2, 'n_estimators': 100}


In [12]:
best_ram = RandomForestClassifier(criterion='gini', max_depth=19, min_samples_split=2, n_estimators=115)
best_ram.fit(x_train, y_train)
prediction_ram = best_ram.predict(x_test)

GRADIENT BOOSTING

In [26]:
# Param for GradientBoostClassifier
param_gradient = {
    'loss': ['log_loss'],
    'learning_rate': np.array([0.1]),
    'criterion': ['friedman_mse', 'squared_error'],
    'n_estimators': np.array([100, 120])
}

In [27]:
gradient = GradientBoostingClassifier()
grid_gradient = GridSearchCV(estimator=gradient, param_grid=param_gradient, cv=strat, n_jobs=-1)
grid_gradient.fit(x, y)

print(grid_gradient.best_score_)
print(grid_gradient.best_params_)

0.593963403801741
{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'log_loss', 'n_estimators': 120}


In [15]:
best_gradient = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.1, loss='log_loss', n_estimators=120)
best_gradient.fit(x_train, y_train)
prediction_gradient = best_gradient.predict(x_test)

DECISION TREE

In [45]:
# Param for DecisionTreeClassifier
param_tree = {
    'min_samples_split': np.array([2, 8]),
    'max_depth': np.array([28, 30]),
    'criterion': ["gini", "entropy", "log_loss"]
}

In [46]:
tree = DecisionTreeClassifier()
grid_tree = GridSearchCV(estimator=tree, param_grid=param_tree, cv=strat, n_jobs=-1)
grid_tree.fit(x, y)

print(grid_tree.best_score_)
print(grid_tree.best_params_)

0.6108957186001065
{'criterion': 'gini', 'max_depth': 28, 'min_samples_split': 2}


In [47]:
best_tree = DecisionTreeClassifier(criterion='gini', max_depth=28, min_samples_split=2)
best_tree.fit(x_train, y_train)
prediction_tree = best_tree.predict(x_test)

KNeighbors

In [53]:
# Param for KNeighborsClassifier
param_neigh = {
    'weights': ['distance'],
    'n_neighbors': np.array([8, 10, 12]),
    'metric': ['minkowski', 'chebyshev'],
    'p': np.array([1, 2, 3, 4])
}

In [54]:
neigh = KNeighborsClassifier()
grid_neigh = GridSearchCV(estimator=neigh, param_grid=param_neigh, cv=strat, n_jobs=-1)
grid_neigh.fit(x, y)

print(grid_neigh.best_score_)
print(grid_neigh.best_params_)

0.6107431752235447
{'metric': 'minkowski', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}


In [55]:
best_neigh = KNeighborsClassifier(metric='minkowski', n_neighbors=10, p=1, weights='distance')
best_neigh.fit(x_train, y_train)
prediction_neigh = best_neigh.predict(x_test)

GAUSSIANNB

In [56]:
gauss = GaussianNB()
gauss.fit(x_train, y_train)
prediction_gauss = gauss.predict(x_test)

RESULTS

In [64]:
scores = [
    {'Model': 'Random Forest', 'Accuracy': best_ram.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_ram), 'Recall': recall_score(y_test, prediction_ram), 'ROC_AUC': roc_auc_score(y_test, prediction_ram), 'f1_score': f1_score(y_test, prediction_ram)},
    {'Model': 'Gradient Boosting', 'Accuracy': best_gradient.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_gradient), 'Recall': recall_score(y_test, prediction_gradient), 'ROC_AUC': roc_auc_score(y_test, prediction_gradient), 'f1_score': f1_score(y_test, prediction_gradient)},
    {'Model': 'Decision Tree', 'Accuracy': best_tree.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_tree), 'Recall': recall_score(y_test, prediction_tree), 'ROC_AUC': roc_auc_score(y_test, prediction_tree), 'f1_score': f1_score(y_test, prediction_tree)},
    {'Model': 'KNeighbors', 'Accuracy': best_neigh.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_neigh), 'Recall': recall_score(y_test, prediction_neigh), 'ROC_AUC': roc_auc_score(y_test, prediction_neigh), 'f1_score': f1_score(y_test, prediction_neigh)},
    {'Model': 'GaussianNB', 'Accuracy': gauss.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_gauss), 'Recall': recall_score(y_test, prediction_gauss), 'ROC_AUC': roc_auc_score(y_test, prediction_gauss), 'f1_score': f1_score(y_test, prediction_gauss)}
]
results = pd.DataFrame(scores)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ValueError: multi_class must be in ('ovo', 'ovr')

In [None]:
display(results)