In [1]:
import pandas as pd
import numpy as np

In [2]:
data = '/Users/leona/OneDrive/Documentos/GitHub/wine-classification/wine_prices/analisys/final_file.csv'

In [3]:
df = pd.read_csv(data)

In [4]:
x = df.drop(columns=['type'])
y = df['type']

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score, f1_score

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=15, stratify=y)

In [7]:
strat = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

LOGISTIC REGRESSION

In [8]:
# Param for Logistic Regression
param_logistic = {
    'C': np.array([1, 2, 3, 7, 10, 15, 20, 50]),
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [9]:
logistic = LogisticRegression(max_iter=2000, tol=0.01, )
grid_logistic = GridSearchCV(estimator=logistic, param_grid=param_logistic, cv=strat, n_jobs=-1)
grid_logistic.fit(x, y)

print(grid_logistic.best_score_)
print(grid_logistic.best_params_)

0.9938427192514953
{'C': 1, 'penalty': None, 'solver': 'newton-cholesky'}


440 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\leona\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\leona\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\leona\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(s

In [10]:
best_logistic = LogisticRegression(C=1, penalty=None, solver='newton-cholesky')
best_logistic.fit(x_train, y_train)
prediction_logistic = best_logistic.predict(x_test)

RANDOM FOREST

In [11]:
# Param for RandomForestClassifier
param_ram = {
    'n_estimators': np.array([100, 150]),
    'criterion': ["gini", "entropy"],
    'max_depth': np.array([4, 6, 8]),
    'min_samples_split': np.array([2, 8])
}

In [12]:
ram = RandomForestClassifier()
grid_ram = GridSearchCV(estimator=ram, param_grid=param_ram, cv=strat, n_jobs=-1)
grid_ram.fit(x, y)

print(grid_ram.best_score_)
print(grid_ram.best_params_)

0.9944588144726714
{'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 100}


In [13]:
best_ram = RandomForestClassifier(criterion='entropy', max_depth=8, min_samples_split=2, n_estimators=100)
best_ram.fit(x_train, y_train)
prediction_ram = best_ram.predict(x_test)

GRADIENT BOOSTING

In [14]:
# Param for GradientBoostClassifier
param_gradient = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': np.array([0.01, 0.1]),
    'criterion': ['friedman_mse', 'squared_error'],
    'n_estimators': np.array([100, 120])
}

In [15]:
gradient = GradientBoostingClassifier()
grid_gradient = GridSearchCV(estimator=gradient, param_grid=param_gradient, cv=strat, n_jobs=-1)
grid_gradient.fit(x, y)

print(grid_gradient.best_score_)
print(grid_gradient.best_params_)

0.9949203529342098
{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'log_loss', 'n_estimators': 120}


In [16]:
best_gradient = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.1, loss='log_loss', n_estimators=120)
best_gradient.fit(x_train, y_train)
prediction_gradient = best_gradient.predict(x_test)

DECISION TREE

In [17]:
# Param for DecisionTreeClassifier
param_tree = {
    'min_samples_split': np.array([2, 8, 12]),
    'max_depth': np.array([4, 6, 8, 12]),
    'criterion': ["gini", "entropy", "log_loss"]
}

In [18]:
tree = DecisionTreeClassifier()
grid_tree = GridSearchCV(estimator=tree, param_grid=param_tree, cv=strat, n_jobs=-1)
grid_tree.fit(x, y)

print(grid_tree.best_score_)
print(grid_tree.best_params_)

0.9892254396873333
{'criterion': 'log_loss', 'max_depth': 8, 'min_samples_split': 2}


In [19]:
best_tree = DecisionTreeClassifier(criterion='log_loss', max_depth=12, min_samples_split=2)
best_tree.fit(x_train, y_train)
prediction_tree = best_tree.predict(x_test)

KNeighbors

In [20]:
# Param for KNeighborsClassifier
param_neigh = {
    'weights': ['distance'],
    'n_neighbors': np.array([2, 3, 4, 5, 6, 7, 8]),
    'metric': ['minkowski', 'chebyshev'],
    'p': np.array([1, 2, 3, 4])
}

In [21]:
neigh = KNeighborsClassifier()
grid_neigh = GridSearchCV(estimator=neigh, param_grid=param_neigh, cv=strat, n_jobs=-1)
grid_neigh.fit(x, y)

print(grid_neigh.best_score_)
print(grid_neigh.best_params_)

0.9649063776869781
{'metric': 'minkowski', 'n_neighbors': 8, 'p': 1, 'weights': 'distance'}


In [22]:
best_neigh = KNeighborsClassifier(metric='minkowski', n_neighbors=6, p=1, weights='distance')
best_neigh.fit(x_train, y_train)
prediction_neigh = best_neigh.predict(x_test)

GAUSSIANNB

In [23]:
gauss = GaussianNB()
gauss.fit(x_train, y_train)
prediction_gauss = gauss.predict(x_test)

RESULTS

In [24]:
scores = [
    {'Model': 'Logistic Regression', 'Accuracy': best_logistic.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_logistic), 'Recall': recall_score(y_test, prediction_logistic), 'ROC_AUC': roc_auc_score(y_test, prediction_logistic), 'f1_score': f1_score(y_test, prediction_logistic)},
    {'Model': 'Random Forest', 'Accuracy': best_ram.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_ram), 'Recall': recall_score(y_test, prediction_ram), 'ROC_AUC': roc_auc_score(y_test, prediction_ram), 'f1_score': f1_score(y_test, prediction_ram)},
    {'Model': 'Gradient Boosting', 'Accuracy': best_gradient.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_gradient), 'Recall': recall_score(y_test, prediction_gradient), 'ROC_AUC': roc_auc_score(y_test, prediction_gradient), 'f1_score': f1_score(y_test, prediction_gradient)},
    {'Model': 'Decision Tree', 'Accuracy': best_tree.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_tree), 'Recall': recall_score(y_test, prediction_tree), 'ROC_AUC': roc_auc_score(y_test, prediction_tree), 'f1_score': f1_score(y_test, prediction_tree)},
    {'Model': 'KNeighbors', 'Accuracy': best_neigh.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_neigh), 'Recall': recall_score(y_test, prediction_neigh), 'ROC_AUC': roc_auc_score(y_test, prediction_neigh), 'f1_score': f1_score(y_test, prediction_neigh)},
    {'Model': 'GaussianNB', 'Accuracy': gauss.score(x_test, y_test), 'Precision': precision_score(y_test, prediction_gauss), 'Recall': recall_score(y_test, prediction_gauss), 'ROC_AUC': roc_auc_score(y_test, prediction_gauss), 'f1_score': f1_score(y_test, prediction_gauss)}
]
results = pd.DataFrame(scores)

In [25]:
display(results)

Unnamed: 0,Model,Accuracy,Precision,Recall,ROC_AUC,f1_score
0,Logistic Regression,0.993846,0.994573,0.997279,0.990306,0.995924
1,Random Forest,0.995385,0.994584,0.99932,0.991327,0.996946
2,Gradient Boosting,0.992821,0.995238,0.995238,0.990327,0.995238
3,Decision Tree,0.98359,0.989782,0.988435,0.978593,0.989108
4,KNeighbors,0.953846,0.955746,0.984354,0.922385,0.969839
5,GaussianNB,0.968718,0.984859,0.973469,0.963818,0.979131
