# Setup

In [1]:
# Loading the libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Global plot styling: every figure drawn below uses seaborn's "whitegrid" style
plot_style = "whitegrid"
sns.set(style=plot_style)

# Loading the data

In [None]:
# The CSV location differs from machine to machine. Set the WINE_DATA_PATH
# environment variable to point at your own copy of the file; when it is not
# set, the original author's local path below is used as the fallback.
DEFAULT_DATA_PATH = r'D:\Training\ITESM\MLOps_ITESM\wine_ml_pipeline\data\raw\wine_quality_df.csv'
path = os.environ.get('WINE_DATA_PATH', DEFAULT_DATA_PATH)

raw_wine_df = pd.read_csv(path)
raw_wine_df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame
raw_wine_df.info()

In [None]:
# First rows of the raw frame, transposed so each data-set column is shown as a row
raw_wine_df.head().transpose()

In [None]:
# Summary statistics, transposed so there is one row per data-set column
raw_wine_df.describe().transpose()

# Adding some visualizations

In [None]:
# One histogram per column of the raw frame, to eyeball each distribution
hist_options = {"bins": 15, "figsize": (15, 10)}
raw_wine_df.hist(**hist_options)
plt.show()


In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr_matrix = raw_wine_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
plt.show()


In [None]:
# Looking at the relationship between each feature and the wine quality.
# The features are selected by name (everything except 'quality') rather than
# by position, so the target is skipped wherever it sits in the frame.
feature_columns = raw_wine_df.columns.drop('quality')

for column in feature_columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x='quality', y=column, data=raw_wine_df, ax=ax)
    ax.set_title(f'Relationship between wine quality and {column}')
    plt.show()
    # One figure is created per feature; release it once it has been shown
    plt.close(fig)

# Preprocessing and Feature Engineering

In [15]:
# Standardise every feature column; 'quality' is the target and is carried
# over unscaled.
feature_frame = raw_wine_df.drop(columns='quality')

scaler = StandardScaler()
features_scaled = scaler.fit_transform(feature_frame)

# Take the column labels and the row index from the frame that was actually
# scaled: the names then match the data wherever 'quality' sits in the raw
# frame, and the target re-attaches to the right rows even when the raw index
# is not 0..n-1.
wine_df_scaled = pd.DataFrame(
    features_scaled,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
wine_df_scaled['quality'] = raw_wine_df['quality']

# NOTE(review): `scaler` is fit on every row of the data set, i.e. before any
# train/test split, so the statistics it learns include rows that may later be
# used for evaluation.

# Splitting the dataset

In [16]:
# Full feature matrix / target, as standardised on the whole data set.
X = wine_df_scaled.drop(columns='quality')
y = wine_df_scaled['quality']

# Hold-out split. The rows are split from the *unscaled* features and the
# scaler is then fit on the training rows only, so no statistic of the test
# rows leaks into the data the model is trained on. With the same number of
# rows and the same random_state, the rows selected are the same ones that
# splitting `X` would give.
raw_features = raw_wine_df.drop(columns='quality')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, random_state=42
)

train_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    train_scaler.transform(X_train_raw),
    columns=raw_features.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    train_scaler.transform(X_test_raw),
    columns=raw_features.columns,
    index=X_test_raw.index,
)

# Training the model

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised to 1000 (scikit-learn's default is 100) to give the
# solver more iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluating the model

In [20]:
# Predicted quality class for every row of the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Confusion Matrix
# Pin the class order to every quality value present in the data set. Without
# `labels=`, confusion_matrix only covers classes that occur in y_test or
# y_pred, so the matrix could end up smaller than the tick-label list and the
# ticks would then point at the wrong rows/columns.
class_labels = np.unique(y)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Per-class precision / recall / F1 / support on the test split, as a table.
# output_dict=True returns the report as a nested dict; transposing the
# resulting frame puts one class (or average) on each row.
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# Last expression of the cell, so the notebook renders it as a rich table
report_df

# Improving the model

In [None]:
# 5-fold cross-validation of the same estimator configuration as `model`.
# Scaling lives inside the pipeline and the unscaled features are passed in,
# so the scaler is re-fit on the training part of each fold and never sees
# that fold's validation rows. `clone(model)` gives an unfitted copy with the
# same hyper-parameters, leaving the already-fitted `model` untouched.
cv_pipeline = make_pipeline(StandardScaler(), clone(model))
cv_features = raw_wine_df.drop(columns='quality')

scores = cross_val_score(cv_pipeline, cv_features, y, cv=5)
print("Average accuracy with CV:", np.mean(scores))