# Setup

In [None]:
# Loading the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Apply seaborn's "whitegrid" style globally so every plot below shares the same look
sns.set(style="whitegrid")

# Loading the data

In [None]:
# Location of the raw CSV, relative to this notebook.
# Adjust it to match your own directory layout before running.
path = '../../data/raw/wine_quality_df.csv'
raw_wine_df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows as a quick sanity check of the load
raw_wine_df.head(n=5)

In [None]:
# Print a summary of the raw frame: column dtypes, non-null counts and memory usage
raw_wine_df.info()

In [None]:
# Transposed preview: one row per column, so wide frames stay readable
raw_wine_df.head().transpose()

In [None]:
# Descriptive statistics, transposed so each column's summary reads as one row
raw_wine_df.describe().transpose()

# Adding some visualizations

In [None]:
# One histogram per column to inspect how each variable is distributed
raw_wine_df.hist(figsize=(15, 10), bins=15)
plt.show()


In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr_matrix = raw_wine_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
plt.show()


In [None]:
# Looking at the relationship between each feature and the wine quality,
# one box plot per feature.
# Features are selected by name (everything except 'quality') rather than by
# position, so the loop stays correct even if 'quality' is not the last column.
feature_columns = raw_wine_df.columns.drop('quality')
for column in feature_columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x='quality', y=column, data=raw_wine_df, ax=ax)
    ax.set_title(f'Relationship between wine quality and {column}')
    plt.show()
    # Release the figure explicitly so figures do not pile up in memory across iterations
    plt.close(fig)

# Preprocessing and Feature Engineering

In [None]:
# Standardize every feature column; the target column 'quality' is kept unscaled.
# StandardScaler is already imported in the notebook's import cell, so it is not re-imported here.
# NOTE(review): the scaler is fitted on the full dataset. If a train/test split follows,
# fit it on the training portion only to avoid leaking test-set statistics — confirm against later cells.
feature_df = raw_wine_df.drop(columns='quality')

scaler = StandardScaler()
features_scaled = scaler.fit_transform(feature_df)

# Reuse the exact columns and index of the frame that was scaled: the labels then always
# match the data regardless of where 'quality' sits, and the index-aligned assignment of
# 'quality' below cannot introduce NaNs when the raw frame has a non-default index.
wine_df_scaled = pd.DataFrame(
    features_scaled,
    columns=feature_df.columns,
    index=feature_df.index,
)
wine_df_scaled['quality'] = raw_wine_df['quality']