In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scatter matrix
from pandas.plotting import scatter_matrix

# MP2

## Ingesting the data

In [5]:
# The workbooks live in the "wine-data" folder. Forward slashes are used because they work on
# every operating system and avoid the backslash sequence "\w", which Python reports as an
# invalid escape sequence.
# header=1: the column names are taken from the second row of each sheet.
data1 = pd.read_excel("wine-data/winequality-red.xlsx", header=1)
data2 = pd.read_excel("wine-data/winequality-white.xlsx", header=1)

### Aggregating the data together into one data file - but first we add a column named type

In [6]:
# Tag every row with its wine colour so the origin stays known after merging
data1 = data1.assign(type="red")
data2 = data2.assign(type="white")

# Stack both tables; ignore_index=True renumbers the rows of the result from 0
wine = pd.concat(objs=[data1, data2], ignore_index=True)

In [None]:
# Random sample showing that red and white rows are both present in the merged table.
# random_state pins the draw so a fresh "Run All" shows the same ten rows.
wine.sample(10, random_state=42)

## Cleaning the data

In [None]:
# shape is a (rows, columns) tuple: the number of observations and the number of variables
wine.shape

In [None]:
# Number of missing (null) entries in every column
print(wine.isna().sum())

### Since there are no null values, we do not need to drop them

In [None]:
# Number of rows that repeat an earlier row; duplicated() compares all columns by default
wine.duplicated().sum()

### We therefore have 1177 duplicates

In [None]:
# Collect every row flagged as a repeat of an earlier row and print the first five
duplicates = wine.loc[wine.duplicated()]
print(duplicates.head(5))

In [None]:
# Remove the repeated rows and renumber the index from 0
wine = wine.drop_duplicates(ignore_index=True)

### Checking for damaged values

In [None]:
# Per-column count of values below zero; the text column "type" is left out of the comparison
wine.drop(columns=["type"]).lt(0).sum()

In [None]:
# Wine is acidic, so a pH of 7 (neutral) or higher would indicate a damaged value.
# The comparison is inclusive (>=) to match "7 or bigger", and summing the boolean mask
# gives the number of offending rows instead of a column-wise sum of their values.
(wine["pH"] >= 7).sum()

## The outliers for the "accepted" wine values

In [None]:
def check_min_max(list, expected_range):
    """Print, per column, how many values lie outside the expected range.

    Parameters
    ----------
    list : table indexed by column name (the callers in this notebook pass a DataFrame).
        Note that the parameter name shadows the built-in ``list`` inside this function.
    expected_range : dict mapping a column name to a ``(min, max)`` pair.

    A value is an outlier when it is strictly below ``min`` or strictly above ``max``.
    The resulting table is printed; nothing is returned.
    """
    rows = [
        {
            "column": name,
            "outliers": ((list[name] < low) | (list[name] > high)).sum(),
        }
        for name, (low, high) in expected_range.items()
    ]
    print(pd.DataFrame(rows))

In [None]:
# Expected (min, max) range per column for red wine
expected_ranges_red = {
    "fixed acidity": (4.6, 15.9),
    "volatile acidity": (0.12, 1.58),
    "citric acid": (0.0, 1.0),
    "residual sugar": (0.9, 15.5),
    "chlorides": (0.012, 0.611),
    "free sulfur dioxide": (1, 72),
    "total sulfur dioxide": (6, 289),
    "density": (0.99007, 1.00369),
    "pH": (2.74, 4.01),
    "sulphates": (0.33, 2.0),
    "alcohol": (8.4, 14.9),
    "quality": (3, 8)
}

# Only the red wines are checked against the red ranges.
# .copy() makes red_wine an independent table, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
red_wine = wine[wine["type"] == "red"].copy()

check_min_max(red_wine, expected_ranges_red)

In [None]:
# Expected (min, max) range per column for white wine
expected_ranges_white = {
    "fixed acidity": (3.8, 14.2),
    "volatile acidity": (0.08, 1.10),
    "citric acid": (0.0, 1.66),
    "residual sugar": (0.6, 65.8),
    "chlorides": (0.009, 0.346),
    "free sulfur dioxide": (2, 289),
    "total sulfur dioxide": (9, 440),
    "density": (0.98711, 1.03898),
    "pH": (2.72, 3.82),
    "sulphates": (0.22, 1.08),
    "alcohol": (8.0, 14.2),
    "quality": (3, 9)
}

# Only the white wines are checked against the white ranges.
# .copy() makes white_wine an independent table, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
white_wine = wine[wine["type"] == "white"].copy()

check_min_max(white_wine, expected_ranges_white)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the combined wine table
wine.describe()

In [None]:
# Summary statistics for red wine only; red_wine was created earlier as wine[wine["type"] == "red"]
red_wine.describe()

In [None]:
# Summary statistics for white wine only; white_wine was created earlier as wine[wine["type"] == "white"]
white_wine.describe()

In [None]:
# (rows, columns) of the red wine table after cleaning
red_wine.shape

In [None]:
# (rows, columns) of the white wine table after cleaning
white_wine.shape

## Encoding

In [None]:
# Pre label encoding: store "type" as a pandas categorical so integer category codes
# can be derived from it.
wine["type"] = wine["type"].astype("category")

# Names of all categorical columns
cat_data = wine.select_dtypes('category').columns

# The categorical columns only. A cell renders just its last expression, so this is the
# one inspection kept in this cell.
categorical_data = wine[cat_data]
categorical_data

In [None]:
# Label encoding: translate the text categories into integer codes

# Distinct categories found in the "type" column
cats = categorical_data["type"].unique()

# Text label next to its integer category code, previewed on ten random rows
categorical_data = wine[["type"]].assign(
    type_encoded=lambda frame: frame["type"].cat.codes
)
categorical_data.sample(10)

In [None]:
# Store the integer code next to the text label in the main table.
# White wine = 1 and red wine = 0
wine["type_encoded"] = wine["type"].cat.codes

# Numeric-only version of the table: the text column is removed
wine_transformed = wine.drop(columns="type")

print(wine_transformed)

### Normalization

In [None]:
# Make sure the "type_encoded" column is present in red_wine and white_wine.
# assign() builds a new table instead of writing a column into a slice of `wine`,
# which is what triggers pandas' SettingWithCopyWarning.
is_red = wine["type"] == "red"
is_white = wine["type"] == "white"
red_wine = red_wine.assign(type_encoded=wine.loc[is_red, "type_encoded"])
white_wine = white_wine.assign(type_encoded=wine.loc[is_white, "type_encoded"])

# Remove the text column "type" from red_wine and white_wine (drop() returns a new table)
red_wine_transformed = red_wine.drop(columns=["type"])
white_wine_transformed = white_wine.drop(columns=["type"])

# Normalization - scaling the data to a range of 0 to 1
scaler = MinMaxScaler()

# Fit the scaler on the complete data set and transform it
all_wine_norm = scaler.fit_transform(wine_transformed)

# Use transform() to reuse the same scale for red_wine and white_wine. The scale comes from
# all wines, so a column of a single type does not necessarily span the full 0..1 range.
red_wine_norm = scaler.transform(red_wine_transformed)
white_wine_norm = scaler.transform(white_wine_transformed)

# Convert back to DataFrames to keep the column names
all_wine_norm = pd.DataFrame(all_wine_norm, columns=wine_transformed.columns)
red_wine_norm = pd.DataFrame(red_wine_norm, columns=red_wine_transformed.columns)
white_wine_norm = pd.DataFrame(white_wine_norm, columns=white_wine_transformed.columns)

# Print the results
print(all_wine_norm.head())
print(red_wine_norm.head())
print(white_wine_norm.head())


In [None]:
# Overlaid histograms of quality for red, white and all wines
plt.figure(figsize=(12, 6))

# Quality is an integer score, so all three series share bin edges centred on the integers
# (x.5 to x.5). A plain bins=10 gives every series its own non-integer edges, which leaves
# empty gaps between bars and makes the overlaid series hard to compare.
quality_bins = np.arange(wine["quality"].min() - 0.5, wine["quality"].max() + 1.5, 1)

# Histogram for red wine
plt.hist(wine[wine["type"] == "red"]["quality"], bins=quality_bins, alpha=0.5, label="Red wine", color="red", edgecolor="black")

# Histogram for white wine
plt.hist(wine[wine["type"] == "white"]["quality"], bins=quality_bins, alpha=0.5, label="White wine", color="orange", edgecolor="black")

# Histogram for all wines
plt.hist(wine["quality"], bins=quality_bins, alpha=0.5, label="All wines", color="blue", edgecolor="black")

# Labels and title
plt.xlabel("Quality")
plt.ylabel("Count")
plt.title("Distribution of Wine Quality (Red, White, and All Wines)")
plt.legend()
plt.show()

## Exploration

### Correlation

In [None]:
# Names of all numeric columns, as a plain Python list
num_data = list(wine.select_dtypes(include="number").columns)
num_data

In [None]:
# Pairwise scatter plots of all numeric columns (histograms on the diagonal by default)
scatter_matrix(frame=wine[num_data], figsize=(30, 26))

In [None]:
# Pairwise Pearson correlation coefficients of the normalized data for all wines
post_norm_corr_all = all_wine_norm.corr(numeric_only=True, method="pearson")
# Show the coefficient table
post_norm_corr_all

In [None]:
# Annotated heatmap of the correlation table for all wines (normalized data)
fig, ax = plt.subplots(figsize=(12, 10))
sb.heatmap(post_norm_corr_all, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("All wines - post normaliztion", fontsize=14)
plt.show()

## The correlations for all wines seem too low everywhere, so it warrants checking red and white wine separately

In [None]:
# Pearson correlation coefficients for the red wines only.
# "type_encoded" has a single value within one wine type, so its correlations are undefined
# (NaN) and would only add an empty row and column to the table and heatmap; it is left out.
post_norm_corr_red = red_wine_norm.drop(columns="type_encoded").corr(method='pearson', numeric_only=True)
# Show the coefficient table
post_norm_corr_red

In [None]:
# Pearson correlation coefficients for the white wines only.
# "type_encoded" has a single value within one wine type, so its correlations are undefined
# (NaN) and would only add an empty row and column to the table and heatmap; it is left out.
post_norm_corr_white = white_wine_norm.drop(columns="type_encoded").corr(method='pearson', numeric_only=True)
# Show the coefficient table
post_norm_corr_white

In [None]:
# Annotated heatmap of the white wine correlation table
fig, ax = plt.subplots(figsize=(12, 10))
sb.heatmap(post_norm_corr_white, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("White wine - post normaliztion", fontsize=14)

plt.show()

## The correlations for white wine:
it is high for:  
    1: density and residual sugar 
    2: density and alcohol

In [None]:
# Annotated heatmap of the red wine correlation table
fig, ax = plt.subplots(figsize=(12, 10))
sb.heatmap(post_norm_corr_red, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Red wines - post normaliztion", fontsize=14)

plt.show()

## The correlations for red wine:
it is high for:
1. fixed acidity and pH
2. fixed acidity and density

low for :
1. pH and citric acid
2. alcohol and density


## Warrants further investigation for correlations

1. we want to see why residual sugar and alcohol don't correlate with density in the same way for red and white wine
2. we want to see why the acids don't have a higher correlation with pH

In [None]:
# Residual sugar histograms for red and white wines, side by side on shared axes
fig, axes = plt.subplots(1, 2, figsize=(12,5), sharey=True, sharex=True)

# (type code, panel title, bar colour) for each panel; 0 = red, 1 = white
panels = [
    (0, "Red Wine Residual Sugar", "red"),
    (1, "White Wine Residual Sugar", "orange"),
]
for ax, (type_code, title, colour) in zip(axes, panels):
    sugar = wine.loc[wine["type_encoded"] == type_code, "residual sugar"]
    ax.hist(sugar, bins=30, color=colour, edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel("Residual Sugar (g/dm³)")

# The y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("Count")

plt.suptitle("Residual Sugar Distribution by Wine Type", fontsize=14)
plt.show()



## Findings

We can see that there is a lot more residual sugar in white wines. Therefore it affects the density a lot more than in red wines: because the amount of sugar is much larger, it has a higher impact on the density. That is also why we see a lower correlation between sugar and density in red wines.

In [None]:
from scipy.stats import norm

# Min-max normalized alcohol values of the red wines
red_alcohol = red_wine_norm["alcohol"]

# Mean and standard deviation define the reference normal curve
mu, sigma = red_alcohol.mean(), red_alcohol.std()

# Histogram scaled to a probability density so it is comparable with the normal pdf
plt.figure(figsize=(8,5))
plt.hist(red_alcohol, bins=20, density=True,
         color="red", edgecolor="black", alpha=0.6)

# Bell curve over the visible x-range
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)
plt.plot(x, p, "k", linewidth=2, label="Normal distribution")

plt.title("Alcohol Distribution in Red Wine with Bell Curve")
# The plotted values are min-max scaled, not percentages
plt.xlabel("Alcohol (min-max normalized)")
plt.ylabel("Density")
plt.legend()
plt.show()



In [None]:
# Min-max normalized alcohol values of the white wines
white_alcohol = white_wine_norm["alcohol"]

# Mean and standard deviation define the reference normal curve
mu, sigma = white_alcohol.mean(), white_alcohol.std()

# Histogram scaled to a probability density so it is comparable with the normal pdf
plt.figure(figsize=(8,5))
plt.hist(white_alcohol, bins=20, density=True,
         color="yellow", edgecolor="black", alpha=0.6)

# Bell curve over the visible x-range
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)
plt.plot(x, p, "k", linewidth=2, label="Normal distribution")

plt.title("Alcohol Distribution in White Wine with Bell Curve")
# The plotted values are min-max scaled, not percentages
plt.xlabel("Alcohol (min-max normalized)")
plt.ylabel("Density")
plt.legend()
plt.show()


## Findings

We know that alcohol has a correlation with density, since alcohol is less dense than water, which is what most wines are primarily made of.

The alcohol in red wine doesn't have a very good normal distribution so we expect that any correlation between alcohol and density for red wines will not be very high.

We can see that white wine has a better normal distribution for its alcohol. Therefore we expect that white wine has a better correlation between alcohol and density, since its data is closer to a normal distribution.

In [None]:
# Min-max normalized pH values of the white wines
white_ph = white_wine_norm["pH"]

# Mean and standard deviation define the reference normal curve
mu, sigma = white_ph.mean(), white_ph.std()

# Histogram scaled to a probability density so it is comparable with the normal pdf
plt.figure(figsize=(8,5))
plt.hist(white_ph, bins=20, density=True,
         color="yellow", edgecolor="black", alpha=0.6)

# Bell curve over the visible x-range
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)
plt.plot(x, p, "k", linewidth=2, label="Normal distribution")

plt.title("pH Distribution in White Wine with Bell Curve")
# The plotted values are min-max scaled pH values
plt.xlabel("pH (min-max normalized)")
# density=True makes the y-axis a probability density, not a count
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Mode(s) and mean of the normalized white wine pH, used to judge the skew
print("Mode:", white_ph.mode())
print("Mean:", white_ph.mean())

## Findings in relation to the bell curve for pH value in white wine
There is a bit of positive skew, as shown in the figure above, which means that a larger portion of the dataset has a pH value lower than the mean. The skew can also be inferred by comparing the mean and the mode.

The mode for normalized pH value in white wine is 0.341, but the mean is 0.368, which means it is positively skewed.

In [None]:
# Min-max normalized pH values of the red wines
red_ph = red_wine_norm["pH"]

# Mean and standard deviation define the reference normal curve
mu, sigma = red_ph.mean(), red_ph.std()

# Histogram scaled to a probability density so it is comparable with the normal pdf
plt.figure(figsize=(8,5))
plt.hist(red_ph, bins=20, density=True,
         color="red", edgecolor="black", alpha=0.6)

# Bell curve over the visible x-range
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)
plt.plot(x, p, "k", linewidth=2, label="Normal distribution")

plt.title("pH Distribution in Red Wine with Bell Curve")
# The plotted values are min-max scaled pH values
plt.xlabel("pH (min-max normalized)")
# density=True makes the y-axis a probability density, not a count
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Mode(s) and mean of the normalized red wine pH, used to judge the skew
print("Mode:", red_ph.mode())
print("Mean:", red_ph.mean())

## Findings in relation to the bell curve for pH value in red wine
There is a very small amount of positive skew, as shown in the figure above, as a larger portion of the dataset has a pH value slightly lower than the mean.
The mode for the normalized pH value in red wine is 0.449, but the mean is 0.457, which means it is positively skewed.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True, sharex=True)

# One panel per wine type: (axis, data, panel title, marker colour)
panels = zip(axes, (red_wine, white_wine), ("Red wine", "White wine"), ("red", "orange"))
for ax, frame, title, colour in panels:
    ax.scatter(frame["fixed acidity"], frame["pH"], alpha=0.5, c=colour)
    ax.set_title(title)
    ax.set_xlabel("Fixed acidity")

# The y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("pH")

fig.suptitle("Scatterplots of pH vs Fixed acidity by wine type")
plt.tight_layout()
plt.show()

### Scatterplot for fixed acidity and pH value

The range for fixed acidity in the two different types of wine is:
    red: 4.6 - 15.9
    white: 3.8 - 14.2
Which means that there generally is more fixed acid in red wine than in white wine. This is despite the fact that the pH value generally is lower for white wine.

Typically, the more acidic something is, the lower its pH value, as a low pH value indicates a high acid level, but it also depends on other components in the wine. In our diagram above we can see that red wine typically has a higher level of fixed acidity, even though the pH value typically is lower for white wine. This is due to other components in the wine that interact with the acid.

We can also observe from the diagram that white wine is more clustered together than red wine.

#### Red wine
On the left side, we can see that as the amount of fixed acidity increases, the pH value decreases, which agrees with the heatmap and with how pH generally works.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True, sharex=True)

# One panel per wine type: (axis, data, panel title, marker colour)
panels = zip(axes, (red_wine, white_wine), ("Red wine", "White wine"), ("red", "orange"))
for ax, frame, title, colour in panels:
    ax.scatter(frame["citric acid"], frame["pH"], alpha=0.5, c=colour)
    ax.set_title(title)
    ax.set_xlabel("Citric acid")

# The y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("pH")

fig.suptitle("Scatterplots of pH vs citric acid by wine type")
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True, sharex=True)

# One panel per wine type: (axis, data, panel title, marker colour)
panels = zip(axes, (red_wine, white_wine), ("Red wine", "White wine"), ("red", "orange"))
for ax, frame, title, colour in panels:
    ax.scatter(frame["volatile acidity"], frame["pH"], alpha=0.5, c=colour)
    ax.set_title(title)
    ax.set_xlabel("Volatile acidity")

# The y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("pH")

fig.suptitle("Scatterplots of pH vs volatile acidity by wine type")
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True, sharex=True)

# One panel per wine type: (axis, data, panel title, marker colour)
panels = zip(axes, (red_wine, white_wine), ("Red wine", "White wine"), ("red", "orange"))
for ax, frame, title, colour in panels:
    ax.scatter(frame["fixed acidity"], frame["citric acid"], alpha=0.5, c=colour)
    ax.set_title(title)
    ax.set_xlabel("fixed acidity")

# The y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("citric acid")

fig.suptitle("Scatterplots of citric acid vs fixed acidity by wine type")
plt.tight_layout()
plt.show()

## Findings in regards to acids and pH values in wine
The pH value measures whether something is primarily acidic or basic, which is why one could assume that all acids influence the pH value. However, in the heatmaps we can see that volatile acidity has a very low correlation with the pH value, while fixed acidity is much more strongly correlated with the pH value, and citric acid is in between.

In regards to the two different types of wine, we can see a much lower correlation between the acids and the pH value for white wine than for red wine. We have decided to investigate it further as it interests us.

First it is important to distinguish between the different types of acid, as they are very different in abilities and quantities.

### Fixed acid 
Fixed acid is one of the main determinants for the pH value in wine, as it releases the H+ ion, which is what the pH value measures in terms of acidity. Fixed acid is the part of the wine that gives it the tartness and mouthfeel of the wine. So in regards to the heat map it makes sense that it correlates, but as there are other components to wine than just fixed acid, it also makes sense that it is not a complete match, as there are other things that influence the pH value. 

### Volatile acid
Volatile acids are acids that can vaporize and are a by-product of microbial metabolism. When fermentation happens in wine, the citric acid reacts with microbes and is turned into volatile acid. The amount that is generally created in wine is extremely low and therefore does not influence the pH value. If there is a high concentration of volatile acid, it can be an indicator of a spoiled wine and can make it taste like vinegar, as it is one of the main components of vinegar. Due to the low amounts of volatile acid generally found in wine per liter, it does not release enough ions to affect the pH value, so the low correlation in the heatmap does make sense.

### Citric acid
Citric acid is a type of fixed acid. It occurs naturally in many citrus fruits. It is used in winemaking to give a fruity tartness to the taste. As citric acid is a fixed acid, one can infer that the pH contribution of fixed acid includes the contribution of citric acid, and therefore we can conclude that the correlation should be, and is, bigger for fixed acid than for citric acid, as fixed acid is present in a larger quantity in the wine.

### Heatmap
In regards to the three types of acids we can therefore conclude that the correlation between volatile acid and pH value, should be lower than both citric acid and fixed acidity, as the quantity of volatile acid is too low to have an impact on the pH value.
Furthermore, the citric acid and pH value should have a lower correlation than the fixed acidity and pH value, as the citric acid is a part of the fixed acidity, but not all of it, and therefore there is a larger quantity of fixed acid to correlate with the pH value.

# Plot diagrams

In [None]:
# a. Boxplots comparing the distribution of quality, alcohol and residual sugar per wine type
fig, axes = plt.subplots(1, 3, figsize=(12,4))

# (column to plot, panel title) for each of the three panels
panels = [
    ("quality", "Quality by Wine Type"),
    ("alcohol", "Alcohol by Wine Type"),
    ("residual sugar", "Residual Sugar by Wine Type"),
]
for ax, (column, title) in zip(axes, panels):
    sb.boxplot(x="type", y=column, data=wine, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Which type of wine has the higher average quality, and how big is the difference?
# "type" is categorical; observed=True is passed explicitly so the result does not depend on
# the changing pandas default (and the FutureWarning about it is avoided).
mean_quality = wine.groupby("type", observed=True)["quality"].mean()
print("average quality:\n", mean_quality)
print("difference:", abs(mean_quality["white"] - mean_quality["red"]))

# white has on average higher quality

In [None]:
# Which type of wine has the higher average level of alcohol?
# "type" is categorical; observed=True is passed explicitly so the result does not depend on
# the changing pandas default (and the FutureWarning about it is avoided).
mean_alcohol = wine.groupby("type", observed=True)["alcohol"].mean()
print("Average alcohol:\n", mean_alcohol)
print("difference:", abs(mean_alcohol["white"] - mean_alcohol["red"]))

# white has on average higher level of alcohol

In [None]:
# Which type of wine has the higher average quantity of residual sugar?
# "type" is categorical; observed=True is passed explicitly so the result does not depend on
# the changing pandas default (and the FutureWarning about it is avoided).
mean_sugar = wine.groupby("type", observed=True)["residual sugar"].mean()
print("Average residual sugar:\n", mean_sugar)
print("difference:", abs(mean_sugar["white"] - mean_sugar["red"]))

# white has on average higher quantity of residual sugar

In [None]:
# Quality against alcohol (left) and against residual sugar (right), coloured by wine type
fig, (ax_alcohol, ax_sugar) = plt.subplots(1, 2, figsize=(10,4))

sb.scatterplot(x="alcohol", y="quality", hue="type", data=wine, alpha=0.5, ax=ax_alcohol)
ax_alcohol.set_title("Alcohol vs. Quality")

sb.scatterplot(x="residual sugar", y="quality", hue="type", data=wine, alpha=0.5, ax=ax_sugar)
ax_sugar.set_title("Residual Sugar vs. Quality")

plt.tight_layout()
plt.show()

In [None]:
# 2x2 correlation matrix of each feature against quality
for feature in ("alcohol", "residual sugar"):
    print(f"Korrelationskoefficient ({feature} vs. quality):")
    print(wine[[feature, "quality"]].corr())

In [None]:
#Binning wines in groups of 5 and 10 bins.

def find_highest_density(df, bins, label, column="pH"):
    """Bin one column into intervals and report the interval holding the most samples.

    Parameters
    ----------
    df : pandas.DataFrame containing ``column``.
    bins : passed straight to ``pandas.cut`` (an int requests that many equal-width bins).
    label : text placed in front of the printed summary line.
    column : name of the column to bin; defaults to ``"pH"``.

    Returns
    -------
    pandas.Series with the number of samples per interval, ordered by interval.
    """
    # Bin the chosen column
    binned = pd.cut(df[column], bins=bins)

    counts = binned.value_counts().sort_index()

    # Print the densest bin (idxmax picks the first interval holding the maximum count)
    max_bin = counts.idxmax()
    print(f"{label}: Number of bins = {bins} Highest density in {column} range of {max_bin} with {counts.max()} samples")
    return counts

# Report the densest pH bin for all, red and white wines, first with 5 and then with 10 bins.
# The calls run inside a loop so that no returned counts table ends up as stray cell output;
# only the printed summary lines are shown.
subsets = [
    ("All wines", wine),
    ("Red wines", wine[wine["type_encoded"]==0]),
    ("White wines", wine[wine["type_encoded"]==1]),
]

for n_bins, heading in [(5, "5 bins"), (10, "\n10 bins")]:
    print(heading)
    for label, subset in subsets:
        find_highest_density(subset, bins=n_bins, label=label)

In [None]:
# Visual graph for pH density on red, white and all wines combined.
plt.figure(figsize=(12,6))

# One shared set of bin edges over the full pH range: with a plain bins=20 every series gets
# its own edges and bin width, so the overlaid bar heights cannot be compared.
# The number of bins is the number of edges minus one; raise it for a more detailed graph.
ph_edges = np.linspace(wine["pH"].min(), wine["pH"].max(), 21)

plt.hist(wine[wine["type_encoded"]==0]["pH"], bins=ph_edges, alpha=0.5, label="Red wine", edgecolor="black", color="red")
plt.hist(wine[wine["type_encoded"]==1]["pH"], bins=ph_edges, alpha=0.5, label="White wine", edgecolor="black", color="orange")
plt.hist(wine["pH"], bins=ph_edges, alpha=0.5, label="All wines", edgecolor="black")

plt.xlabel("pH")
plt.ylabel("Count")
plt.title("pH Distribution of Red, White, and Combined Wine")
plt.legend()
plt.show()

In [None]:
# Separate pH histograms for red and white wine on a shared y-axis
fig, axes = plt.subplots(1, 2, figsize=(12,5), sharey=True)

# (type code, panel title, bar colour) for each panel; 0 = red, 1 = white
panels = [
    (0, "Red Wine pH Distribution", "red"),
    (1, "White Wine pH Distribution", "orange"),
]
for ax, (type_code, title, colour) in zip(axes, panels):
    ph_values = wine.loc[wine["type_encoded"] == type_code, "pH"]
    ax.hist(ph_values, bins=20, color=colour, edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel("pH")

# The y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("Count")

plt.suptitle("Individual Histograms by Wine Type", fontsize=14)
plt.show()

In [None]:
# Define bins and labels: three pH ranges with the edges 2.5, 3.1, 3.4 and 4.5
bins = [2.5, 3.1, 3.4, 4.5]  
labels = ["Low", "Medium", "High"]

# Assign every wine to its pH category
wine["pH_category"] = pd.cut(wine["pH"], bins=bins, labels=labels)

# Count wines per pH category and wine type.
# observed=False keeps all three categories as rows even if one of them is empty, so the rows
# always match the three tick labels set below, independent of the changing pandas default.
category_counts = (
    wine.groupby(["pH_category", "type_encoded"], observed=False)
    .size()
    .unstack(fill_value=0)
)
# Column order follows the type codes: 0 = red, 1 = white
category_counts.columns = ["Red wine", "White wine"]

# Tick labels that spell out the pH range of every category
range_labels = [
    f"Low ({bins[0]}–{bins[1]})",
    f"Medium ({bins[1]}–{bins[2]})",
    f"High ({bins[2]}–{bins[3]})"
]

ax = category_counts.plot(
    kind="bar", 
    stacked=True, 
    color=["red", "orange"], 
    edgecolor="black",
    figsize=(9,6)
)

ax.set_xlabel("pH Category")
ax.set_ylabel("Count")
ax.set_title("Wine Distribution by pH Category (Red vs White Wine)")
ax.set_xticklabels(range_labels, rotation=0)
plt.legend()
plt.show()