In [None]:
import pandas as pd
import matplotlib.pyplot as plt #Scatterplot
import seaborn as sb
import numpy as np

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scatter matrix
from pandas.plotting import scatter_matrix

# MP2

## Ingesting the data

In [None]:
# The workbooks live in the "wine-data" folder. Forward slashes are used in the
# paths: they work on every operating system, whereas a backslash inside a normal
# string literal starts an escape sequence (here the invalid "\w").
# header=1 takes the column names from the second row of each sheet.
data1 = pd.read_excel("wine-data/winequality-red.xlsx", header=1)
data2 = pd.read_excel("wine-data/winequality-white.xlsx", header=1)

### Aggregating the data together into one data file - but first we add a column named type

In [None]:
# Tag every row with the sheet it came from before the two frames are stacked.
# The column is written onto data1/data2 themselves, so they keep it afterwards.
for frame, colour in ((data1, "red"), (data2, "white")):
    frame["type"] = colour

# Stack red on top of white; ignore_index=True renumbers the rows 0..n-1.
wine = pd.concat([data1, data2], ignore_index=True)

In [None]:
# Show 10 randomly chosen rows to check that both wine types are present in the
# combined frame. No random_state is set, so the rows differ on every run.
wine.sample(10)

## Cleaning the data

In [None]:
# shape is (number of observations, number of variables) for the combined dataset.
wine.shape

In [None]:
# Count the missing (null/NaN) entries in every column.
missing_per_column = wine.isna().sum()
print(missing_per_column)

### Since there are no null values, we do not need to drop them

In [None]:
# Number of duplicated rows. duplicated() compares all columns and flags every
# repeat of a row except its first occurrence.
wine.duplicated().sum()

### We therefore have 1177 duplicates

In [None]:
# Keep the rows flagged as duplicates in their own frame and print the first five.
is_duplicate = wine.duplicated()
duplicates = wine.loc[is_duplicate]
print(duplicates.head(5))

In [None]:
# Remove the duplicated rows (the first occurrence of each is kept) and renumber
# the index so it runs 0..n-1 again.
wine = wine.drop_duplicates(ignore_index=True)

### Checking for damaged values

In [None]:
# Negative values: leave out the text column "type" and count, per remaining
# column, how many entries are below zero.
measurements = wine.drop(columns="type")
measurements.lt(0).sum()

In [None]:
# Wine is acidic and pH 7 is neutral, so a pH of 7 or higher points to a damaged value.
# Summing the boolean mask gives the NUMBER of such rows (0 means none were found).
(wine["pH"] >= 7).sum()

## The outliers for the "accepted" wine values

In [None]:
def check_min_max(list, expected_range):
    """Print how many values of each column fall outside its expected range.

    Parameters
    ----------
    list : DataFrame-like
        Table that is indexed by column name (in this notebook a pandas
        DataFrame). The name shadows the built-in ``list``; it is kept so
        existing calls keep working.
    expected_range : dict
        Maps a column name to a ``(min_val, max_val)`` pair. A value counts as
        an outlier only when it is strictly below ``min_val`` or strictly above
        ``max_val``.

    Returns
    -------
    None
        The result table (columns ``column`` and ``outliers``) is printed.
    """
    rows = [
        {
            "column": name,
            "outliers": (list[name].lt(low) | list[name].gt(high)).sum(),
        }
        for name, (low, high) in expected_range.items()
    ]
    print(pd.DataFrame(rows))

In [None]:
# Expected (min, max) range per variable for red wine.
expected_ranges_red = {
    "fixed acidity": (4.6, 15.9),
    "volatile acidity": (0.12, 1.58),
    "citric acid": (0.0, 1.0),
    "residual sugar": (0.9, 15.5),
    "chlorides": (0.012, 0.611),
    "free sulfur dioxide": (1, 72),
    "total sulfur dioxide": (6, 289),
    "density": (0.99007, 1.00369),
    "pH": (2.74, 4.01),
    "sulphates": (0.33, 2.0),
    "alcohol": (8.4, 14.9),
    "quality": (3, 8)
}

# Only the red wines are checked here, so filter on the wine type.
# .copy() makes red_wine an independent frame: columns can later be added to it
# without pandas warning about writing into a slice of `wine`.
red_wine = wine[wine["type"] == "red"].copy()

check_min_max(red_wine, expected_ranges_red)

In [None]:
# Expected (min, max) range per variable for white wine.
expected_ranges_white = {
    "fixed acidity": (3.8, 14.2),
    "volatile acidity": (0.08, 1.10),
    "citric acid": (0.0, 1.66),
    "residual sugar": (0.6, 65.8),
    "chlorides": (0.009, 0.346),
    "free sulfur dioxide": (2, 289),
    "total sulfur dioxide": (9, 440),
    "density": (0.98711, 1.03898),
    "pH": (2.72, 3.82),
    "sulphates": (0.22, 1.08),
    "alcohol": (8.0, 14.2),
    "quality": (3, 9)
}

# Only the white wines are checked here, so filter on the wine type.
# .copy() makes white_wine an independent frame: columns can later be added to it
# without pandas warning about writing into a slice of `wine`.
white_wine = wine[wine["type"] == "white"].copy()

check_min_max(white_wine, expected_ranges_white)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the combined wine frame.
wine.describe()

In [None]:
# Summary statistics for red wine only. red_wine was created earlier as
# wine[wine["type"] == "red"].
red_wine.describe()

In [None]:
# Summary statistics for white wine only. white_wine was created earlier as
# wine[wine["type"] == "white"].
white_wine.describe()

In [None]:
# (rows, columns) of the red wine subset after cleaning.
red_wine.shape

In [None]:
# (rows, columns) of the white wine subset after cleaning.
white_wine.shape

## Encoding

In [None]:
# Pre label encoding - categorical encoding.
# Only the last expression of a cell is displayed automatically, so the
# intermediate dtype listings are printed explicitly.
print(wine.dtypes)  # dtypes before the conversion

# Store the wine type as a pandas categorical instead of plain text.
wine["type"] = wine["type"].astype("category")

print(wine.dtypes)  # dtypes after the conversion

# Names of all categorical columns, and the part of the frame that holds them.
cat_data = wine.select_dtypes('category').columns
categorical_data = wine[cat_data]

# Show a random sample of the categorical columns only.
categorical_data.sample(10)

In [None]:
# Label encoding: convert the text values into numerical codes.

# The distinct wine types found in the categorical column.
cats = categorical_data["type"].unique()

# Two-column frame with the original type next to its integer category code.
categorical_data = wine[["type"]].assign(
    type_encoded=lambda frame: frame["type"].cat.codes
)

In [None]:
# Add the encoded type to the main frame, then build a purely numeric version
# without the text column.
# astype("category") on text sorts the categories, so red wine = 0 and white wine = 1.
type_codes = wine["type"].cat.codes
wine["type_encoded"] = type_codes

wine_transformed = wine.drop(columns=["type"])

### Normalization

In [None]:
# Make sure the "type_encoded" column is present in red_wine and white_wine.
# assign() returns new frames (values are matched on the row index), so nothing is
# written into a slice of `wine` and pandas raises no SettingWithCopyWarning.
red_wine = red_wine.assign(
    type_encoded=wine.loc[wine["type"] == "red", "type_encoded"]
)
white_wine = white_wine.assign(
    type_encoded=wine.loc[wine["type"] == "white", "type_encoded"]
)

# Remove the text column "type" so only numeric columns are passed to the scaler.
red_wine_transformed = red_wine.drop(columns=["type"])
white_wine_transformed = white_wine.drop(columns=["type"])

# Normalization - scale the data to the range 0 to 1.
scaler = MinMaxScaler()

# Fit on the whole dataset and transform it.
all_wine_norm = scaler.fit_transform(wine_transformed)

# Use transform() only, so red and white wine are scaled with the min/max
# learned from the whole dataset and stay comparable to each other.
red_wine_norm = scaler.transform(red_wine_transformed)
white_wine_norm = scaler.transform(white_wine_transformed)

# The scaler returns plain arrays; convert back to DataFrames to keep the column
# names (the row index restarts at 0 in each frame).
all_wine_norm = pd.DataFrame(all_wine_norm, columns=wine_transformed.columns)
red_wine_norm = pd.DataFrame(red_wine_norm, columns=red_wine_transformed.columns)
white_wine_norm = pd.DataFrame(white_wine_norm, columns=white_wine_transformed.columns)

# Print the first rows of each normalized frame.
print(all_wine_norm.head())
print(red_wine_norm.head())
print(white_wine_norm.head())

In [None]:
# Histogram of quality for red, white and all wines - is it roughly bell-shaped?
plt.figure(figsize=(12, 6))

# One bin per quality score, centred on the score, and the SAME edges for all three
# histograms. With a bare bins=10 every call derives its own edges from its own
# data range, so the overlaid bars would not line up.
# Assumes quality holds whole-number scores.
quality_bins = np.arange(wine["quality"].min() - 0.5, wine["quality"].max() + 1.5, 1)

# Histogram for red wine
plt.hist(wine.loc[wine["type"] == "red", "quality"], bins=quality_bins, alpha=0.5, label="Red wine", color="red", edgecolor="black")

# Histogram for white wine
plt.hist(wine.loc[wine["type"] == "white", "quality"], bins=quality_bins, alpha=0.5, label="White wine", color="orange", edgecolor="black")

# Histogram for all wines
plt.hist(wine["quality"], bins=quality_bins, alpha=0.5, label="All wines", color="blue", edgecolor="black")

# Labels, title and legend
plt.xlabel("Quality")
plt.ylabel("Count")
plt.title("Distribution of Wine Quality (Red, White, and All Wines)")
plt.legend()
plt.show()

## Exploration

### Correlation

In [None]:
# Pearson correlation between all numeric columns of the un-normalized data;
# non-numeric columns are skipped.
pre_norm_corr_all = wine.corr(numeric_only=True, method="pearson")
# Display the correlation coefficients as a table.
pre_norm_corr_all

In [None]:
# Pearson correlation for the normalized red wines.
# "type_encoded" has a single value within one wine type, so its correlation with
# every other column is undefined (NaN); it is left out to keep the table and the
# heatmap built from it free of an empty row and column.
post_norm_corr_red = red_wine_norm.drop(columns="type_encoded").corr(method='pearson', numeric_only=True)
# Display the correlation coefficients as a table.
post_norm_corr_red

In [None]:
# Pearson correlation for the normalized white wines.
# "type_encoded" has a single value within one wine type, so its correlation with
# every other column is undefined (NaN); it is left out to keep the table and the
# heatmap built from it free of an empty row and column.
post_norm_corr_white = white_wine_norm.drop(columns="type_encoded").corr(method='pearson', numeric_only=True)
# Display the correlation coefficients as a table.
post_norm_corr_white

In [None]:
# Pearson correlation between all columns of the normalized data (red and white together).
post_norm_corr_all = all_wine_norm.corr(numeric_only=True, method="pearson")
# Display the correlation coefficients as a table.
post_norm_corr_all

In [None]:
# Heatmap of the correlation matrix of the normalized data, all wines together.
# annot=True writes the coefficient into every cell.
fig, ax = plt.subplots(figsize=(12, 10))
sb.heatmap(post_norm_corr_all, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("All wines - post normaliztion", fontsize=14)
plt.show()

## Correlations for all wines seem too low everywhere, so it warrants checking red and white wine separately

In [None]:
# Heatmap of the correlation matrix of the normalized white wines.
# annot=True writes the coefficient into every cell.
fig, ax = plt.subplots(figsize=(12, 10))
sb.heatmap(post_norm_corr_white, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("White wine - post normaliztion", fontsize=14)

plt.show()

## The correlations for white wine:
it is high for:  
    1: density and residual sugar 
    2: density and alcohol

In [None]:
# Heatmap of the correlation matrix of the normalized red wines.
# annot=True writes the coefficient into every cell.
fig, ax = plt.subplots(figsize=(12, 10))
sb.heatmap(post_norm_corr_red, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Red wines - post normaliztion", fontsize=14)

plt.show()

## The correlations for red wine:
it is high for:
1. fixed acidity and pH
2. fixed acidity and density

low for :
1. pH and citric acid
2. alcohol and density


## Warrants further investigation of the correlations

1. We want to see why residual sugar and alcohol do not correlate with density in the same way for red and white wine.
2. We want to see why the acids do not have a higher correlation with pH.

In [None]:
# --- Filter the data: keep only wines with a fixed acidity below 14 ---
red_wine_filtered = red_wine[red_wine["fixed acidity"] < 14]
white_wine_filtered = white_wine[white_wine["fixed acidity"] < 14]

# --- Common scale: the overall min/max across both wine types ---
xmin = min(red_wine_filtered["pH"].min(), white_wine_filtered["pH"].min())
xmax = max(red_wine_filtered["pH"].max(), white_wine_filtered["pH"].max())
ymin = min(red_wine_filtered["fixed acidity"].min(), white_wine_filtered["fixed acidity"].min())
ymax = max(red_wine_filtered["fixed acidity"].max(), white_wine_filtered["fixed acidity"].max())

# --- Figure with two side-by-side subplots sharing both axes ---
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharex=True, sharey=True)

# --- One scatterplot per wine type: (axes, data, title, colour) ---
panels = (
    (axes[0], red_wine_filtered, "Red Wine", "red"),
    (axes[1], white_wine_filtered, "White Wine", "orange"),
)
for ax, frame, panel_title, colour in panels:
    ax.scatter(frame["pH"], frame["fixed acidity"], alpha=0.5, color=colour)
    ax.set_title(panel_title)
    ax.set_xlabel("pH")
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)

# The y axis is shared, so only the left panel gets a y label.
axes[0].set_ylabel("Fixed acidity")

# --- Adjust the layout ---
fig.suptitle("Scatterplot of pH vs Fixed Acidity (Same Scale)", fontsize=16)
plt.tight_layout()
plt.show()

## Findings? Explore more before concluding
The correlation between fixed acidity and pH seems to differ between red and white wine. This could partly be because we have about 3 times as many data points for white wine.

In [None]:
# a. Boxplots comparing the distribution of quality, alcohol and residual sugar
#    between the wine types: one panel per variable.
plt.figure(figsize=(12,4))
boxplot_panels = (
    ("quality", "Quality by Wine Type"),
    ("alcohol", "Alcohol by Wine Type"),
    ("residual sugar", "Residual Sugar by Wine Type"),
)
for position, (column, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 3, position)
    sb.boxplot(x="type", y=column, data=wine)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Which type of wine has the higher average quality, and how big is the difference?
# "type" is a categorical column at this point; observed=True is passed explicitly
# so only wine types that actually occur are grouped and pandas does not emit its
# FutureWarning about the changing default.
mean_quality = wine.groupby("type", observed=True)["quality"].mean()
print("average quality:\n", mean_quality)
print("difference:", abs(mean_quality["white"] - mean_quality["red"]))

# white has on average higher quality

In [None]:
# Which type of wine has the higher average level of alcohol?
# "type" is a categorical column at this point; observed=True is passed explicitly
# so only wine types that actually occur are grouped and pandas does not emit its
# FutureWarning about the changing default.
mean_alcohol = wine.groupby("type", observed=True)["alcohol"].mean()
print("Average alcohol:\n", mean_alcohol)
print("difference:", abs(mean_alcohol["white"] - mean_alcohol["red"]))

# white has on average higher level of alcohol

In [None]:
# Which type of wine has the higher average quantity of residual sugar?
# "type" is a categorical column at this point; observed=True is passed explicitly
# so only wine types that actually occur are grouped and pandas does not emit its
# FutureWarning about the changing default.
mean_sugar = wine.groupby("type", observed=True)["residual sugar"].mean()
print("Average residual sugar:\n", mean_sugar)
print("difference:", abs(mean_sugar["white"] - mean_sugar["red"]))

# white has on average higher quantity of residual sugar

In [None]:
# Quality plotted against alcohol and against residual sugar, coloured by wine type.
plt.figure(figsize=(10,4))
comparisons = (
    ("alcohol", "Alcohol vs. Quality"),
    ("residual sugar", "Residual Sugar vs. Quality"),
)
for slot, (feature, panel_title) in enumerate(comparisons, start=1):
    plt.subplot(1, 2, slot)
    sb.scatterplot(x=feature, y="quality", hue="type", data=wine, alpha=0.5)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Print the 2x2 correlation matrix of quality with alcohol and with residual sugar.
for feature in ("alcohol", "residual sugar"):
    print(f"Korrelationskoefficient ({feature} vs. quality):")
    print(wine[[feature, "quality"]].corr())

In [None]:
#Binning wines in groups of 5 and 10 bins.

def find_highest_density(df, bins, label):
    """Bin the ``pH`` column and report the bin that holds the most samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``pH`` column.
    bins : int or sequence
        Passed straight to ``pandas.cut`` (a number of equal-width bins or
        explicit bin edges).
    label : str
        Text placed at the start of the printed line.

    Returns
    -------
    pandas.Series
        Number of samples per pH bin, ordered by bin.
    """
    # Assign every pH value to its interval, then count the values per interval.
    ph_intervals = pd.cut(df["pH"], bins=bins)
    counts = ph_intervals.value_counts().sort_index()

    # The densest interval and the number of samples it contains.
    densest_interval = counts.idxmax()
    peak_count = counts.max()
    print(f"{label}: Number of bins = {bins} Highest density in pH range of {densest_interval} with {peak_count} samples")
    return counts

# Densest pH bin with 5 equal-width bins, for all wines and per wine type
# (type_encoded 0 = red, 1 = white).
print("5 bins")
find_highest_density(wine, bins=5, label="All wines")
find_highest_density(wine[wine["type_encoded"]==0], bins=5, label="Red wines")
find_highest_density(wine[wine["type_encoded"]==1], bins=5, label="White wines")

# The same comparison with 10 bins. The last call is the final expression of the
# cell, so the notebook also displays the Series it returns (white wine counts).
print("\n10 bins")
find_highest_density(wine, bins=10, label="All wines")
find_highest_density(wine[wine["type_encoded"]==0], bins=10, label="Red wines")
find_highest_density(wine[wine["type_encoded"]==1], bins=10, label="White wines")

In [None]:
# Visual graph of the pH distribution for red, white and all wines combined.
plt.figure(figsize=(12,6))

# Change n_bins: the higher it is, the more detailed the graph gets.
n_bins = 20
# Compute the bin edges once, from all wines, and reuse them for every histogram.
# Passing a bare bin count to each call would give each histogram its own edges
# (based on its own min/max), so the overlaid bars would not be comparable.
ph_edges = np.histogram_bin_edges(wine["pH"], bins=n_bins)

plt.hist(wine.loc[wine["type_encoded"]==0, "pH"], bins=ph_edges, alpha=0.5, label="Red wine", edgecolor="black", color="red")
plt.hist(wine.loc[wine["type_encoded"]==1, "pH"], bins=ph_edges, alpha=0.5, label="White wine", edgecolor="black", color="orange")
plt.hist(wine["pH"], bins=ph_edges, alpha=0.5, label="All wines", edgecolor="black")

plt.xlabel("pH")
plt.ylabel("Count")
plt.title("pH Distribution of Red, White, and Combined Wine")
plt.legend()
plt.show()

In [None]:
# Individual pH histograms for red and white wine, side by side on a shared y axis.
fig, axes = plt.subplots(1, 2, figsize=(12,5), sharey=True)

# (type_encoded value, bar colour, panel title); the encoded value (0 = red,
# 1 = white) doubles as the position of the panel.
ph_panels = (
    (0, "red", "Red Wine pH Distribution"),
    (1, "orange", "White Wine pH Distribution"),
)
for code, colour, panel_title in ph_panels:
    axes[code].hist(wine.loc[wine["type_encoded"] == code, "pH"], bins=20, color=colour, edgecolor="black")
    axes[code].set_title(panel_title)
    axes[code].set_xlabel("pH")

# The y axis is shared, so only the left panel gets a y label.
axes[0].set_ylabel("Count")

plt.suptitle("Individual Histograms by Wine Type", fontsize=14)
plt.show()

In [None]:
# Define the pH bin edges and the label of each bin.
# pd.cut makes right-closed intervals: (2.5, 3.1], (3.1, 3.4], (3.4, 4.5].
# A pH outside 2.5-4.5 gets no category (NaN) and is left out of the counts.
bins = [2.5, 3.1, 3.4, 4.5]  
labels = ["Low", "Medium", "High"]

wine["pH_category"] = pd.cut(wine["pH"], bins=bins, labels=labels)

# Count the wines per pH category and wine type; one column per type_encoded value.
# observed=False keeps a pH category even when no wine falls into it, so there are
# always three bars to match the three tick labels set below. It is passed
# explicitly because the pandas default for categorical groupers is changing.
category_counts = (
    wine.groupby(["pH_category", "type_encoded"], observed=False)
    .size()
    .unstack(fill_value=0)
)
# Columns come out ordered by type_encoded: 0 = red, 1 = white.
category_counts.columns = ["Red wine", "White wine"]

# Tick labels that show the pH range of each category.
range_labels = [
    f"Low ({bins[0]}–{bins[1]})",
    f"Medium ({bins[1]}–{bins[2]})",
    f"High ({bins[2]}–{bins[3]})"
]

ax = category_counts.plot(
    kind="bar", 
    stacked=True, 
    color=["red", "orange"], 
    edgecolor="black",
    figsize=(9,6)
)

ax.set_xlabel("pH Category")
ax.set_ylabel("Count")
ax.set_title("Wine Distribution by pH Category (Red vs White Wine)")
ax.set_xticklabels(range_labels, rotation=0)
plt.legend()
plt.show()

In [None]:
# Scatterplot: x = index (bare rækkefølgen af vin), y = pH
# plt.figure(figsize=(10,6))
# plt.scatter(wine.index, wine["pH"], alpha=0.5, c=wine["type"].map({"red":"red","white":"orange"}))
# plt.xlabel("Vin index")
# plt.ylabel("pH")
# plt.title("Scatterplot af pH værdier for rød og hvid vin")
# plt.show()


In [None]:
# Scatterplot: x = fixed acidity, y = pH
# plt.figure(figsize=(10,6))
# plt.scatter(wine["fixed acidity"], wine["pH"], alpha=0.5, c=wine["type"].map({"red":"red","white":"orange"}))
# plt.xlabel("Fixed acidity")
# plt.ylabel("pH")
# plt.title("Scatterplot af pH værdier for rød og hvid vin, i forhold til fixed acidity")
# plt.show()

In [None]:
# Scatterplot: x = volatile acidity, y = pH
# plt.figure(figsize=(10,6))
# plt.scatter(wine["volatile acidity"], wine["pH"], alpha=0.5, c=wine["type"].map({"red":"red","white":"orange"}))
# plt.xlabel("Volatile acidity")
# plt.ylabel("pH")
# plt.title("Scatterplot af pH værdier for rød og hvid vin, i forhold til volatile acidity")
# plt.show()

In [None]:
# Scatterplot: x = volatile acidity, y = fixed acidity
# plt.figure(figsize=(10,6))
# plt.scatter(wine["volatile acidity"], wine["fixed acidity"], alpha=0.5, c=wine["type"].map({"red":"red","white":"orange"}))
# plt.xlabel("Volatile acidity")
# plt.ylabel("Fixed acidity")
# plt.title("Scatterplot af volatile acidity værdier for rød og hvid vin, i forhold til fixed acidity")
# plt.show()

In [None]:
# Scatterplot: x = alcohol, y = quality
# plt.figure(figsize=(10,10))
# plt.scatter(wine["alcohol"], wine["quality"], alpha=0.5, c=wine["type"].map({"red":"red","white":"orange"}))
# plt.xlabel("Alcohol")
# plt.ylabel("Quality")
# plt.title("Scatterplot af alcohol værdier for rød og hvid vin, i forhold til quality")
# plt.show()

In [None]:
#KOMMER BARE FRA CHAT
# plt.figure(figsize=(8,6))
# wine.boxplot(column="pH", by="type", grid=False, patch_artist=True,
#              boxprops=dict(facecolor="lightblue"),
#              medianprops=dict(color="red"))
# plt.title("Boxplot af pH værdier for rød vs. hvid vin")
# plt.suptitle("")  # fjerner standardtitel
# plt.xlabel("Vin type")
# plt.ylabel("pH")
# plt.show()

In [None]:
# plt.hist(wine["pH"], bins=5, edgecolor="black")
# plt.xlabel("pH")
# plt.ylabel("Frekvens")
# plt.title("Histogram af pH-værdier")
# plt.show()