In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Read the two semicolon-delimited wine-quality CSVs, tag every row with the
# file it came from via a 'Wine Type' column, then stack both frames into a
# single table with a fresh 0..n-1 index.
red_wine = pd.read_csv(
    'noisy_datasets/wine_quality/winequality-red.csv', sep=';'
).assign(**{'Wine Type': 'Red'})
white_wine = pd.read_csv(
    'noisy_datasets/wine_quality/winequality-white.csv', sep=';'
).assign(**{'Wine Type': 'White'})

wine_quality = pd.concat((red_wine, white_wine), ignore_index=True)

# Summary statistics of the numeric columns of the combined table.
summary_stats = wine_quality.describe()
print(summary_stats)


       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    6497.000000       6497.000000  6497.000000     6497.000000   
mean        7.215307          0.339666     0.318633        5.443235   
std         1.296434          0.164636     0.145318        4.757804   
min         3.800000          0.080000     0.000000        0.600000   
25%         6.400000          0.230000     0.250000        1.800000   
50%         7.000000          0.290000     0.310000        3.000000   
75%         7.700000          0.400000     0.390000        8.100000   
max        15.900000          1.580000     1.660000       65.800000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  6497.000000          6497.000000           6497.000000  6497.000000   
mean      0.056034            30.525319            115.744574     0.994697   
std       0.035034            17.749400             56.521855     0.002999   
min       0.009000             1.000000         

In [3]:
# Count the null entries in every column of the combined table.
missing_counts = wine_quality.isnull().sum()

# Put the label and the Series on separate lines: printing them side by side
# pushes the first row of the Series out of alignment with the rest.
print("Missing values: ", missing_counts, sep="\n")

# Derive the conclusion from the counts instead of stating it unconditionally,
# so the message cannot contradict the numbers printed above.
total_missing = int(missing_counts.sum())
if total_missing == 0:
    print("As there are no missing values, we do not have to handle it.")
else:
    print(f"There are {total_missing} missing values that have to be handled.")

Missing values:  fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Wine Type               0
dtype: int64
As there are no missing values, we do not have to handle it.


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after the column table.
wine_quality.info()
print("As Wine Type is the only object column which was also entered by us, we know the case will be constant.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  Wine Type             6497 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB
None
As Wine Type is the only object column which was also ente

In [5]:
# Outlier handling with the 1.5 * IQR rule, applied to each numeric column in turn.
# Row labels of values outside [lower_bound, upper_bound] are gathered in
# `outlier_indices` (a row is listed once per offending column, so duplicates are
# collapsed before counting). Every value that is not inside the bounds is then
# overwritten in place with the median that column had before the replacement.
# NOTE(review): `quality` is a numeric (int64) column, so it is processed here as
# well - confirm that overwriting quality scores is intended.
# NOTE(review): this cell mutates `wine_quality`; re-running it without reloading
# the data starts from the already-modified values.
outlier_indices = []
numeric_columns = wine_quality.select_dtypes(include=['number']).columns
for col in numeric_columns:
    column = wine_quality[col]
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    spread = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * spread, q3 + 1.5 * spread

    # Rows strictly below or above the fences count as outliers for this column.
    beyond_fences = (column < lower_bound) | (column > upper_bound)
    outlier_indices.extend(column[beyond_fences].index)

    # Keep values inside the (inclusive) fences, substitute the median elsewhere.
    within_fences = column.between(lower_bound, upper_bound)
    wine_quality[col] = column.where(within_fences, column.median())

distinct_outlier_rows = set(outlier_indices)
print(f"Outliers count: {len(distinct_outlier_rows)}")
print("Since the dataset will be reduced by more than 1600 values, we replace the outliers with the median.")
print("")


Outliers count: 1657
Since the dataset will be reduced by more than 1600 values, we replace the outliers with the median.
