In [79]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [80]:
# Load the wine-quality table; the file uses ';' as its field separator.
raw_data = pd.read_csv(
    'winequality-red.csv',
    sep=';',
)

In [81]:
# Drop rows that are exact duplicates of an earlier row (the first occurrence
# is kept), then print the schema and show summary statistics.
raw_data = raw_data.drop_duplicates(keep='first')
raw_data.info()
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1359 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1359 non-null   float64
 1   volatile acidity      1359 non-null   float64
 2   citric acid           1359 non-null   float64
 3   residual sugar        1359 non-null   float64
 4   chlorides             1359 non-null   float64
 5   free sulfur dioxide   1359 non-null   float64
 6   total sulfur dioxide  1359 non-null   float64
 7   density               1359 non-null   float64
 8   pH                    1359 non-null   float64
 9   sulphates             1359 non-null   float64
 10  alcohol               1359 non-null   float64
 11  quality               1359 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 138.0 KB


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0,1359.0
mean,8.310596,0.529478,0.272333,2.5234,0.088124,15.893304,46.825975,0.996709,3.309787,0.658705,10.432315,5.623252
std,1.73699,0.183031,0.195537,1.352314,0.049377,10.44727,33.408946,0.001869,0.155036,0.170667,1.082065,0.823578
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.9967,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.43,2.6,0.091,21.0,63.0,0.99782,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [82]:
# Remove extreme outliers from every column except the 'quality' label:
# a row is kept only if its value lies strictly inside
# (Q1 - 3 * IQR, Q3 + 3 * IQR) for that column.
# NOTE(review): the frame is filtered one column at a time, so the quartiles
# of each column are computed on the rows that survived the previous columns'
# filters; the result therefore depends on column order.
for col in [name for name in raw_data.columns if name != 'quality']:
    q1 = raw_data[col].quantile(0.25)
    q3 = raw_data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 3 * iqr
    upper_bound = q3 + 3 * iqr
    inside_fences = (raw_data[col] > lower_bound) & (raw_data[col] < upper_bound)
    raw_data = raw_data[inside_fences]

raw_data.info()
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1217 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1217 non-null   float64
 1   volatile acidity      1217 non-null   float64
 2   citric acid           1217 non-null   float64
 3   residual sugar        1217 non-null   float64
 4   chlorides             1217 non-null   float64
 5   free sulfur dioxide   1217 non-null   float64
 6   total sulfur dioxide  1217 non-null   float64
 7   density               1217 non-null   float64
 8   pH                    1217 non-null   float64
 9   sulphates             1217 non-null   float64
 10  alcohol               1217 non-null   float64
 11  quality               1217 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 123.6 KB


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0
mean,8.28636,0.526652,0.262021,2.278554,0.079132,15.759655,45.419474,0.996597,3.31788,0.641561,10.446385,5.631882
std,1.701773,0.18031,0.190105,0.593928,0.01657,9.894299,30.943551,0.001795,0.148797,0.132852,1.067512,0.811656
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.86,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.069,8.0,22.0,0.99551,3.22,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.078,14.0,37.0,0.9966,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.5,0.088,21.0,60.0,0.99774,3.41,0.71,11.1,6.0
max,15.0,1.33,0.76,4.7,0.147,57.0,165.0,1.0021,3.9,1.22,14.0,8.0


In [83]:
# Standardise every feature column with StandardScaler; the 'quality' label
# is excluded from scaling and re-attached afterwards.
# (StandardScaler is imported in the notebook's top import cell.)
# NOTE(review): the scaler is fitted on the whole data set. If a train/test
# split is made later from the saved file, fitting on the training rows only
# would avoid leaking test-set statistics — confirm against downstream usage.
feature_frame = raw_data.drop(columns='quality')

scaler = StandardScaler()
input_features = scaler.fit_transform(feature_frame)

# Concatenate quality as the last column of the scaled feature matrix.
# The label is reshaped to a single column so it can be joined along axis 1.
quality_column = raw_data['quality'].values.reshape(-1, 1)
processed_data = np.concatenate((input_features, quality_column), axis=1)

In [84]:
# Wrap the scaled array in a DataFrame that reuses the original column names,
# then print its schema and show summary statistics.
processed_data = pd.DataFrame(
    data=processed_data,
    columns=raw_data.columns,
)
processed_data.info()
processed_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217 entries, 0 to 1216
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1217 non-null   float64
 1   volatile acidity      1217 non-null   float64
 2   citric acid           1217 non-null   float64
 3   residual sugar        1217 non-null   float64
 4   chlorides             1217 non-null   float64
 5   free sulfur dioxide   1217 non-null   float64
 6   total sulfur dioxide  1217 non-null   float64
 7   density               1217 non-null   float64
 8   pH                    1217 non-null   float64
 9   sulphates             1217 non-null   float64
 10  alcohol               1217 non-null   float64
 11  quality               1217 non-null   float64
dtypes: float64(12)
memory usage: 114.2 KB


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0
mean,-2.802469e-16,-4.02855e-16,0.0,-1.634774e-16,-4.437243e-16,-5.2546300000000005e-17,-1.1676960000000001e-17,-4.83426e-14,5.604939e-16,-1.795332e-16,-4.203704e-16,5.631882
std,1.000411,1.000411,1.000411,1.000411,1.000411,1.000411,1.000411,1.000411,1.000411,1.000411,1.000411,0.811656
min,-2.167078,-2.256216,-1.378863,-2.322031,-4.053072,-1.492346,-1.274439,-3.636546,-3.078476,-2.346136,-1.917754,3.0
25%,-0.6974183,-0.7581809,-0.905247,-0.6376348,-0.6117312,-0.7845775,-0.7571562,-0.6055012,-0.6580793,-0.6894796,-0.8868973,5.0
50%,-0.2271271,-0.03690493,-0.063261,-0.1323158,-0.06836149,-0.1779184,-0.2722033,0.001822158,-0.05298002,-0.1623615,-0.2308975,6.0
75%,0.5370962,0.6288883,0.831348,0.3730032,0.5353826,0.5298505,0.4713913,0.6370044,0.6193525,0.5153617,0.6125309,6.0
max,3.946708,4.457199,2.620567,4.078676,4.097473,4.169805,3.866062,3.066298,3.913782,4.355793,3.330244,8.0


In [85]:
# Number of rows at each quality score, most frequent first.
quality_counts = raw_data['quality'].value_counts()
print(quality_counts)

5    512
6    488
7    148
4     48
8     15
3      6
Name: quality, dtype: int64


In [86]:
# Save the processed table to CSV without the row index.
# The quality label was upcast to float64 when it was concatenated with the
# scaled float features; cast it back to int64 on the way out so the file
# stores integer labels (e.g. 5 rather than 5.0).
processed_data.astype({'quality': 'int64'}).to_csv('processed_data.csv', index=False)