In [2]:
import pandas as pd

In [71]:
# The file uses ";" as its field separator, so it has to be passed explicitly.
wine_csv = "data/winequality-red.csv"
df = pd.read_csv(wine_csv, sep=";")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Renaming Columns

We need to rename the columns, replacing the white spaces in their names with underscores.

In [7]:
# Explicit old-name -> new-name mapping for every column whose name contains spaces.
column_renames = {
    "fixed acidity": "fixed_acidity",
    "volatile acidity": "volatile_acidity",
    "citric acid": "citric_acid",
    "residual sugar": "residual_sugar",
    "free sulfur dioxide": "free_sulfur_dioxide",
    "total sulfur dioxide": "total_sulfur_dioxide",
}

# rename() returns a new frame; df itself keeps its original column names.
new_df = df.rename(columns=column_renames)

new_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# A better way to rename the columns

In [72]:
# Build the new labels in one pass: every white space becomes an underscore.
labels = [column_name.replace(' ', '_') for column_name in df.columns]

# Assigning to .columns renames df in place.
df.columns = labels
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Analyzing Features

In [73]:
def num_median(name="density"):
    """Split a column of the global ``df`` at its median and average quality per half.

    Values greater than or equal to the column's median are labelled
    ``"high"``; every other value is labelled ``"low"``.

    Side effect: the labels overwrite ``df[name]`` in place, so the original
    numeric values of that column are no longer available in ``df`` after
    the call.

    Parameters
    ----------
    name : str, default "density"
        Name of the column of ``df`` to split.

    Returns
    -------
    pandas.Series
        Mean of ``df.quality`` for each label, indexed by the label.
    """
    median = df[name].median()

    # Compare the whole column at once and replace it in a single assignment.
    # This does not rely on the index being 0..n-1, unlike a positional
    # counter used as a .loc label.
    is_high = df[name] >= median
    df[name] = is_high.map({True: "high", False: "low"})

    return df.groupby(name).quality.mean()

In [74]:
# Every column except the last one is summarised; the last column is the quality target.
feature_columns = list(df.columns)[:-1]

for column_name in feature_columns:
    quality_by_level = num_median(column_name)
    print(quality_by_level)
    print('\n')

fixed_acidity
high    5.726061
low     5.540052
Name: quality, dtype: float64


volatile_acidity
high    5.392157
low     5.890166
Name: quality, dtype: float64


citric_acid
high    5.822360
low     5.447103
Name: quality, dtype: float64


residual_sugar
high    5.665880
low     5.602394
Name: quality, dtype: float64


chlorides
high    5.507194
low     5.776471
Name: quality, dtype: float64


free_sulfur_dioxide
high    5.595268
low     5.677136
Name: quality, dtype: float64


total_sulfur_dioxide
high    5.522981
low     5.750630
Name: quality, dtype: float64


density
high    5.540574
low     5.731830
Name: quality, dtype: float64


pH
high    5.598039
low     5.675607
Name: quality, dtype: float64


sulphates
high    5.898917
low     5.351562
Name: quality, dtype: float64


alcohol
high    5.958904
low     5.310302
Name: quality, dtype: float64




In [75]:
# The feature columns now hold the "high"/"low" labels that num_median wrote into df.
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,low,high,low,low,low,low,low,high,high,low,low,5
1,low,high,low,high,high,high,high,high,low,high,low,5
2,low,high,low,high,high,high,high,high,low,high,low,5
3,high,low,high,low,low,high,high,high,low,low,low,6
4,low,high,low,low,low,low,low,high,high,low,low,5
