In [13]:
import pandas as pd

# Load the dataset
wine_data = pd.read_csv('combined_and_preprocessed_data.csv')

# Labels for the three quantile bins that pd.qcut creates below (q=3)
bin_labels = ['low', 'medium', 'high']

# Calculate the sulfur dioxide ratio only if the loaded file does not already
# provide it, so an existing column is never silently overwritten.
if 'sulfur_dioxide_ratio' not in wine_data.columns:
    wine_data['sulfur_dioxide_ratio'] = (
        wine_data['free sulfur dioxide'] / wine_data['total sulfur dioxide']
    )

# Separate each ratio column into three equal-frequency groups (low / medium / high)
wine_data['acidity_ratio_group'] = pd.qcut(wine_data['acidity_ratio'], q=3, labels=bin_labels)
wine_data['sulfur_dioxide_ratio_group'] = pd.qcut(wine_data['sulfur_dioxide_ratio'], q=3, labels=bin_labels)
wine_data['alcohol_sulfur_dioxide_ratio_group'] = pd.qcut(wine_data['alcohol_sulfur_dioxide_ratio'], q=3, labels=bin_labels)

# Drop the specified columns; drop() returns a new frame, so wine_data keeps them
updated_wine_data = wine_data.drop(columns=['alcohol_density', 'non-free sulfur dioxide', 'total_acidity'])

# Function to calculate covariance for each group within each type of wine
def calculate_group_covariances(data, ratio_column, group_column):
    """Return the covariance between a ratio column and 'quality' per group.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'type', 'quality', ``ratio_column`` and
        ``group_column``.
    ratio_column : str
        Name of the numeric column whose covariance with 'quality' is computed.
    group_column : str
        Name of the column that, together with 'type', defines the groups.

    Returns
    -------
    pandas.Series
        One covariance value per ('type', ``group_column``) combination,
        indexed by a two-level MultiIndex in sorted group order.
    """
    # observed=False is stated explicitly: it keeps the result independent of
    # the pandas default for categorical group keys, which differs by version.
    grouped = data.groupby(['type', group_column], observed=False)
    # Select only the two columns the lambda reads so that apply() never
    # operates on the grouping columns (deprecated behavior in pandas 2.2+).
    return grouped[[ratio_column, 'quality']].apply(
        lambda g: g[ratio_column].cov(g['quality'])
    )

# Calculate covariances for each ratio and group level, separately for red and white wine
cov_acidity = calculate_group_covariances(updated_wine_data, 'acidity_ratio', 'acidity_ratio_group')
cov_sulfur_dioxide = calculate_group_covariances(updated_wine_data, 'sulfur_dioxide_ratio', 'sulfur_dioxide_ratio_group')
cov_alcohol_sulfur = calculate_group_covariances(updated_wine_data, 'alcohol_sulfur_dioxide_ratio', 'alcohol_sulfur_dioxide_ratio_group')

# Pair each result with the 'group' label it gets in the exported table
labelled_covariances = [
    ('fixed_acidity_ratio', cov_acidity),
    ('free_sulfur_dioxide_ratio', cov_sulfur_dioxide),
    ('alcohol_sulfur_dioxide_ratio', cov_alcohol_sulfur),
]

# Combine results into a DataFrame.
# The 'type' and 'category' labels are read from each result's own index
# (level 0 = wine type, level 1 = low/medium/high group) instead of being
# hard-coded, so every covariance stays attached to the group it was
# computed for even if a group is missing or the ordering differs.
cov_data = pd.concat(
    [
        pd.DataFrame({
            'type': cov.index.get_level_values(0).astype(str),
            'group': label,
            'category': cov.index.get_level_values(1).astype(str),
            'covariance': cov.to_numpy(),
        })
        for label, cov in labelled_covariances
    ],
    ignore_index=True,
)

# Save the prepared dataset for D3 plotting
output_path = "covariance_data.csv"
cov_data.to_csv(output_path, index=False)


In [14]:
# Display the combined covariance table (columns: type, group, category, covariance)
cov_data

Unnamed: 0,type,group,category,covariance
0,red,fixed_acidity_ratio,low,0.00242
1,red,fixed_acidity_ratio,medium,0.000365
2,red,fixed_acidity_ratio,high,0.000478
3,white,fixed_acidity_ratio,low,0.002227
4,white,fixed_acidity_ratio,medium,-4.6e-05
5,white,fixed_acidity_ratio,high,0.000342
6,red,free_sulfur_dioxide_ratio,low,-0.000184
7,red,free_sulfur_dioxide_ratio,medium,0.002397
8,red,free_sulfur_dioxide_ratio,high,0.010431
9,white,free_sulfur_dioxide_ratio,low,0.010111
