In [55]:
# Overall File
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import csv
from scipy.stats import multivariate_normal
# define data paths
file1 = './../data/41467_2024_46346_MOESM4_ESM.csv'
file2 = './../data/41467_2024_46346_MOESM7_ESM.csv'
# read chemical data and merge it with only the overall score
chemData = pd.read_csv(file1)
dataWithOverall = pd.read_csv(file2)
overallAndId = dataWithOverall[['beer','overall']]
# now merge and remove duplicates
merged_df = chemData.merge(overallAndId, on = ['beer'], how='inner')
# Convert string in tasting_category_fine using factorize
merged_df['tasting_category_fine'], category_mapping = pd.factorize(merged_df['tasting_category_fine'])
# Remove NAs
na_counts_per_column = merged_df.isna().sum()
na_counts_per_column = na_counts_per_column[na_counts_per_column > 0]
threshold = 30
# Filter to show only columns with NA counts greater than the threshold
columns_to_drop = na_counts_per_column[na_counts_per_column > threshold].index
# Drop the columns with NA counts greater than the threshold
merged_df_withoutNa = merged_df.drop(columns=columns_to_drop)
merged_df_withoutNa = merged_df_withoutNa.fillna(0)
# drop all 4 SUM columns since we are only interested in the chems themselves
df = merged_df_withoutNa[merged_df_withoutNa.columns.drop(list(merged_df_withoutNa.filter(regex='_sum')))]
df.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,10,5410228202929,0,-0.641741,-2.146292,-1.905267,0.046211,-3.19552,-0.59935,-1.975309,...,-1.817819,-1.817819,3.702195,4.360243,-1.817819,5.721204,4.438951,0.014728,0.054353,-0.467852
1,86,5410783031019,0,-0.174824,-2.146292,-0.257098,0.347241,-3.19552,-0.507254,-1.75597,...,3.576605,-1.817819,4.133651,4.512589,4.323292,5.810501,4.247826,0.023167,0.0014,-0.994806
2,2,5410228142003,0,0.511067,-2.146292,-1.905267,0.898604,-3.19552,-0.134514,-1.893639,...,-1.817819,-1.817819,3.96673,4.394267,-1.817819,5.585529,3.598293,0.017571,0.000699,-1.528544
3,83,54055520,0,0.249793,-2.146292,-1.905267,0.782126,-3.19552,-0.233874,-2.086981,...,-1.817819,-1.817819,3.812901,4.174304,4.088807,5.204583,4.498781,0.016959,0.001101,-0.934138
4,40,5410228141181,0,-1.902829,-0.756987,-1.34727,0.553356,-3.19552,0.207751,-1.713095,...,-1.817819,6.484789,1.618481,4.742899,4.651536,6.515499,4.78922,0.117921,0.000492,-1.735965
5,247,5411081000264,1,0.404802,-2.146292,-1.905267,2.108738,-2.850781,0.659248,-2.161466,...,3.55243,-1.817819,3.905719,-1.817819,4.323532,5.908192,5.322008,0.023274,0.049058,-0.800698
6,48,8711406995211,2,0.891771,-2.146292,0.028093,1.399666,-2.512155,1.577115,-0.704521,...,5.252547,6.207206,4.051992,3.271554,4.647266,5.923461,5.228784,0.017974,0.00997,-0.259324
7,165,5411081000363,1,-0.243322,-2.146292,-0.556487,0.81955,-2.367037,0.465608,-2.67469,...,4.26816,-1.817819,4.048183,-1.817819,4.693955,5.858582,5.262941,0.022817,0.071466,-0.25089
8,124,5412186002658,3,0.08773,-2.146292,-0.33636,1.155214,-3.19552,0.949666,-1.944622,...,3.016781,5.598731,2.101812,4.129525,-1.817819,5.889627,-1.817819,0.034472,0.00118,0.230788
9,207,5411223030036,1,0.567943,-2.146292,-0.521224,1.085299,-0.539583,0.78646,-2.195861,...,3.853942,-1.817819,4.059355,-1.817819,4.724163,6.081144,4.399498,0.021659,0.513474,-0.677687


In [56]:
def display_column_names(dataframe):
    """
    Display all column names of the DataFrame.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    """
    column_names = dataframe.columns
    print("Column names:")
    for col in column_names:
        print(col)



display_column_names(df)
df.to_csv('./../data/result/combinedData.csv', index=False)

Column names:
beer
beer_id
tasting_category_fine
acetaldehyde
CS2
DES
DMS
ethyl_2.methyl_butyrate
ethyl_acetate
ethyl_decanoate
ethyl_hexanoate
ethyl_isovalerate
ethyl_octanoate
ethyl_propionate
H2S
isoamyl_alcohol
isobutanol
isobutyl_acetate
isopentyl_acetate
MeSH
octyl_acetate
phenethyl_acetate
phenethyl_alcohol
X4VG
SO2.mg.L.
acetic_acid.g.L.
ammonia.mg.L.
color.EBC.
betaglucan.mg.L.
lactic_acid.mg.L.
glycerol
iron
pH
protein.g.L.
bitterness
GFS
ethanol..v.v.
CO2.PSI.
unfermentables
kcalperc
X1.propanol
X2.3.butanedione
X3.methylbutanal
propylene.glycol
propyl.acetate
n.pentanol
X2.methyl.1.butanol
isopentyl.formate
ethyl.isobutyrate
X3.methyl.2.butenal
X2.methyltetrahydrofuran.3.one
ethyl.butyrate
butyl.acetate
methylpyrazine
furfural
ethyl.lactate
X3Z.hexenol
n.hexanol
styrene
X2.furanmethanol
X3.methyl.2.hexanol
X3.furanmethanol
X2.acetylfuran
X2.6.dimethyl.pyrazine
amyl.acetate
ethyl.tiglate
benzaldehyde
X5.methylfurfural
n.heptanol
X1.octen.3.ol
X3.methylthio.propanol
isomaltol

In [52]:
def generate_samples_with_bad(dataframe, n_samples, bad_columns, cat_columns, percentile=10):
    sampled_data = {}
    
    # Determine numeric columns
    num_cols = dataframe.select_dtypes(include=['number']).columns
    
    # Sample values for each column
    for col in dataframe.columns:
        if col in cat_columns:
            unique_values = dataframe[col].dropna().unique()
            if unique_values.any():
                sampled_data[col] = np.random.choice(unique_values, size=n_samples)
            else:
                sampled_data[col] = [np.nan] * n_samples
        elif col in num_cols:
            col_data = dataframe[col].dropna()
            mu, sigma = stats.norm.fit(col_data)
            if col in bad_columns:
                threshold = np.percentile(col_data, percentile)
                left_tail = col_data[col_data <= threshold]
                sampled_data[col] = np.random.choice(left_tail, size=n_samples, replace=True)
            else:
                samples = stats.norm(loc=mu, scale=sigma).rvs(size=n_samples)
                # Filter out -inf and inf values
                samples = samples[np.isfinite(samples)]
                # Ensure enough samples are drawn
                while len(samples) < n_samples:
                    additional_samples = stats.norm(loc=mu, scale=sigma).rvs(size=(n_samples - len(samples)))
                    additional_samples = additional_samples[np.isfinite(additional_samples)]
                    samples = np.concatenate((samples, additional_samples))
                sampled_data[col] = samples[:n_samples]
        else:
            raise ValueError(f"Column {col} is neither in categorical nor in numerical columns.")

    return pd.DataFrame(sampled_data)

def generate_samples_with_bad(dataframe, n_samples, bad_columns, cat_columns, percentile=10):
    """
    Generate new rows by drawing samples from fitted distributions, 
    with "bad" samples from specified columns.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    n_samples (int): The number of new samples (rows) to generate.
    bad_columns (list): List of column names to take "bad" samples from.
    percentile (float): Percentile to define the left tail (e.g., 10 for the 10th percentile).

    Returns:
    pd.DataFrame: A new DataFrame with generated rows.
    """
    sampled_data = {}
    
    # Determine numeric and categorical columns
    num_cols = dataframe.select_dtypes(include=['number']).columns
    #cat_cols = dataframe.select_dtypes(include=['int']).columns
    
    # Sample integer values for categorical columns
    for col in dataframe.columns:
        if col in cat_cols:
            unique_values = dataframe[col].dropna().unique()
            if unique_values.any():
                sampled_data[col] = np.random.choice(unique_values, size=n_samples)
            else:
                sampled_data[col] = [np.nan] * n_samples
        elif col in num_cols:
            mu, sigma = stats.norm.fit(dataframe[col])
            if col in bad_columns:
                threshold = np.percentile(dataframe[col], percentile)
                left_tail = stats.norm(loc=mu, scale=sigma).ppf(np.linspace(0, percentile/100, 100))
                # Filter out -inf and inf values
                left_tail = left_tail[np.isfinite(left_tail)]
                sampled_data[col] = np.random.choice(left_tail, size=n_samples)
            else:
                sampled_data[col] = stats.norm(loc=mu, scale=sigma).rvs(size=n_samples)
    
    return pd.DataFrame(sampled_data)

In [53]:
min_values = df.min()
max_values = df.max()
avg_values = df.mean()

print(min_values['ethanol..v.v.'])
print(max_values['ethanol..v.v.'])
print(avg_values['ethanol..v.v.'])

-1.455931956
1.06595298
0.8039263130039999


In [54]:
# Remove Beer/Beer_Id/tasting_category and overall from newly generated sample
# Why tasting category? If we don't remove it we get in trouble because we would have a dependency between ethanol and alcohol free beers
# since the category itself is determined by chemical properties we are fine with omiting it from the result
#rand_df = df.drop(columns=['beer','beer_id','overall'])
rand_df = df.drop(columns=['beer','beer_id']) # keep overall and sample it randomly
# Use name of columns that perform best in our GP analysis, start with what paper people found
rand_sample = generate_samples_with_bad(rand_df, 500, bad_columns=['ethyl_acetate','ethanol..v.v.', 'ethyl_octanoate','ethyl.phenylacetate','protein.g.L.', 'lactic_acid.mg.L.'],cat_columns=['tasting_category_fine'],percentile=20)
rand_sample.head(10)

ValueError: Not enough values in the left tail for column ethyl_acetate.

In [36]:
min_values_rand = rand_sample.min()
max_values_rand = rand_sample.max()
avg_values_rand = rand_sample.mean()

print(min_values_rand['ethanol..v.v.'])
print(max_values_rand['ethanol..v.v.'])
print(avg_values_rand['ethanol..v.v.'])

-1.455931956
0.721398376
0.5117974392019999


In [40]:
rand_sample.to_csv('./../data/result/createdSampleData.csv', index=False)