In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import csv
from scipy.stats import multivariate_normal
# define data paths
file1 = './../data/41467_2024_46346_MOESM4_ESM.csv'
file2 = './../data/41467_2024_46346_MOESM7_ESM.csv'

# read chemical data and merge it with only the overall score
chemData = pd.read_csv(file1)
dataWithOverall = pd.read_csv(file2)
overallAndId = dataWithOverall[['beer','overall']]

# now merge and remove duplicates
merged_df = chemData.merge(overallAndId, on = ['beer'], how='inner')
# Convert string in tasting_category_fine using factorize
merged_df['tasting_category_fine'], category_mapping = pd.factorize(merged_df['tasting_category_fine'])

# Remove NAs
na_counts_per_column = merged_df.isna().sum()
na_counts_per_column = na_counts_per_column[na_counts_per_column > 0]
threshold = 30
# Filter to show only columns with NA counts greater than the threshold
columns_to_drop = na_counts_per_column[na_counts_per_column > threshold].index
# Drop the columns with NA counts greater than the threshold
merged_df_withoutNa = merged_df.drop(columns=columns_to_drop)
merged_df_withoutNa = merged_df_withoutNa.fillna(0)

merged_df_withoutNa.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,esters_sum,aroma_hops_sum,acids_sum,sulfur_sum,overall
0,10,5410228202929,0,-0.641741,-2.146292,-1.905267,0.046211,-3.19552,-0.59935,-1.975309,...,-1.817819,5.721204,4.438951,0.014728,0.054353,-0.532634,3.696586,0.244085,0.166286,-0.467852
1,86,5410783031019,0,-0.174824,-2.146292,-0.257098,0.347241,-3.19552,-0.507254,-1.75597,...,4.323292,5.810501,4.247826,0.023167,0.0014,-0.465401,1.219563,-0.152156,0.098931,-0.994806
2,2,5410228142003,0,0.511067,-2.146292,-1.905267,0.898604,-3.19552,-0.134514,-1.893639,...,-1.817819,5.585529,3.598293,0.017571,0.000699,-0.409346,1.148974,-0.161504,-0.022943,-1.528544
3,83,54055520,0,0.249793,-2.146292,-1.905267,0.782126,-3.19552,-0.233874,-2.086981,...,4.088807,5.204583,4.498781,0.016959,0.001101,-0.42008,1.416847,-0.056092,0.068372,-0.934138
4,40,5410228141181,0,-1.902829,-0.756987,-1.34727,0.553356,-3.19552,0.207751,-1.713095,...,4.651536,6.515499,4.78922,0.117921,0.000492,-0.144457,0.672328,-1.204001,0.639462,-1.735965
5,247,5411081000264,1,0.404802,-2.146292,-1.905267,2.108738,-2.850781,0.659248,-2.161466,...,4.323532,5.908192,5.322008,0.023274,0.049058,-0.194932,1.88696,0.528207,0.837224,-0.800698
6,48,8711406995211,2,0.891771,-2.146292,0.028093,1.399666,-2.512155,1.577115,-0.704521,...,4.647266,5.923461,5.228784,0.017974,0.00997,1.278832,1.558301,-0.06385,0.764213,-0.259324
7,165,5411081000363,1,-0.243322,-2.146292,-0.556487,0.81955,-2.367037,0.465608,-2.67469,...,4.693955,5.858582,5.262941,0.022817,0.071466,-0.140149,1.774321,0.190947,0.318556,-0.25089
8,124,5412186002658,3,0.08773,-2.146292,-0.33636,1.155214,-3.19552,0.949666,-1.944622,...,-1.817819,5.889627,-1.817819,0.034472,0.00118,0.297808,1.469216,-0.052,0.52413,0.230788
9,207,5411223030036,1,0.567943,-2.146292,-0.521224,1.085299,-0.539583,0.78646,-2.195861,...,4.724163,6.081144,4.399498,0.021659,0.513474,0.330095,1.425281,0.584239,0.415574,-0.677687


In [29]:
# drop all 4 SUM columns since we are only interested in the chems themselves
df = merged_df_withoutNa[merged_df_withoutNa.columns.drop(list(merged_df_withoutNa.filter(regex='_sum')))]
df.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,10,5410228202929,0,-0.641741,-2.146292,-1.905267,0.046211,-3.19552,-0.59935,-1.975309,...,-1.817819,-1.817819,3.702195,4.360243,-1.817819,5.721204,4.438951,0.014728,0.054353,-0.467852
1,86,5410783031019,0,-0.174824,-2.146292,-0.257098,0.347241,-3.19552,-0.507254,-1.75597,...,3.576605,-1.817819,4.133651,4.512589,4.323292,5.810501,4.247826,0.023167,0.0014,-0.994806
2,2,5410228142003,0,0.511067,-2.146292,-1.905267,0.898604,-3.19552,-0.134514,-1.893639,...,-1.817819,-1.817819,3.96673,4.394267,-1.817819,5.585529,3.598293,0.017571,0.000699,-1.528544
3,83,54055520,0,0.249793,-2.146292,-1.905267,0.782126,-3.19552,-0.233874,-2.086981,...,-1.817819,-1.817819,3.812901,4.174304,4.088807,5.204583,4.498781,0.016959,0.001101,-0.934138
4,40,5410228141181,0,-1.902829,-0.756987,-1.34727,0.553356,-3.19552,0.207751,-1.713095,...,-1.817819,6.484789,1.618481,4.742899,4.651536,6.515499,4.78922,0.117921,0.000492,-1.735965
5,247,5411081000264,1,0.404802,-2.146292,-1.905267,2.108738,-2.850781,0.659248,-2.161466,...,3.55243,-1.817819,3.905719,-1.817819,4.323532,5.908192,5.322008,0.023274,0.049058,-0.800698
6,48,8711406995211,2,0.891771,-2.146292,0.028093,1.399666,-2.512155,1.577115,-0.704521,...,5.252547,6.207206,4.051992,3.271554,4.647266,5.923461,5.228784,0.017974,0.00997,-0.259324
7,165,5411081000363,1,-0.243322,-2.146292,-0.556487,0.81955,-2.367037,0.465608,-2.67469,...,4.26816,-1.817819,4.048183,-1.817819,4.693955,5.858582,5.262941,0.022817,0.071466,-0.25089
8,124,5412186002658,3,0.08773,-2.146292,-0.33636,1.155214,-3.19552,0.949666,-1.944622,...,3.016781,5.598731,2.101812,4.129525,-1.817819,5.889627,-1.817819,0.034472,0.00118,0.230788
9,207,5411223030036,1,0.567943,-2.146292,-0.521224,1.085299,-0.539583,0.78646,-2.195861,...,3.853942,-1.817819,4.059355,-1.817819,4.724163,6.081144,4.399498,0.021659,0.513474,-0.677687


In [68]:
# We start by getting the distribution of our respective columns
# define a helper function
def plot_distributions(dataframe, folderName):
    num_cols = dataframe.select_dtypes(include=['number']).columns
    for col in num_cols:
        plt.figure(figsize=(10, 6))
        sns.histplot(dataframe[col], kde=True)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        #plt.show()
        plt.savefig(f'./../plot/{folderName}/{col}.png')

In [None]:
# Only do this once
plot_distributions(df,'data')

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mo

In [26]:
# Now create a helper that takes a random sample from each column to create new data
# we can provide the columns from where to sample bad by providing their names
def generate_existing_samples_with_bad(dataframe, n_samples, bad_columns, percentile=10):
    """
    Generate new rows by taking random samples from each column, with "bad" samples from specified columns.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    n_samples (int): The number of new samples (rows) to generate.
    bad_columns (list): List of column names to take "bad" samples from.
    percentile (float): Percentile to define the left tail (e.g., 10 for the 10th percentile).

    Returns:
    pd.DataFrame: A new DataFrame with generated rows.
    """
    sampled_data = {}
    num_cols = dataframe.select_dtypes(include=['number']).columns

    for col in num_cols:
        if col in bad_columns:
            # Identify the left tail of the distribution
            threshold = np.percentile(dataframe[col], percentile)
            bad_samples = dataframe[dataframe[col] <= threshold][col]
            sampled_data[col] = np.random.choice(bad_samples, size=n_samples)
        else:
            sampled_data[col] = np.random.choice(dataframe[col], size=n_samples)
    
    return pd.DataFrame(sampled_data)

In [27]:
new_sample = generate_existing_samples_with_bad(df, 10, bad_columns=['ethyl_acetate','protein'])
new_sample.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,149,5425005660465,7,0.699326,-2.146292,-1.905267,1.595524,-1.857767,1.025313,-0.992935,...,3.460578,6.654277,3.903787,4.65162,-1.817819,5.630405,5.302779,0.005463,0.003539,-0.270704
1,134,8711406031766,5,0.791307,-2.146292,-0.643429,1.535283,-3.19552,-0.134514,-2.132091,...,4.836773,7.05768,3.164233,4.490356,4.926436,5.934491,5.004485,0.011906,0.004578,-0.097426
2,13,9120041150678,15,0.630754,-0.771189,-0.299119,1.455311,-1.816019,0.784719,-1.250225,...,5.446021,7.181703,4.194435,3.736902,4.088807,4.674443,4.958638,0.016959,0.027052,-0.140587
3,43,5425100322608,9,0.429493,-2.146292,0.049228,1.243357,-3.19552,0.784719,-1.448245,...,-1.817819,5.701363,3.68555,4.473057,5.198096,5.434658,4.419788,0.042359,0.017383,0.07698
4,34,5412186002658,16,0.562626,-0.523357,-0.559697,1.575836,-3.19552,0.949666,-0.628101,...,-1.817819,-1.817819,2.273939,4.269354,4.834998,5.419175,4.887502,0.04324,0.006624,-0.569657
5,203,5411676710004,2,0.788825,-2.146292,0.086749,1.652001,-1.323444,0.936354,-0.796954,...,4.946925,6.239734,3.838441,4.589631,5.232502,6.052729,4.293774,0.012185,0.001312,-0.887408
6,160,5400000000012,2,0.455635,-2.146292,0.05143,1.170133,-1.583276,1.040623,-0.385752,...,4.946925,6.17391,3.85466,4.165018,-1.817819,4.797557,6.34084,0.034936,0.01027,0.468766
7,75,5425006700405,1,1.073684,-2.146292,0.13585,0.046211,-2.07443,1.051042,-2.67469,...,-1.817819,6.55723,3.883145,2.475317,-1.817819,5.424951,4.958638,0.014728,0.005911,0.433419
8,48,54107014,13,0.219773,-0.829533,-0.436614,1.26526,-2.464073,-0.507254,-1.036236,...,3.742369,6.196995,-1.817819,4.664389,4.248633,6.478908,6.670236,0.02902,0.030985,-1.414394
9,169,54017016,12,0.97376,-2.146292,-0.109369,1.488814,-2.555175,0.837321,-0.796695,...,4.255207,6.98029,1.712367,4.206558,5.042763,5.172559,5.322008,0.006731,0.005651,0.244959


In [60]:
df.dtypes
df.select_dtypes(include=['int']).columns
df['tasting_category_fine'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21], dtype=int64)

In [63]:
def generate_samples_with_bad(dataframe, n_samples, bad_columns, percentile=10):
    """
    Generate new rows by drawing samples from fitted distributions, 
    with "bad" samples from specified columns.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    n_samples (int): The number of new samples (rows) to generate.
    bad_columns (list): List of column names to take "bad" samples from.
    percentile (float): Percentile to define the left tail (e.g., 10 for the 10th percentile).

    Returns:
    pd.DataFrame: A new DataFrame with generated rows.
    """
    sampled_data = {}
    
    # Determine numeric and categorical columns
    num_cols = dataframe.select_dtypes(include=['number']).columns
    cat_cols = dataframe.select_dtypes(include=['int']).columns
    
    # Sample integer values for categorical columns
    for col in dataframe.columns:
        if col in cat_cols:
            unique_values = dataframe[col].unique()
            if unique_values.any():
                sampled_data[col] = np.random.choice(unique_values, size=n_samples)
            else:
                # Handle the case where unique_int_values is empty
                sampled_data[col] = np.nan
        elif col in num_cols:
            mu, sigma = stats.norm.fit(dataframe[col])
            if col in bad_columns:
                threshold = np.percentile(dataframe[col], percentile)
                left_tail = stats.norm(loc=mu, scale=sigma).ppf(np.linspace(0, percentile/100, 100))
                sampled_data[col] = np.random.choice(left_tail, size=n_samples)
            else:
                sampled_data[col] = stats.norm(loc=mu, scale=sigma).rvs(size=n_samples)
    
    return pd.DataFrame(sampled_data)


Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,82,1882,0,0.140615,-2.850428,0.207975,1.102154,-3.016188,0.937343,-1.378159,...,-0.993692,2.975372,4.529449,2.244414,0.007612,4.593408,3.966936,-0.018131,0.170761,0.376779
1,148,5411516001491,6,0.36751,-2.138538,-0.681901,1.645548,-2.930895,0.898179,-1.825633,...,2.694634,3.976268,2.732505,2.876508,1.27665,7.104464,2.471506,0.118628,0.109404,-0.663736
2,28,5413963300011,10,0.613688,-1.474289,-0.891804,1.134006,-1.616786,0.797089,-0.579036,...,3.375423,7.427213,3.41812,1.302403,8.395121,6.189805,5.318077,0.015532,-0.18695,0.13912
3,209,541286000975,18,1.088095,-2.11043,0.402998,1.899318,-2.600037,0.770541,-0.976043,...,0.94863,3.684098,3.540372,2.567007,1.123849,2.893486,7.740507,0.06098,0.11941,0.61489
4,129,5425006700405,14,0.33213,-1.043334,-1.453264,1.66176,-1.434989,0.66029,-0.82398,...,5.211371,1.982895,3.34564,3.842182,3.183516,4.361441,5.956912,-0.010832,0.231287,0.274252
5,71,5425006700016,3,0.204112,-1.361321,-0.412318,1.334687,-2.802849,0.874348,-1.801352,...,4.676524,4.555218,3.509936,8.102221,6.00678,7.653196,7.441944,0.118015,0.211375,-0.454072
6,142,5410228201076,19,0.9049,-2.524337,-0.782617,1.151234,-2.065643,0.933057,-0.541266,...,2.620654,-1.416853,4.140515,4.860887,6.519191,4.554168,5.627049,0.056177,0.205818,0.038143
7,56,5425006700139,4,0.746455,-1.834217,-1.23831,1.005074,-2.785955,0.669348,-0.586817,...,2.695997,5.155559,4.739661,6.049574,6.427579,3.256324,5.099231,0.237034,0.233647,0.454005
8,215,54138278,6,0.023731,-2.299143,0.81947,2.126572,-2.656864,0.928695,-0.616123,...,4.784765,7.001746,1.912476,-0.85627,4.599731,5.106994,7.245551,0.191453,0.023634,0.19272
9,121,54085190,9,0.868009,-1.561771,-0.726024,1.106422,-2.039756,0.755636,-0.317625,...,7.216809,13.669779,3.664511,1.593491,-4.344996,5.867528,2.911274,0.135299,-0.065229,-0.791487


In [67]:
# Remove Beer/Beer_Id/tasting_category and overall from newly generated sample
# Why tasting category? If we don't remove it we get in trouble because we would have a dependency between ethanol and alcohol free beers
# since the category itself is determined by chemical properties we are fine with omiting it from the result
rand_df = df.drop(columns=['beer','beer_id','overall'])
# Use name of columns that perform best in our GP analysis, start with what paper people found
rand_sample = generate_samples_with_bad(rand_df, 500, bad_columns=['ethyl_acetate','protein'])
rand_sample.head(10)
rand_sample

Unnamed: 0,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,ethyl_hexanoate,ethyl_isovalerate,...,methyl.hexadecanoate,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine
0,10,0.997884,-3.050227,-0.956125,0.841399,-2.315910,0.677900,-1.140769,-0.977997,-2.377092,...,5.292753,3.411974,4.850000,4.001642,6.506568,0.698029,2.624450,3.824074,0.110966,-0.182406
1,17,0.608324,-2.622666,-1.441090,1.767402,-1.655108,0.921998,-1.043631,-1.280445,-3.262245,...,4.531996,-3.580483,7.527513,1.765289,5.378522,2.668825,5.593642,7.740181,0.169390,-0.070047
2,7,1.250799,-2.191017,0.093126,1.114596,-2.771216,0.841069,-1.446369,-1.000522,-3.431650,...,2.973528,4.163596,1.755098,3.009847,1.473447,-2.149374,1.231392,2.862376,-0.117738,0.165887
3,7,0.211701,-1.295049,-0.351002,1.363744,-1.838383,0.879873,-0.906886,-0.287809,-2.547801,...,5.072089,2.577873,6.182986,3.437979,8.779452,5.175695,3.050114,4.066556,-0.120048,-0.106880
4,14,0.256456,-2.918347,-1.447129,1.957393,-2.328762,0.669348,-0.921916,0.218653,-4.201753,...,3.097670,0.925178,1.320971,4.560725,1.325694,1.607462,3.676005,5.034172,-0.017602,0.097387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,-0.270115,-1.784215,-0.308089,1.811507,-2.631980,0.534240,-0.963410,-0.679895,-2.218674,...,4.130786,0.701110,4.555189,4.141788,-1.207059,4.992810,5.299342,4.069537,0.188392,0.198768
496,21,0.736592,-1.365023,0.472531,1.974898,-1.672980,0.930886,-1.318803,-0.396668,-2.226352,...,6.172145,-2.146932,12.544272,5.097150,4.582513,-4.394165,6.404780,4.811848,-0.070851,0.184479
497,8,0.479465,-1.511471,-0.884044,1.738689,-1.949374,0.945695,-0.453608,-0.979856,-3.633851,...,1.816352,5.414864,5.786262,4.349136,-0.828356,1.074647,6.712558,2.571822,-0.082396,-0.057463
498,2,0.117572,-2.545978,-1.042758,1.375071,-2.728409,0.779827,-1.076234,-0.761154,-3.693271,...,1.126070,-1.713217,4.350509,1.438745,-0.632300,4.037469,6.443216,4.201266,0.023478,0.085820


In [None]:
rand_sample.to_csv('./../data/result/createdSampleData.csv', index=False)

In [None]:
plot_distributions(rand_sample,'sample')