In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import csv
from scipy.stats import multivariate_normal
# define data paths
file1 = './../data/41467_2024_46346_MOESM4_ESM.csv'
file2 = './../data/41467_2024_46346_MOESM7_ESM.csv'

# read chemical data and merge it with only the overall score
chemData = pd.read_csv(file1)
dataWithOverall = pd.read_csv(file2)
overallAndId = dataWithOverall[['beer','overall']]

# now merge and remove duplicates
merged_df = chemData.merge(overallAndId, on = ['beer'], how='inner')
# Convert string in tasting_category_fine using factorize
merged_df['tasting_category_fine'], category_mapping = pd.factorize(merged_df['tasting_category_fine'])

# Remove NAs
na_counts_per_column = merged_df.isna().sum()
na_counts_per_column = na_counts_per_column[na_counts_per_column > 0]
threshold = 30
# Filter to show only columns with NA counts greater than the threshold
columns_to_drop = na_counts_per_column[na_counts_per_column > threshold].index
# Drop the columns with NA counts greater than the threshold
merged_df_withoutNa = merged_df.drop(columns=columns_to_drop)
merged_df_withoutNa = merged_df_withoutNa.fillna(0)

merged_df_withoutNa.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,esters_sum,aroma_hops_sum,acids_sum,sulfur_sum,overall
0,10,5410228202929,0,-0.641741,-2.146292,-1.905267,0.046211,-3.19552,-0.59935,-1.975309,...,-1.817819,5.721204,4.438951,0.014728,0.054353,-0.532634,3.696586,0.244085,0.166286,-0.467852
1,86,5410783031019,0,-0.174824,-2.146292,-0.257098,0.347241,-3.19552,-0.507254,-1.75597,...,4.323292,5.810501,4.247826,0.023167,0.0014,-0.465401,1.219563,-0.152156,0.098931,-0.994806
2,2,5410228142003,0,0.511067,-2.146292,-1.905267,0.898604,-3.19552,-0.134514,-1.893639,...,-1.817819,5.585529,3.598293,0.017571,0.000699,-0.409346,1.148974,-0.161504,-0.022943,-1.528544
3,83,54055520,0,0.249793,-2.146292,-1.905267,0.782126,-3.19552,-0.233874,-2.086981,...,4.088807,5.204583,4.498781,0.016959,0.001101,-0.42008,1.416847,-0.056092,0.068372,-0.934138
4,40,5410228141181,0,-1.902829,-0.756987,-1.34727,0.553356,-3.19552,0.207751,-1.713095,...,4.651536,6.515499,4.78922,0.117921,0.000492,-0.144457,0.672328,-1.204001,0.639462,-1.735965
5,247,5411081000264,1,0.404802,-2.146292,-1.905267,2.108738,-2.850781,0.659248,-2.161466,...,4.323532,5.908192,5.322008,0.023274,0.049058,-0.194932,1.88696,0.528207,0.837224,-0.800698
6,48,8711406995211,2,0.891771,-2.146292,0.028093,1.399666,-2.512155,1.577115,-0.704521,...,4.647266,5.923461,5.228784,0.017974,0.00997,1.278832,1.558301,-0.06385,0.764213,-0.259324
7,165,5411081000363,1,-0.243322,-2.146292,-0.556487,0.81955,-2.367037,0.465608,-2.67469,...,4.693955,5.858582,5.262941,0.022817,0.071466,-0.140149,1.774321,0.190947,0.318556,-0.25089
8,124,5412186002658,3,0.08773,-2.146292,-0.33636,1.155214,-3.19552,0.949666,-1.944622,...,-1.817819,5.889627,-1.817819,0.034472,0.00118,0.297808,1.469216,-0.052,0.52413,0.230788
9,207,5411223030036,1,0.567943,-2.146292,-0.521224,1.085299,-0.539583,0.78646,-2.195861,...,4.724163,6.081144,4.399498,0.021659,0.513474,0.330095,1.425281,0.584239,0.415574,-0.677687


In [2]:
# drop all 4 SUM columns since we are only interested in the chems themselves
df = merged_df_withoutNa[merged_df_withoutNa.columns.drop(list(merged_df_withoutNa.filter(regex='_sum')))]
df.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,10,5410228202929,0,-0.641741,-2.146292,-1.905267,0.046211,-3.19552,-0.59935,-1.975309,...,-1.817819,-1.817819,3.702195,4.360243,-1.817819,5.721204,4.438951,0.014728,0.054353,-0.467852
1,86,5410783031019,0,-0.174824,-2.146292,-0.257098,0.347241,-3.19552,-0.507254,-1.75597,...,3.576605,-1.817819,4.133651,4.512589,4.323292,5.810501,4.247826,0.023167,0.0014,-0.994806
2,2,5410228142003,0,0.511067,-2.146292,-1.905267,0.898604,-3.19552,-0.134514,-1.893639,...,-1.817819,-1.817819,3.96673,4.394267,-1.817819,5.585529,3.598293,0.017571,0.000699,-1.528544
3,83,54055520,0,0.249793,-2.146292,-1.905267,0.782126,-3.19552,-0.233874,-2.086981,...,-1.817819,-1.817819,3.812901,4.174304,4.088807,5.204583,4.498781,0.016959,0.001101,-0.934138
4,40,5410228141181,0,-1.902829,-0.756987,-1.34727,0.553356,-3.19552,0.207751,-1.713095,...,-1.817819,6.484789,1.618481,4.742899,4.651536,6.515499,4.78922,0.117921,0.000492,-1.735965
5,247,5411081000264,1,0.404802,-2.146292,-1.905267,2.108738,-2.850781,0.659248,-2.161466,...,3.55243,-1.817819,3.905719,-1.817819,4.323532,5.908192,5.322008,0.023274,0.049058,-0.800698
6,48,8711406995211,2,0.891771,-2.146292,0.028093,1.399666,-2.512155,1.577115,-0.704521,...,5.252547,6.207206,4.051992,3.271554,4.647266,5.923461,5.228784,0.017974,0.00997,-0.259324
7,165,5411081000363,1,-0.243322,-2.146292,-0.556487,0.81955,-2.367037,0.465608,-2.67469,...,4.26816,-1.817819,4.048183,-1.817819,4.693955,5.858582,5.262941,0.022817,0.071466,-0.25089
8,124,5412186002658,3,0.08773,-2.146292,-0.33636,1.155214,-3.19552,0.949666,-1.944622,...,3.016781,5.598731,2.101812,4.129525,-1.817819,5.889627,-1.817819,0.034472,0.00118,0.230788
9,207,5411223030036,1,0.567943,-2.146292,-0.521224,1.085299,-0.539583,0.78646,-2.195861,...,3.853942,-1.817819,4.059355,-1.817819,4.724163,6.081144,4.399498,0.021659,0.513474,-0.677687


In [7]:
# We start by getting the distribution of our respective columns
# define a helper function
def plot_distributions(dataframe, folderName):
    num_cols = dataframe.select_dtypes(include=['number']).columns
    for col in num_cols:
        plt.figure(figsize=(10, 6))
        sns.histplot(dataframe[col], kde=True)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        #plt.show()
        plt.savefig(f'./../plot/{folderName}/{col}.png')

In [None]:
# Only do this once
plot_distributions(df,'data')

  plt.figure(figsize=(10, 6))


In [3]:
# Now create a helper that takes a random sample from each column to create new data
# we can provide the columns from where to sample bad by providing their names
def generate_existing_samples_with_bad(dataframe, n_samples, bad_columns, percentile=10):
    """
    Generate new rows by taking random samples from each column, with "bad" samples from specified columns.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    n_samples (int): The number of new samples (rows) to generate.
    bad_columns (list): List of column names to take "bad" samples from.
    percentile (float): Percentile to define the left tail (e.g., 10 for the 10th percentile).

    Returns:
    pd.DataFrame: A new DataFrame with generated rows.
    """
    sampled_data = {}
    num_cols = dataframe.select_dtypes(include=['number']).columns

    for col in num_cols:
        if col in bad_columns:
            # Identify the left tail of the distribution
            threshold = np.percentile(dataframe[col], percentile)
            bad_samples = dataframe[dataframe[col] <= threshold][col]
            sampled_data[col] = np.random.choice(bad_samples, size=n_samples)
        else:
            sampled_data[col] = np.random.choice(dataframe[col], size=n_samples)
    
    return pd.DataFrame(sampled_data)

In [4]:
new_sample = generate_existing_samples_with_bad(df, 10, bad_columns=['ethyl_acetate','protein'])
new_sample.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,93,5412858000104,12,0.875,-2.146292,0.05143,1.358979,-3.19552,0.207751,-1.782648,...,5.112418,6.323522,4.247686,3.361587,4.85916,5.887567,5.577463,0.033965,0.002328,-0.201182
1,150,9120041150685,6,0.384908,-2.146292,-0.194931,1.220304,-1.575935,0.78646,-0.618344,...,4.718596,6.330183,4.009213,3.856458,4.215297,-1.817819,4.662119,0.016345,0.513474,-0.955883
2,19,5430000043014,7,1.077715,-2.146292,-0.230937,1.230182,-2.205861,0.950474,-0.496529,...,4.984686,6.576473,4.081458,-1.817819,5.173705,5.69824,4.815422,0.037154,0.006247,-0.314852
3,248,5410228141181,18,0.406826,-2.146292,0.666766,1.193135,-3.19552,0.892526,-1.706196,...,5.100846,6.613861,3.773398,-1.817819,4.294242,6.414915,5.339044,0.019235,0.001867,0.103597
4,203,5411858000145,6,0.153864,-2.146292,-1.905267,1.8728,-3.19552,1.025315,-0.632341,...,3.645972,6.549995,2.609173,4.405774,4.773675,4.495545,5.262941,0.064979,0.012114,0.293484
5,37,5410693100324,12,0.531078,-2.146292,-0.567327,1.363346,-3.19552,1.040623,-0.579887,...,5.93643,-1.817819,3.990581,4.114019,4.341853,5.639284,5.396366,0.036933,0.003099,-0.747557
6,54,5425006241304,12,0.662379,-0.57146,0.406303,1.687159,-2.301465,0.919785,-1.305439,...,6.238041,6.458024,3.86452,3.313113,-1.817819,5.546596,4.715593,0.044602,0.042405,-0.096972
7,171,5411098700010,12,0.97376,-0.535019,-0.02452,1.801009,-3.19552,0.784719,-1.046216,...,-1.817819,6.709501,3.90543,4.075256,4.757926,5.717925,4.346609,0.014125,0.023993,0.927273
8,244,5410908000128,1,0.310058,-0.547355,0.252213,1.355227,-1.98569,-0.233874,-1.004014,...,4.228595,-1.817819,4.620393,4.512589,-1.817819,4.205021,4.032387,0.0081,0.786006,0.18135
9,245,54004238,18,0.225261,-2.146292,0.366725,1.154682,-2.206908,1.040623,-1.782648,...,5.026564,-1.817819,4.399984,4.35283,4.085146,5.751837,-1.817819,0.042838,0.026432,-0.487611


In [5]:
df.dtypes
df.select_dtypes(include=['int']).columns
df['tasting_category_fine'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21], dtype=int64)

In [1]:
def generate_samples_with_bad(dataframe, n_samples, bad_columns, percentile=10):
    """
    Generate new rows by drawing samples from fitted distributions, 
    with "bad" samples from specified columns.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    n_samples (int): The number of new samples (rows) to generate.
    bad_columns (list): List of column names to take "bad" samples from.
    percentile (float): Percentile to define the left tail (e.g., 10 for the 10th percentile).

    Returns:
    pd.DataFrame: A new DataFrame with generated rows.
    """
    sampled_data = {}
    
    # Determine numeric and categorical columns
    num_cols = dataframe.select_dtypes(include=['number']).columns
    cat_cols = dataframe.select_dtypes(include=['int']).columns
    
    # Sample integer values for categorical columns
    for col in dataframe.columns:
        if col in cat_cols:
            unique_values = dataframe[col].unique()
            if unique_values.any():
                sampled_data[col] = np.random.choice(unique_values, size=n_samples)
            else:
                # Handle the case where unique_int_values is empty
                sampled_data[col] = np.nan
        elif col in num_cols:
            mu, sigma = stats.norm.fit(dataframe[col])
            if col in bad_columns:
                threshold = np.percentile(dataframe[col], percentile)
                left_tail = stats.norm(loc=mu, scale=sigma).ppf(np.linspace(0, percentile/100, 100))
                # Filter out -inf and inf values
                left_tail = left_tail[np.isfinite(left_tail)]
                sampled_data[col] = np.random.choice(left_tail, size=n_samples)
            else:
                sampled_data[col] = stats.norm(loc=mu, scale=sigma).rvs(size=n_samples)
    
    return pd.DataFrame(sampled_data)


In [2]:
# Remove Beer/Beer_Id/tasting_category and overall from newly generated sample
# Why tasting category? If we don't remove it we get in trouble because we would have a dependency between ethanol and alcohol free beers
# since the category itself is determined by chemical properties we are fine with omiting it from the result
rand_df = df.drop(columns=['beer','beer_id','overall'])
# Use name of columns that perform best in our GP analysis, start with what paper people found
rand_sample = generate_samples_with_bad(rand_df, 500, bad_columns=['ethyl_acetate','protein'])
rand_sample.head(10)
rand_sample

NameError: name 'df' is not defined

In [None]:
rand_sample.to_csv('./../data/result/createdSampleData.csv', index=False)

In [None]:
plot_distributions(rand_sample,'sample')