In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import csv
from scipy.stats import multivariate_normal
from sklearn.neighbors import KernelDensity
# define data paths
file1 = './../data/41467_2024_46346_MOESM4_ESM.csv'
file2 = './../data/41467_2024_46346_MOESM7_ESM.csv'

# read chemical data and merge it with only the overall score
chemData = pd.read_csv(file1)
dataWithOverall = pd.read_csv(file2)
overallAndId = dataWithOverall[['beer','overall']]

# now merge and remove duplicates
merged_df = chemData.merge(overallAndId, on = ['beer'], how='inner')
# Convert string in tasting_category_fine using factorize
merged_df['tasting_category_fine'], category_mapping = pd.factorize(merged_df['tasting_category_fine'])

# Remove NAs
na_counts_per_column = merged_df.isna().sum()
na_counts_per_column = na_counts_per_column[na_counts_per_column > 0]
threshold = 30
# Filter to show only columns with NA counts greater than the threshold
columns_to_drop = na_counts_per_column[na_counts_per_column > threshold].index
# Drop the columns with NA counts greater than the threshold
merged_df_withoutNa = merged_df.drop(columns=columns_to_drop)
merged_df_withoutNa = merged_df_withoutNa.fillna(0)
# drop all 4 SUM columns since we are only interested in the chems themselves
df = merged_df_withoutNa[merged_df_withoutNa.columns.drop(list(merged_df_withoutNa.filter(regex='_sum')))]
df.head(10)

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,10,5410228202929,0,-0.641741,-2.146292,-1.905267,0.046211,-3.19552,-0.59935,-1.975309,...,-1.817819,-1.817819,3.702195,4.360243,-1.817819,5.721204,4.438951,0.014728,0.054353,-0.467852
1,86,5410783031019,0,-0.174824,-2.146292,-0.257098,0.347241,-3.19552,-0.507254,-1.75597,...,3.576605,-1.817819,4.133651,4.512589,4.323292,5.810501,4.247826,0.023167,0.0014,-0.994806
2,2,5410228142003,0,0.511067,-2.146292,-1.905267,0.898604,-3.19552,-0.134514,-1.893639,...,-1.817819,-1.817819,3.96673,4.394267,-1.817819,5.585529,3.598293,0.017571,0.000699,-1.528544
3,83,54055520,0,0.249793,-2.146292,-1.905267,0.782126,-3.19552,-0.233874,-2.086981,...,-1.817819,-1.817819,3.812901,4.174304,4.088807,5.204583,4.498781,0.016959,0.001101,-0.934138
4,40,5410228141181,0,-1.902829,-0.756987,-1.34727,0.553356,-3.19552,0.207751,-1.713095,...,-1.817819,6.484789,1.618481,4.742899,4.651536,6.515499,4.78922,0.117921,0.000492,-1.735965
5,247,5411081000264,1,0.404802,-2.146292,-1.905267,2.108738,-2.850781,0.659248,-2.161466,...,3.55243,-1.817819,3.905719,-1.817819,4.323532,5.908192,5.322008,0.023274,0.049058,-0.800698
6,48,8711406995211,2,0.891771,-2.146292,0.028093,1.399666,-2.512155,1.577115,-0.704521,...,5.252547,6.207206,4.051992,3.271554,4.647266,5.923461,5.228784,0.017974,0.00997,-0.259324
7,165,5411081000363,1,-0.243322,-2.146292,-0.556487,0.81955,-2.367037,0.465608,-2.67469,...,4.26816,-1.817819,4.048183,-1.817819,4.693955,5.858582,5.262941,0.022817,0.071466,-0.25089
8,124,5412186002658,3,0.08773,-2.146292,-0.33636,1.155214,-3.19552,0.949666,-1.944622,...,3.016781,5.598731,2.101812,4.129525,-1.817819,5.889627,-1.817819,0.034472,0.00118,0.230788
9,207,5411223030036,1,0.567943,-2.146292,-0.521224,1.085299,-0.539583,0.78646,-2.195861,...,3.853942,-1.817819,4.059355,-1.817819,4.724163,6.081144,4.399498,0.021659,0.513474,-0.677687


In [3]:
# Generate new data using kernel density
def generate_new_data(df):
    """
    Generate new rows by sampling from the estimated statistical distribution of each column in the input dataframe.
    
    Parameters:
    - df (pandas.DataFrame): Input dataframe containing the original data.
    
    Returns:
    - new_df (pandas.DataFrame): New dataframe containing the generated data.
    """
    # Create a new dataframe to store the sampled rows
    new_rows = pd.DataFrame()

    # Loop through each column in the original dataframe
    for col in df.columns:
        # Get the values of the current column
        values = df[col].values.reshape(-1, 1)  # Reshape to 2D array for KDE
        
        # Fit kernel density estimation to the data
        kde = KernelDensity(bandwidth=0.5).fit(values)
        
        # Generate new samples from the estimated distribution
        new_samples = kde.sample(n_samples=len(df), random_state=42)
        
        # Add the new samples as a new column in the new dataframe
        new_rows[col] = new_samples.flatten()

    # Concatenate the new rows with the original dataframe
    #new_df = pd.concat([df, new_rows], axis=0, ignore_index=True)
    
    return new_rows

In [4]:
new_rows = generate_new_data(df)

  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()
  new_rows[col] = new_samples.flatten()


NameError: name 'plot_distributions' is not defined

In [6]:
new_rows
new_rows.to_csv('./../data/result/createKernelDensitySample.csv', index=False)

In [7]:
# Try Monte Carlo instead
def generate_new_data_from_distribution(df, num_samples):
    """
    Generate new rows of data by sampling from the underlying distribution of each column in the dataframe.
    
    Parameters:
    - df (pandas.DataFrame): Input dataframe containing the original data.
    - num_samples (int): Number of new samples to generate.
    
    Returns:
    - new_df (pandas.DataFrame): New dataframe containing the generated data.
    """
    # Create a new dataframe to store the sampled rows
    new_rows = pd.DataFrame()
    
    # Loop through each column in the original dataframe
    for col in df.columns:
        # Get the values of the current column
        values = df[col].values
        
        # Fit a probability distribution to the observed data
        distribution = getattr(stats, 'norm')  # Example: Using normal distribution as default
        params = distribution.fit(values)
        
        # Generate new samples from the fitted distribution
        new_samples = distribution.rvs(*params, size=num_samples)
        
        # Add the new samples as a new column in the new dataframe
        new_rows[col] = new_samples
    
    # Concatenate the new rows with the original dataframe
    #new_df = pd.concat([df, new_rows], axis=0, ignore_index=True)
    
    return new_rows

In [8]:
mc_rows = generate_new_data_from_distribution(df,500)
mc_rows

  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows[col] = new_samples
  new_rows

Unnamed: 0,beer,beer_id,tasting_category_fine,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,...,Z.Z.geranyl.linalool,ethyl.hexadecanoate,manool.oxide,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,overall
0,113.158120,8.410472e+12,9.518176,0.298784,-1.464338,0.481151,1.653846,-1.703699,1.202377,-2.239670,...,2.799881,7.895529,1.788743,5.124827,-3.254952,4.358251,3.509852,-0.002831,0.169133,-0.379187
1,118.477482,7.706132e+12,12.213349,1.693762,-2.142312,-1.117665,1.950889,-1.559551,1.240875,-0.533743,...,3.243427,2.817981,5.233122,3.416353,2.404708,4.695845,6.700816,0.058973,-0.130844,-0.007405
2,60.291681,5.986687e+12,11.375408,0.286562,-1.447031,-0.445331,0.932533,-2.245437,1.347207,-1.622062,...,5.202167,8.846741,3.451514,4.409753,0.433969,6.315290,4.377753,-0.073077,0.091565,0.630644
3,26.151242,1.882828e+12,10.425221,0.575610,-2.593067,-1.144784,1.798504,-0.887378,1.017685,-0.971922,...,7.295512,3.422414,3.048579,3.098065,2.175474,4.558619,4.158859,0.198247,0.416252,-0.662732
4,96.571812,7.796224e+12,23.118746,0.277742,-1.558221,0.174097,1.286238,-2.644656,1.515269,-1.072783,...,6.583631,4.168334,3.550809,4.730271,1.678686,8.139660,5.335777,0.134740,0.031978,-1.253211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,153.066346,3.331419e+12,15.005077,0.691854,-1.899449,-1.108619,1.632715,-3.224013,1.125368,-1.229101,...,-0.355302,7.208169,5.588225,-0.113546,4.675987,6.525921,1.315712,0.026712,-0.098456,-0.056171
96,78.405549,3.717280e+12,14.020766,0.101838,-2.023221,-0.665054,1.798270,-2.497219,1.868625,-0.802227,...,5.908280,5.322826,5.225690,2.816560,-0.607739,7.525175,6.553206,0.209437,0.348091,-0.987678
97,135.187473,9.703344e+12,9.205144,0.862601,-1.205466,-0.243595,1.147652,-2.334360,0.724135,-0.763826,...,5.824992,1.206991,2.825617,2.878796,-0.643219,6.991408,5.305906,0.171343,0.146478,0.299023
98,201.318445,1.563125e+12,5.527834,0.545384,-2.233563,0.280094,1.109732,-1.581173,1.407293,-1.769940,...,3.747823,6.016641,5.780189,4.950863,0.619852,8.712207,4.234013,-0.026411,0.088869,-0.646683


In [None]:
mc_rows.to_csv('./../data/result/createMonteCarloSample.csv', index=False)