In [None]:
import pandas as pd

file = 'bean.csv'

df = pd.read_csv(file)

In [None]:
df.head()

In [None]:
means_array = df.mean()
covariance_matrix = df.cov()

In [None]:
# # prompt
# I want to use multivariate kde for synthetic data generation 

# with mean as each number of dataset and bandwidth as 1 and use the concept of covariance as needed

# give me descriptive code in python with nice modularity

# the data is collected from .csv file
# 0	5.1	3.5	1.4	0.2	Iris-setosa
# 1	4.9	3.0	1.4	0.2	Iris-setosa
# 2	4.7	3.2	1.3	0.2	Iris-setosa

# this is the dataset to consider for e.g.

In [34]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal

# Load CSV data
def load_data(file_path):
    """
    Load the dataset from a CSV file and preprocess the numeric values.
    
    Parameters:
        file_path (str): The path to the CSV file.
        
    Returns:
        data (pd.DataFrame): A dataframe containing the numeric data only.
    """
    data = pd.read_csv(file_path, header=None)
    numeric_data = data.iloc[:, :-6].values  # Adjust based on the dataset format
    return numeric_data

from sklearn.neighbors import KernelDensity
import numpy as np
from sklearn.metrics import pairwise_distances

def fit_adaptive_kde(data, bandwidth, k=10):
    """
    Fit an Adaptive Kernel Density Estimation (KDE) model with a k-nearest neighbors approach.

    Parameters:
        data (np.array): The original dataset for KDE.
        bandwidth (float): The initial bandwidth parameter for KDE (default: 1.0).
        k (int): Number of nearest neighbors to use for adaptive bandwidth.

    Returns:
        kde_models (list): List of individual KDE models with adaptive bandwidths.
    """
    # Compute pairwise distances
    distances = pairwise_distances(data)
    
    # Calculate adaptive bandwidths using the k-nearest neighbors
    adaptive_bandwidths = []
    for i in range(data.shape[0]):
        local_bandwidth = bandwidth * np.mean(np.sort(distances[i])[:k])
        adaptive_bandwidths.append(local_bandwidth)
    
    kde_models = []
    for i, sample in enumerate(data):
        kde = KernelDensity(bandwidth=adaptive_bandwidths[i], kernel='gaussian')
        kde.fit(data)
        kde_models.append(kde)
    
    return kde_models

# Generate synthetic data using Adaptive KDE
def generate_adaptive_synthetic_data(kde_models, n_samples):
    """
    Generate synthetic data using the fitted Adaptive KDE models.
    
    Parameters:
        kde_models (list): List of individual KDE models with adaptive bandwidths.
        n_samples (int): Number of synthetic samples to generate.
        
    Returns:
        synthetic_data (np.array): Generated synthetic dataset.
    """
    synthetic_data = []
    for _ in range(n_samples):
        chosen_kde = np.random.choice(kde_models)
        synthetic_sample = chosen_kde.sample()[0]
        synthetic_data.append(synthetic_sample)
    
    return np.array(synthetic_data)

# SEKER, BARBUNYA, BOMBAY, CALI, HOROZ
# Function to save synthetic data to a new CSV file
def save_synthetic_data(original_df, synthetic_data, output_file, Class):
    """
    Save synthetic data to a new CSV file.
    
    Parameters:
        original_df (pd.DataFrame): Original dataframe for reference columns.
        synthetic_data (np.array): Synthetic data to save.
        output_file (str): File path for saving synthetic data.
    """
    synthetic_df = pd.DataFrame(np.round(synthetic_data, 2), columns=['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness'])
    synthetic_df['Class'] = Class
    synthetic_df.to_csv(output_file, index=False)
    print(f"Synthetic data saved to {output_file}")

# Main function to combine all steps
def main(file_path, output_file, Class, bandwidth=0.5, k = 5, n_samples=500):
    """
    Main function to execute the data generation process using Adaptive KDE.
    
    Parameters:
        file_path (str): Path to the CSV file containing the dataset.
        output_file (str): Path to save the synthetic data.
        n_samples (int): Number of synthetic samples to generate (default: 500).
        bandwidth (float): Initial bandwidth for KDE (default: 1.0).
        
    Returns:
        synthetic_data (np.array): Final synthetic data.
    """
    # Load and preprocess data
    df = load_data(file_path)
    
    # Fit Adaptive KDE models
    kde_models = fit_adaptive_kde(df, bandwidth=bandwidth, k = k)
    
    # Generate synthetic data
    synthetic_data = generate_adaptive_synthetic_data(kde_models, n_samples)
    
    # Save the synthetic data
    save_synthetic_data(df, synthetic_data, output_file, Class)
    
    return synthetic_data

# Example usage
if __name__ == "__main__":
    # File path of the dataset (modify as needed)
    Bandwidth = 3
    k = 5
    Classes = ["SEKER", "BARBUNYA", "BOMBAY", "CALI", "HOROZ"]
    for i, Class in enumerate(Classes):
        input_file = f"bean{i + 1}.csv"  # Dynamic input file name
        output_file = f"syn_bean_{i + 1}.csv"  # Dynamic output file name

        # Generate synthetic samples
        synthetic_data = main(input_file, output_file, Class, Bandwidth, k)

    
    print(synthetic_data)


Synthetic data saved to syn_bean_1.csv
Synthetic data saved to syn_bean_2.csv
Synthetic data saved to syn_bean_3.csv
Synthetic data saved to syn_bean_4.csv
Synthetic data saved to syn_bean_5.csv
[[ 3.57839054e+04  6.61899049e+02 -5.62470472e+02 ...  3.00480450e+02
   3.42945832e+02  3.54011270e+01]
 [ 3.60082786e+04 -5.56542353e+01  1.16177045e+02 ... -7.71956433e+02
  -8.17365984e+02 -1.27558330e+02]
 [ 3.86074754e+04  6.33719276e+02  6.17981305e+02 ... -4.49464746e+02
   5.91861198e+02  4.67766084e+02]
 ...
 [ 3.45464342e+04  2.45908161e+03 -1.49830873e+03 ...  5.46588104e+02
   2.32624382e+02 -8.76250618e+02]
 [ 3.57642686e+04  1.93957587e+03  1.19158311e+03 ...  1.50286235e+03
  -5.12414317e+02  2.84551795e+02]
 [ 3.81014126e+04  1.42020118e+03  5.42450337e+02 ... -1.35822921e+03
  -1.84740319e+03 -8.64017238e+01]]


In [35]:
import pandas as pd
import glob

# Specify the path to the CSV files
csv_files = ['syn_bean_1.csv', 'syn_bean_2.csv', 'syn_bean_3.csv', 'syn_bean_4.csv', 'syn_bean_5.csv']

# Initialize an empty DataFrame
merged_df = pd.DataFrame()

# Loop through each file and append its data (ignoring header after the first file)
for i, file in enumerate(csv_files):
    temp_df = pd.read_csv(file)
    if i == 0:
        merged_df = temp_df  # Include header for the first file
    else:
        merged_df = pd.concat([merged_df, temp_df], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('syn_bean_10.csv', index=False)

print("Files have been successfully merged into 'syn_bean_10.csv'")


Files have been successfully merged into 'syn_bean_10.csv'
