In [None]:
import pandas as pd

# Path of the CSV file to explore (resolved relative to the working directory).
file = 'bean.csv'

# Read the file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file)

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [None]:
# Restrict the statistics to numeric columns. With pandas >= 2.0,
# DataFrame.mean() and DataFrame.cov() raise on non-numeric columns instead of
# silently dropping them (the bean data appears to carry a string class
# column -- TODO confirm against bean.csv).
df_numeric = df.select_dtypes(include="number")

# Per-column means (a Series indexed by column name).
means_array = df_numeric.mean()
# Pairwise covariance between the numeric columns (a square DataFrame).
covariance_matrix = df_numeric.cov()

In [None]:
# # prompt
# I want to use multivariate KDE for synthetic data generation,
# with each data point of the dataset as a kernel mean, a bandwidth of 1,
# and the covariance of the data used as needed.

# give me descriptive code in python with nice modularity

# the data is collected from .csv file
# 0	5.1	3.5	1.4	0.2	Iris-setosa
# 1	4.9	3.0	1.4	0.2	Iris-setosa
# 2	4.7	3.2	1.3	0.2	Iris-setosa

# This is an example of the dataset to consider.

In [16]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal

# Load CSV data
def load_data(file_path):
    """
    Read a header-less CSV file and return every column except the last one.

    The file is parsed with ``header=None``, so its first line is treated as
    data rather than as column names. The final column (expected to hold the
    class label) is dropped.

    Parameters:
        file_path (str): The path to the CSV file.

    Returns:
        numeric_data (np.ndarray): 2-D array with all columns but the last.
    """
    frame = pd.read_csv(file_path, header=None)
    # Positional slice: keep everything left of the trailing label column.
    feature_columns = frame.iloc[:, :-1]
    return feature_columns.values

# Fit KDE for data generation
def fit_kde(data, bandwidth=1.0):
    """
    Build a Gaussian-kernel density estimator and fit it to ``data``.

    Parameters:
        data (np.array): The original dataset the estimator is fitted on.
        bandwidth (float): The bandwidth parameter for KDE (default: 1.0).

    Returns:
        kde_model (KernelDensity): The fitted estimator.
    """
    estimator = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return estimator.fit(data)

# Generate synthetic data using KDE
def generate_synthetic_data(kde_model, n_samples, random_state=None):
    """
    Draw synthetic samples from a fitted KDE model.

    Parameters:
        kde_model (KernelDensity): Fitted KDE model.
        n_samples (int): Number of synthetic samples to generate.
        random_state (int, RandomState instance or None): Seed or generator
            forwarded to ``kde_model.sample``. The default ``None`` keeps the
            previous unseeded behaviour; pass an int to make the draw
            reproducible across runs.

    Returns:
        synthetic_data (np.array): Generated synthetic dataset, as returned by
            ``kde_model.sample``.
    """
    return kde_model.sample(n_samples, random_state=random_state)

# Add covariance to adjust synthetic data (optional)
def adjust_with_covariance(original_data, synthetic_data, random_state=None):
    """
    Draw samples from a multivariate normal fitted to the original data.

    The mean vector and covariance matrix are estimated from ``original_data``
    and one sample is drawn per row of ``synthetic_data``. Note that only the
    *number* of rows in ``synthetic_data`` is used; its values do not influence
    the result.

    Parameters:
        original_data (np.array): The original dataset, one observation per row.
        synthetic_data (np.array): The synthetic dataset generated from KDE;
            determines how many samples are drawn.
        random_state (int, Generator, RandomState or None): Seed or generator
            forwarded to scipy's ``rvs``. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        adjusted_synthetic_data (np.array): Array of shape
            ``(len(synthetic_data), n_features)``.

    Raises:
        Whatever ``scipy.stats.multivariate_normal`` raises when the estimated
        covariance is unusable (for example, a singular matrix).
    """
    original_data = np.asarray(original_data)
    n_samples = len(synthetic_data)

    mean_vector = np.mean(original_data, axis=0)
    cov_matrix = np.cov(original_data.T)
    n_features = np.atleast_1d(mean_vector).size

    if n_samples == 0:
        # Nothing to draw; still return a well-formed 2-D array.
        return np.empty((0, n_features))

    # Build the frozen distribution once (the covariance is factorised a
    # single time) and draw all samples in one vectorised call.
    distribution = multivariate_normal(mean=mean_vector, cov=cov_matrix)
    draws = distribution.rvs(size=n_samples, random_state=random_state)

    # scipy squeezes singleton dimensions (e.g. a single sample comes back
    # 1-D), so restore the (n_samples, n_features) layout explicitly.
    return np.reshape(draws, (n_samples, n_features))

# Function to save synthetic data to a new CSV file
def save_synthetic_data(original_df, synthetic_data, output_file, decimals=2, columns=None):
    """
    Round the synthetic samples and write them to a CSV file.

    Parameters:
        original_df: The original dataset. Not used by this function; the
            parameter is kept so existing callers keep working.
        synthetic_data (array-like): Synthetic samples, one sample per row.
        output_file (str): Path of the CSV file to write.
        decimals (int or None): Number of decimal places to round to
            (default: 2, the previously hard-coded value). Pass ``None`` to
            write the values unrounded, which matters for features whose
            magnitude is well below 0.01 and would otherwise collapse to 0.
        columns (sequence or None): Column names for the CSV header. The
            default ``None`` writes positional names (0, 1, ...) as before.

    Returns:
        None. The data is written to ``output_file`` and a confirmation
        message is printed.
    """
    values = synthetic_data if decimals is None else np.round(synthetic_data, decimals)
    synthetic_df = pd.DataFrame(values, columns=columns)
    # index=False: the row index is not written to the file.
    synthetic_df.to_csv(output_file, index=False)
    print(f"Synthetic data saved to {output_file}")
    
# Main function to combine all steps
def main(file_path, output_file, n_samples=500, bandwidth=1.0):
    """
    Run the whole pipeline: load the data, fit a KDE, sample from it and save.

    Parameters:
        file_path (str): Path to the CSV file containing the dataset.
        output_file (str): Path of the CSV file the synthetic samples are written to.
        n_samples (int): Number of synthetic samples to generate (default: 500).
        bandwidth (float): Bandwidth for KDE (default: 1.0).

    Returns:
        synthetic_data (np.array): The samples drawn from the fitted KDE model.
            The optional covariance adjustment (adjust_with_covariance) is not
            applied here.
    """
    # Numeric feature array extracted from the input file
    features = load_data(file_path)

    # Fit the density model and draw the requested number of samples from it
    samples = generate_synthetic_data(
        fit_kde(features, bandwidth=bandwidth),
        n_samples,
    )

    # Persist the raw KDE samples
    save_synthetic_data(features, samples, output_file)

    return samples

# Example usage
if __name__ == "__main__":
    # Input dataset and destination of the generated samples (modify as needed)
    input_file = "bean1.csv"
    output_file = "syn_bean1.csv"

    # n_samples and bandwidth are left at main()'s defaults
    synthetic_data = main(file_path=input_file, output_file=output_file)

    print(synthetic_data)


Synthetic data saved to syn_bean1.csv
[[3.02794678e+04 6.34238394e+02 2.13383696e+02 1.83014200e+02
  1.58987352e+00]
 [3.01426912e+04 6.20485935e+02 2.01445339e+02 1.89774873e+02
  6.84389400e-01]
 [3.02796765e+04 6.34929746e+02 2.11574578e+02 1.79963413e+02
  3.61160615e-01]
 ...
 [3.09167315e+04 6.40124544e+02 2.14552087e+02 1.83489795e+02
  1.03874732e+00]
 [3.09179663e+04 6.41143545e+02 2.13928165e+02 1.84127643e+02
  2.78876782e+00]
 [3.10907814e+04 6.37668330e+02 2.08745347e+02 1.88245753e+02
  3.76539685e-01]]
