In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def data_generation(df, n_synthetic=600, scale=0.1, random_state=None):
    """Augment ``df`` with synthetic rows built by jittering sampled real rows.

    The identifier columns ``season_year``, ``Year`` and ``District`` are
    dropped when present. Each synthetic row starts from one randomly chosen
    real row: every numeric column receives Gaussian noise centred on that
    row's value with standard deviation ``scale * <column sample std>``, and
    every non-numeric column is drawn independently from the values observed
    in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    n_synthetic : int, default 600
        Number of synthetic rows to create. Zero or a negative value creates
        none.
    scale : float, default 0.1
        Noise level, as a fraction of each numeric column's standard deviation.
    random_state : None, int or numpy.random.RandomState, default None
        ``None`` draws from NumPy's global random state, so a prior
        ``np.random.seed(...)`` call still controls the result. An int seeds a
        private ``RandomState``; an existing ``RandomState`` is used as is.

    Returns
    -------
    pandas.DataFrame
        The retained original rows followed by the synthetic rows, re-indexed
        from 0.

    Raises
    ------
    ValueError
        If synthetic rows are requested but ``df`` has no rows, or if ``scale``
        is negative (raised by the normal sampler).
    """
    drop_cols = ['season_year', 'Year', 'District']
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    df_ml = df.drop(columns=[c for c in drop_cols if c in df.columns])

    n_rows = max(n_synthetic, 0)
    if n_rows and len(df_ml) == 0:
        raise ValueError("Cannot generate synthetic rows from an empty DataFrame.")

    if random_state is None:
        rng = np.random  # module-level functions share NumPy's global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    num_cols = set(df_ml.select_dtypes(include=np.number).columns)

    synthetic = {}
    if n_rows:
        # One base row (by position) per synthetic row, drawn with replacement.
        base_idx = rng.randint(0, len(df_ml), size=n_rows)
        for col in df_ml.columns:
            if col in num_cols:
                col_std = df_ml[col].std()
                # The sample std is NaN for a single row (or an all-NaN column);
                # treat that as "no spread" instead of emitting NaN everywhere.
                if pd.isna(col_std):
                    col_std = 0.0
                base_values = df_ml[col].astype(float).to_numpy()[base_idx]
                # Synthetic numeric values are always floats, including for
                # integer source columns.
                synthetic[col] = rng.normal(loc=base_values, scale=scale * col_std)
            else:
                # NOTE(review): non-numeric values are sampled from the whole
                # column, independently of the base row -- confirm that
                # decoupling them from the numeric features is intended.
                synthetic[col] = rng.choice(df_ml[col].to_numpy(), size=n_rows)

    df_synthetic = pd.DataFrame(synthetic, columns=df_ml.columns)
    print("Synthetic data shape:", df_synthetic.shape)

    if n_rows:
        df_augmented = pd.concat([df_ml, df_synthetic], ignore_index=True)
    else:
        # Nothing to append; skip concat so the original dtypes are preserved.
        df_augmented = df_ml.reset_index(drop=True)
    print("Augmented ML data shape:", df_augmented.shape)
    return df_augmented


In [3]:
# Root data directory, given as a relative path (resolved against the kernel's
# working directory). Its sub-directories are scanned for CSV files to augment.
base_folder = "data"

In [None]:
# data_generation draws from NumPy's global random state, so seeding it once
# here makes a fresh "Restart & Run All" regenerate identical output files.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

CSV_EXT = ".csv"
SYNTHETIC_SUFFIX = "_synthetic.csv"

# os.listdir returns entries in arbitrary order; sorting fixes the processing
# order, which the seeded random stream depends on.
for approach in sorted(os.listdir(base_folder)):
    approach_path = os.path.join(base_folder, approach)
    if not os.path.isdir(approach_path):
        continue

    for file_name in sorted(os.listdir(approach_path)):
        if not file_name.endswith(CSV_EXT):
            continue
        # Skip files written by a previous run so re-running never augments
        # already-augmented data.
        if file_name.endswith(SYNTHETIC_SUFFIX):
            continue

        file_path = os.path.join(approach_path, file_name)

        df = pd.read_csv(file_path)

        df_augmented = data_generation(df, n_synthetic=600, scale=0.1)

        # Swap only the trailing extension: str.replace would rewrite every
        # ".csv" occurrence inside the file name.
        synth_file_name = file_name[:-len(CSV_EXT)] + SYNTHETIC_SUFFIX
        synth_file_path = os.path.join(approach_path, synth_file_name)

        # The saved file holds the original rows plus the synthetic ones.
        df_augmented.to_csv(synth_file_path, index=False)
        print(f"Saved synthetic data: {synth_file_path}")


Synthetic data shape: (600, 13)
Augmented ML data shape: (700, 13)
Saved synthetic data: data\approach_1\knn_imputation_synthetic.csv
Synthetic data shape: (600, 13)
Augmented ML data shape: (700, 13)
Saved synthetic data: data\approach_1\regression_imputation_synthetic.csv
Synthetic data shape: (600, 25)
Augmented ML data shape: (700, 25)
Saved synthetic data: data\approach_2\knn_imputation_synthetic.csv
Synthetic data shape: (600, 25)
Augmented ML data shape: (700, 25)
Saved synthetic data: data\approach_2\regression_imputation_synthetic.csv
