In [11]:
import pandas as pd
import numpy as np

# Read the CSV file
def read_csv(file_path):
    """Load the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Define augmentation techniques with increased variations
def add_noise(df, noise_level=0.05):
    """Return a copy of ``df`` with Gaussian noise added to its numeric columns.

    For every numeric column, one noise value per row is drawn from a normal
    distribution with mean 0 and standard deviation ``noise_level`` (using
    NumPy's global random state) and added to the column. Non-numeric columns
    are copied unchanged, and the input frame is not modified.
    """
    result = df.copy()
    numeric_names = result.select_dtypes(include=np.number).columns
    for name in numeric_names:
        column = result[name]
        jitter = np.random.normal(0, noise_level, size=column.shape)
        result[name] = column + jitter
    return result

def scale_data(df, scale_factor=1.5):
    """Return a copy of ``df`` with every numeric column multiplied by ``scale_factor``.

    Non-numeric columns are copied unchanged; the input frame is not modified.
    """
    result = df.copy()
    numeric_names = result.select_dtypes(include=np.number).columns
    for name in numeric_names:
        result[name] = result[name] * scale_factor
    return result

def shuffle_data(df):
    """Return all rows of ``df`` in random order with a fresh 0..n-1 index.

    The permutation comes from ``DataFrame.sample`` with no explicit
    ``random_state``.
    """
    permuted = df.sample(frac=1)
    renumbered = permuted.reset_index(drop=True)
    return renumbered

def drop_columns(df, fraction=0.2):
    """Return a copy of ``df`` with a random subset of its columns removed.

    The number of columns removed is ``int(len(df.columns) * fraction)``; they
    are chosen without replacement via ``np.random.choice``. The input frame
    is not modified.
    """
    n_to_drop = int(len(df.columns) * fraction)
    victims = np.random.choice(df.columns, size=n_to_drop, replace=False)
    # Drop from an explicit copy so the result never shares data with ``df``.
    return df.copy().drop(columns=victims)

def duplicate_rows(df, factor=3):
    """Return ``df`` stacked on itself ``factor`` times with a fresh 0..n-1 index."""
    copies = [df for _ in range(factor)]
    stacked = pd.concat(copies, ignore_index=True)
    return stacked

# Apply augmentations
def augment_data(df, num_augmentations=3):
    """Build an augmented dataset by stacking several transformed copies of ``df``.

    One "round" applies each technique to the original ``df`` in this order:
    ``add_noise``, ``scale_data``, ``shuffle_data``, ``drop_columns``,
    ``duplicate_rows``. One round is always run, followed by
    ``num_augmentations`` additional rounds, and all resulting frames are
    concatenated with a fresh index.

    Args:
        df: The source DataFrame; it is not modified.
        num_augmentations: Number of extra rounds after the first one.
            Defaults to 3, matching the previously hard-coded value.

    Returns:
        A single DataFrame holding every augmented frame, row-stacked.

    Raises:
        ValueError: If ``num_augmentations`` is negative.
    """
    if num_augmentations < 0:
        raise ValueError("num_augmentations must be non-negative")

    # Order matters: the random techniques consume NumPy's global random
    # state in sequence, so this order keeps results reproducible under a seed.
    techniques = (add_noise, scale_data, shuffle_data, drop_columns, duplicate_rows)

    # NOTE(review): scale_data and duplicate_rows are deterministic, so every
    # extra round adds identical copies of their output. Frames produced by
    # drop_columns lack some columns; pd.concat's default outer join leaves
    # those cells missing in the combined frame.
    augmented_dfs = []
    for _ in range(1 + num_augmentations):
        augmented_dfs.extend(technique(df) for technique in techniques)

    return pd.concat(augmented_dfs, ignore_index=True)

# Save the augmented data to a new CSV file
def save_csv(df, file_path):
    """Write ``df`` to ``file_path`` in CSV format, leaving out the index."""
    df.to_csv(path_or_buf=file_path, index=False)

# Example usage
# Input CSV path, resolved relative to the current working directory.
file_path = 'loan_approval_dataset (1).csv'
df = read_csv(file_path)
# Stack all augmented variants of the original frame into one DataFrame.
augmented_df = augment_data(df)
# Write the combined frame out without the index column.
save_csv(augmented_df, 'augmented_data.csv')


In [7]:
# Reload the original dataset and display its (rows, columns) shape.
loan_data=pd.read_csv('loan_approval_dataset (1).csv')
loan_data.shape

(4269, 13)

In [8]:
# Read back the augmented CSV and display its (rows, columns) shape.
# NOTE(review): this assignment rebinds the name `augment_data`, which is also
# the name of the augmentation function defined earlier in this file. After
# this cell runs, the function is shadowed until its defining cell is
# re-executed; consider a different variable name.
augment_data=pd.read_csv('augmented_data.csv')
augment_data.shape

(119532, 13)