<a href="https://colab.research.google.com/github/GreenOrange44/Insurance-Fraud-Detection/blob/main/DataSet/DataAugmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the input table from disk.
df = pd.read_csv('augmented_dataset3.csv')

# Split the frame into the label column and the remaining feature columns.
y = df['fraud_reported']
X = df.drop(columns=['fraud_reported'])

# Feature augmentation functions
def add_noise(data, columns, noise_level=0.05, random_state=None):
    """
    Add zero-mean Gaussian noise to numerical columns.

    For each listed column the noise standard deviation is
    ``noise_level * data[column].std()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to perturb. It is modified IN PLACE and also returned, so pass
        ``data.copy()`` if the caller needs to keep the untouched values.
    columns : iterable of str
        Names of the numeric columns to perturb.
    noise_level : float, default 0.05
        Noise standard deviation as a fraction of each column's own std.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) used for the noise. ``None`` keeps the legacy
        behaviour of drawing from NumPy's global RNG (``np.random.seed``).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # The module itself exposes ``normal``, so it doubles as the legacy RNG.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    for column in columns:
        spread = data[column].std()
        # std() is NaN with fewer than two non-null values; a NaN scale would
        # make every draw NaN and wipe out the column, so leave it untouched.
        if pd.isna(spread):
            continue
        noise = rng.normal(0, noise_level * spread, len(data))
        data[column] = data[column] + noise
    return data

def augment_categorical(data, column, values_to_swap, swap_prob=0.1, random_state=None):
    """
    Randomly replace entries of one column with values from ``values_to_swap``.

    Each row is independently replaced with probability ``swap_prob``; the
    replacement is drawn uniformly from ``values_to_swap`` (which may yield
    the value the row already had).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. The column is overwritten IN PLACE and the same
        frame is returned.
    column : str
        Name of the column to augment.
    values_to_swap : sequence
        Candidate replacement values.
    swap_prob : float, default 0.1
        Per-row probability of replacing the value.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the draws. ``None`` keeps the legacy
        behaviour of using NumPy's global RNG (``np.random.seed``).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # The module itself exposes ``random``/``choice``, so it doubles as the
    # legacy RNG and consumes the global stream as before.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    def _maybe_swap(value):
        # Coin flip first, replacement draw only on success.
        return rng.choice(values_to_swap) if rng.random() < swap_prob else value

    series = data[column]
    # On a Categorical column pandas applies the function to the categories
    # rather than to each row, so whole categories would be swapped together.
    # Casting to object restores the per-row decision.
    if isinstance(series.dtype, pd.CategoricalDtype):
        series = series.astype(object)

    data[column] = series.apply(_maybe_swap)
    return data

# Numeric columns that receive Gaussian jitter.
numerical_features = ['age', 'policy_annual_premium', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim']

# Seed NumPy's global RNG so the noise, and therefore the saved CSV, is
# reproducible from a fresh kernel.
np.random.seed(42)
df = add_noise(df, numerical_features)

# Derive features/target AFTER the noise step. Taking them from the frame as
# it was before the noise would make SMOTE resample the un-noised rows, and
# the merge below would then keep every original row twice (once noised, once
# clean) because drop_duplicates() could not match them.
X = df.drop(columns=['fraud_reported'])
y = df['fraud_reported']

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine oversampled data back into a DataFrame
augmented_data = pd.DataFrame(X_resampled, columns=X.columns)
augmented_data['fraud_reported'] = y_resampled

# Merge the noised dataset with the resampled data. The resampled set is
# expected to still contain the rows it was fitted on, so drop_duplicates()
# collapses those repeats and only the synthetic rows are added.
final_dataset = pd.concat([df, augmented_data]).drop_duplicates().reset_index(drop=True)

# Save the augmented dataset to a file
final_dataset.to_csv('augmented_dataset4.csv', index=False)

print("Original dataset size:", len(df))
print("Augmented dataset size:", len(final_dataset))

Original dataset size: 5518
Augmented dataset size: 11542
