In [28]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

# Load the compiled dataset; the first CSV column is used as the row index.
compiled = pd.read_csv('compiled.csv', index_col=0)

# Separate features and target variable
X = compiled.drop(columns='target')
y = compiled['target']

# Number of rows per class held out for testing and kept for training
test_size_per_class = 4
train_size_per_class = 8

# Positional (iloc) row numbers collected for each split
test_indices = []
train_indices = []

# Per class, the first rows go to the test split and the last rows go to the
# training split. Rows in between are used by neither split.
for class_label in np.unique(y):
    class_indices = np.flatnonzero((y == class_label).to_numpy())

    # Draw training rows only from what is left after the test slice, so a
    # small class can never contribute the same row to both splits (leakage).
    remaining_indices = class_indices[test_size_per_class:]
    if len(remaining_indices) < train_size_per_class:
        raise ValueError(
            f"Class {class_label!r} has {len(class_indices)} rows; at least "
            f"{test_size_per_class + train_size_per_class} are needed for "
            "disjoint test/train splits."
        )

    test_indices.extend(class_indices[:test_size_per_class])
    train_indices.extend(remaining_indices[-train_size_per_class:])

# Split the dataset into test and train using the positional indices
X_test = X.iloc[test_indices]
y_test = y.iloc[test_indices]
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]

# Apply SMOTE to the training data only.
# Every class present in y_train is oversampled to the same target count, and
# a fixed random_state makes the synthetic rows reproducible across runs.
train_samples_per_class = 1000
smote = SMOTE(
    sampling_strategy={
        class_label: train_samples_per_class for class_label in np.unique(y_train)
    },
    random_state=42,
)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Save oversampled data (features first, target as the last column).
# NOTE(review): this file is written WITH the index column, while the test
# file is written with index=False -- confirm which layout readers expect.
oversampled_data_train = pd.concat([pd.DataFrame(X_train_resampled), pd.DataFrame(y_train_resampled)], axis=1)
oversampled_data_train.to_csv('oversampled_train.csv')

# Check class distribution
print("Class distribution of oversampled training data:")
print(oversampled_data_train['target'].value_counts())


Class distribution of oversampled training data:
target
0    1000
1    1000
2    1000
3    1000
Name: count, dtype: int64


In [32]:
# Apply SMOTE to the test data.
# NOTE(review): oversampling the test split means evaluation runs mostly on
# synthetic rows interpolated from the few real test rows per class, which can
# make reported metrics optimistic -- confirm this is intended.
# k_neighbors=3 is below SMOTE's default of 5, presumably because each class
# has only a handful of rows in the test split -- TODO confirm.
# A fixed random_state makes the synthetic rows reproducible across runs.
test_samples_per_class = 300
smote = SMOTE(
    sampling_strategy={
        class_label: test_samples_per_class for class_label in np.unique(y_test)
    },
    k_neighbors=3,
    random_state=42,
)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test, y_test)

# Save oversampled test data (features first, target as the last column)
oversampled_data_test = pd.concat([pd.DataFrame(X_test_resampled), pd.DataFrame(y_test_resampled)], axis=1)
oversampled_data_test.to_csv('oversampled_test.csv', index=False)

# Check class distribution
print("Class distribution of oversampled test data:")
print(oversampled_data_test['target'].value_counts())


Class distribution of oversampled test data:
target
0    300
1    300
2    300
3    300
Name: count, dtype: int64
