In [5]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# ---------------------------------------------------------------------------
# Balance the training set by random oversampling of the minority rows.
#
# Steps: load the training CSV, split rows into the most frequent label
# ("majority") and everything else ("minority"), draw minority rows with
# replacement until both groups have the same size, record how many times
# each original row ends up in the result ('sample_times'), write an
# ID / label / sample_times table to disk and print a short summary.
# ---------------------------------------------------------------------------

# Configuration: everything a reader might need to change lives here.
# NOTE: TRAIN_CSV is an absolute, machine-specific path; adjust it when
# running this cell on another machine.
TRAIN_CSV = r"C:\Users\acer\Desktop\Data_2D3layers\train_classification.csv"
OUTPUT_CSV = 'oversampled_training_data.csv'
LABEL_COL = 'OA_normal_Remodel'
ID_COL = 'ID'
RANDOM_STATE = 42  # fixed seed so the drawn duplicates are reproducible

# Read the CSV file containing the training set
df_train = pd.read_csv(TRAIN_CSV)

# Separate majority and minority classes.
# The most frequent label is looked up once and reused for both selections;
# every row whose label differs from it is pooled into the minority frame.
majority_label = df_train[LABEL_COL].value_counts().idxmax()
df_majority = df_train[df_train[LABEL_COL] == majority_label]
df_minority = df_train[df_train[LABEL_COL] != majority_label]

# Calculate the number of samples to add
n_samples = len(df_majority) - len(df_minority)

# Fail early with a readable message instead of an obscure error from the
# random number generator inside resample().
if df_minority.empty:
    raise ValueError(
        f"Column {LABEL_COL!r} contains a single class; "
        "there are no minority rows to oversample."
    )
if n_samples < 0:
    # Only reachable with more than two labels: the pooled non-majority rows
    # already outnumber the majority class, so there is nothing to draw.
    raise ValueError(
        f"Rows not labelled {majority_label!r} outnumber the majority class "
        f"by {-n_samples}; this cell assumes a two-class label."
    )

# Oversample minority class with replacement
if n_samples > 0:
    df_minority_oversampled = resample(
        df_minority,
        replace=True,               # sample with replacement
        n_samples=n_samples,        # to match majority class
        random_state=RANDOM_STATE,  # reproducible results
    )
else:
    # Classes are already balanced: add no rows, but keep the variable
    # defined (as an empty frame with the same columns).
    df_minority_oversampled = df_minority.iloc[0:0]

# Combine majority class with oversampled minority class
df_oversampled = pd.concat([df_majority, df_minority, df_minority_oversampled])

# Calculate 'sample_times' for each row.
# Rows drawn by resample() keep the index label of the row they were copied
# from, so counting index labels gives the number of occurrences of each
# original row. The counts are mapped back positionally, which keeps the row
# order and index labels and does not depend on the index name.
occurrences = df_oversampled.index.value_counts()
df_oversampled = df_oversampled.assign(
    sample_times=df_oversampled.index.map(occurrences).to_numpy()
)

# Sort the dataframe to group duplicate samples together
df_oversampled = df_oversampled.sort_values(ID_COL)

# Save the oversampled data (identifier, label and occurrence count only)
df_output = df_oversampled[[ID_COL, LABEL_COL, 'sample_times']]
df_output.to_csv(OUTPUT_CSV, index=False)

# Print summary information
print("Original class distribution:")
print(df_train[LABEL_COL].value_counts(normalize=True))

print("\nOversampled class distribution:")
print(df_oversampled[LABEL_COL].value_counts(normalize=True))

print("\nFirst few rows of the oversampled data:")
print(df_oversampled.head(10))

print(f"\nOversampled data saved as '{OUTPUT_CSV}'")

Original class distribution:
OA_normal_Remodel
1    0.602362
0    0.397638
Name: proportion, dtype: float64

Oversampled class distribution:
OA_normal_Remodel
1    0.5
0    0.5
Name: proportion, dtype: float64

First few rows of the oversampled data:
                 ID  OA_normal_Remodel  c_erosion  c_subCyst  c_genSclerosis  \
22       47-16872 L                  1          1          0               0   
65       47-16872 R                  1          1          1               1   
121      47-22136 L                  1          1          0               0   
7        47-22136 R                  1          1          0               0   
16   47-4881 L 2014                  1          1          1               0   
208  47-4881 L 2018                  1          1          1               0   
76   47-4881 R 2018                  1          1          1               0   
197      48-26453 L                  0          0          0               0   
37        48-5955 R          