In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

# Purpose: build a class-balanced copy of the dataset in which every 'Crop'
# class has exactly TARGET_PER_CLASS rows. Under-represented classes are
# topped up with SMOTE-generated synthetic rows; over-represented classes are
# randomly capped. Results: `balanced_df` (SMOTE output) and `final_df`
# (capped + shuffled).

# Settings a reader might want to tune, kept in one place.
DATA_PATH = "/content/medicinal_plants_100k_dataset.csv"
LABEL_COL = 'Crop'
TARGET_PER_CLASS = 10000
SEED = 42

# Step 1: Load dataset
df = pd.read_csv(DATA_PATH)

# Step 2: Separate features and labels
# NOTE(review): SMOTE interpolates between rows, so every remaining feature
# column is assumed to be numeric — confirm against the CSV schema.
X = df.drop(LABEL_COL, axis=1)
y = df[LABEL_COL]

# Step 3: Encode class labels as integers (decoded again in Step 6)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 4: Ask SMOTE to grow every class that is below the target up to it.
# Per imbalanced-learn's dict semantics, classes missing from the dict are
# not resampled, so classes already at/above the target pass through as-is.
unique_classes, class_counts = np.unique(y_encoded, return_counts=True)
sampling_strategy = {
    label: TARGET_PER_CLASS
    for label, count in zip(unique_classes, class_counts)
    if count < TARGET_PER_CLASS
}

# Step 5: Apply SMOTE
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=SEED)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Step 6: Convert back to class names
y_resampled_labels = le.inverse_transform(y_resampled)

# Step 7: Combine into DataFrame
balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
balanced_df[LABEL_COL] = y_resampled_labels

# Step 8: Shuffle, then cap each class at the target.
# The previous "add back original rows" loop computed
# `10000 - sampling_strategy[class_id]`, which is always 0 (every dict value
# is the target), so it never added a row; it has been removed. What was
# actually missing was a cap for classes that start ABOVE the target, which
# SMOTE leaves untouched. Shuffling first makes `head()` keep a random subset
# of such classes; `head()` preserves the shuffled row order, so when no class
# exceeds the target this yields the same rows in the same order as a plain
# shuffle.
final_df = (
    balanced_df
    .sample(frac=1, random_state=SEED)
    .groupby(LABEL_COL, sort=False)
    .head(TARGET_PER_CLASS)
    .reset_index(drop=True)
)

# Step 9: Check result
class_sizes = final_df[LABEL_COL].value_counts()
assert (class_sizes == TARGET_PER_CLASS).all(), (
    f"Expected {TARGET_PER_CLASS} rows per class, got:\n{class_sizes}"
)
print("Balanced Dataset Shape:", final_df.shape)
print("\nBalanced Class Distribution:\n", class_sizes)


Balanced Dataset Shape: (100000, 7)

Balanced Class Distribution:
 Crop
Ashwagandha    10000
Neem           10000
Fenugreek      10000
Lemongrass     10000
Tulsi          10000
Chamomile      10000
Turmeric       10000
Moringa        10000
Peppermint     10000
Aloe Vera      10000
Name: count, dtype: int64
