In [None]:
Balancing a Dataset with Downsampling 
Imagine we have a dataset for a binary classification task where the class labels are imbalanced, and we want to downsample the majority class to balance the dataset. 

In [8]:
import pandas as pd 
from sklearn.utils import resample 
# Toy dataset for the resampling demos: 13 people with an age, an income
# and a binary "Class" label (7 rows are "High", 6 rows are "Low").
df = pd.DataFrame(
    {
        "Age": [22, 25, 27, 28, 30, 35, 46, 45, 50, 55, 60, 65, 70],
        "Income": [2000, 2500, 2780, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
        "Class": [
            "High", "Low", "Low", "High", "High", "Low", "High",
            "High", "Low", "Low", "High", "High", "Low",
        ],
    }
)


In [None]:
The High class has 7 instances and the Low class has 6, so High is the majority class and will be downsampled to 6 rows.

In [25]:
# Split the frame by label: "High" is the majority class, "Low" the minority.
df_high = df[df["Class"].eq("High")]
df_low = df[df["Class"].eq("Low")]

# Print the minority rows first, then the majority rows after a blank line.
print(df_low)
print("\n", df_high)

    Age  Income Class
1    25    2500   Low
2    27    2780   Low
5    35    3800   Low
8    50    4300   Low
9    55    4500   Low
12   70    6000   Low

     Age  Income Class
0    22    2000  High
3    28    3200  High
4    30    3500  High
6    46    4000  High
7    45    4200  High
10   60    5000  High
11   65    5500  High


In [11]:
# Downsample the majority class: draw len(df_low) "High" rows without
# replacement, with a fixed seed so the same rows are picked on every run.
df_high_downsampled = resample(
    df_high,
    n_samples=len(df_low),
    replace=False,
    random_state=42,
)


In [14]:
# Stack the downsampled majority rows on top of the untouched minority rows.
# The original row labels are kept, so the index is neither reset nor shuffled.
df_balanced = pd.concat((df_high_downsampled, df_low), axis=0)

In [15]:
# Sanity check: count the rows per class in the balanced frame.
print(df_balanced["Class"].value_counts())

Class
High    6
Low     6
Name: count, dtype: int64


In [26]:
# Display the balanced table: the downsampled "High" rows followed by all
# "Low" rows. The label says "Balanced" because the frame holds both classes.
print("Balanced Table:")
df_balanced

High Class Table:


Unnamed: 0,Age,Income,Class
0,22,2000,High
3,28,3200,High
10,60,5000,High
4,30,3500,High
7,45,4200,High
6,46,4000,High
1,25,2500,Low
2,27,2780,Low
5,35,3800,Low
8,50,4300,Low


In [27]:
# Show the "High" rows that were kept by the downsampling draw.
df_high_downsampled

Unnamed: 0,Age,Income,Class
0,22,2000,High
3,28,3200,High
10,60,5000,High
4,30,3500,High
7,45,4200,High
6,46,4000,High


In [30]:
# Split the frame by label: "High" is treated as the majority class and
# "Low" as the minority class.
df_majority = df.loc[df["Class"].eq("High")]
df_minority = df.loc[df["Class"].eq("Low")]

# Upsample the minority class: draw rows with replacement until there are as
# many as in the majority class (so some minority rows appear more than once).
df_minority_upsampled = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=42,
)

# Stack both classes, shuffle the rows with a fixed seed, and renumber the
# index from 0 so the duplicated row labels disappear.
df_upsampled = (
    pd.concat([df_majority, df_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Display only the resampled minority rows for this cell.
df_minority_upsampled

Unnamed: 0,Age,Income,Class
8,50,4300,Low
9,55,4500,Low
5,35,3800,Low
9,55,4500,Low
9,55,4500,Low
2,27,2780,Low
5,35,3800,Low


In [31]:
# Show the shuffled, re-indexed frame that combines the majority rows with the upsampled minority rows.
df_upsampled

Unnamed: 0,Age,Income,Class
0,35,3800,Low
1,55,4500,Low
2,22,2000,High
3,27,2780,Low
4,60,5000,High
5,55,4500,Low
6,30,3500,High
7,28,3200,High
8,35,3800,Low
9,45,4200,High


In [36]:
pip install imbalanced-learn




In [None]:
pip uninstall scikit-learn imbalanced-learn -y

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Toy dataset: 13 rows, of which 9 are labelled 'Majority' and 4 'Minority'.
df = pd.DataFrame({
    "Age": [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    "Income": [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    "Class": [
        "Minority", "Majority", "Majority", "Majority", "Majority",
        "Minority", "Minority", "Minority", "Majority", "Majority",
        "Majority", "Majority", "Majority",
    ],
})

# Step 1: encode the string labels as integers (Majority -> 0, Minority -> 1).
# This overwrites the 'Class' column of df.
df["Class"] = df["Class"].map({"Majority": 0, "Minority": 1})

# Step 2: target vector (y) and feature matrix (X).
y = df["Class"]
X = df[["Age", "Income"]]

# Step 3: oversample with SMOTE. k_neighbors is lowered from the default 5 to 3
# because the minority class has only 4 rows, too few for 5 neighbours.
# random_state fixes the synthetic samples so the cell is reproducible.
smote = SMOTE(k_neighbors=3, random_state=42, sampling_strategy="auto")
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: decode the integer labels back to their original strings.
y_resampled = y_resampled.map({0: "Majority", 1: "Minority"})

# Step 5: put the resampled features and labels side by side in one frame.
df_balanced = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=["Age", "Income"]),
        pd.DataFrame(y_resampled, columns=["Class"]),
    ],
    axis=1,
)

# Report the per-class row counts, then the full balanced dataset.
print(df_balanced["Class"].value_counts())
print(df_balanced)
