In [16]:
import pandas as pd
from sklearn.utils import resample

In [17]:
# Toy dataset: 13 records, each with an age, an income and a High/Low label
ages = [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70]
incomes = [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000]
labels = [
    'High', 'Low', 'Low', 'High', 'High', 'Low', 'High',
    'High', 'Low', 'Low', 'High', 'High', 'Low',
]

df = pd.DataFrame({'Age': ages, 'Income': incomes, 'Class': labels})
df

Unnamed: 0,Age,Income,Class
0,22,2000,High
1,25,2500,Low
2,27,2700,Low
3,28,3200,High
4,30,3500,High
5,35,3800,Low
6,40,4000,High
7,45,4200,High
8,50,4300,Low
9,55,4500,Low


The High class has 7 instances.

The Low class has 6 instances.

In [18]:
# Split the rows by label: 'High' is the majority class (7 rows) and
# 'Low' is the minority class (6 rows)
high_mask = df['Class'].eq('High')
low_mask = df['Class'].eq('Low')

df_high = df.loc[high_mask]
df_low = df.loc[low_mask]

**Downsampling**

In [19]:
# Shrink the majority ('High') class to the size of the minority ('Low')
# class by drawing rows without replacement; the fixed seed keeps the
# draw reproducible
n_minority = len(df_low)
df_high_downsampled = resample(
    df_high,
    n_samples=n_minority,
    replace=False,
    random_state=42,
)
df_high_downsampled

Unnamed: 0,Age,Income,Class
0,22,2000,High
3,28,3200,High
10,60,5000,High
4,30,3500,High
7,45,4200,High
6,40,4000,High


In [20]:
# Stack the reduced 'High' rows on top of the untouched 'Low' rows to
# obtain the balanced (downsampled) dataset
downsampled_parts = [df_high_downsampled, df_low]
df_balanced_down = pd.concat(downsampled_parts)

In [21]:
# Check the class distribution after downsampling
down_counts = df_balanced_down['Class'].value_counts()
print(down_counts)

Class
High    6
Low     6
Name: count, dtype: int64


**Upsampling**

In [22]:
# Grow the minority ('Low') class to the size of the majority ('High')
# class by drawing rows with replacement, so some rows appear more than once
n_majority = len(df_high)
df_low_upsampled = resample(
    df_low,
    n_samples=n_majority,
    replace=True,
    random_state=42,
)
df_low_upsampled

Unnamed: 0,Age,Income,Class
8,50,4300,Low
9,55,4500,Low
5,35,3800,Low
9,55,4500,Low
9,55,4500,Low
2,27,2700,Low
5,35,3800,Low


In [23]:
# Stack the enlarged 'Low' rows on top of the untouched 'High' rows to
# obtain the balanced (upsampled) dataset
upsampled_parts = [df_low_upsampled, df_high]
df_balanced_up = pd.concat(upsampled_parts)

In [24]:
# Check the class distribution after upsampling
up_counts = df_balanced_up['Class'].value_counts()
print(up_counts)

Class
Low     7
High    7
Name: count, dtype: int64


In [25]:
pip install -U scikit-learn imbalanced-learn 

Note: you may need to restart the kernel to use updated packages.


In [26]:
from imblearn.over_sampling import SMOTE

# Sample dataset: 4 'Minority' rows versus 9 'Majority' rows
df = pd.DataFrame({
    'Age': [22,25,27,28,30,35,40,45,50,55,60,65,70],
    'Income': [2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
    'Class': ['Minority', 'Majority','Majority','Majority','Majority','Minority','Minority','Minority','Majority','Majority','Majority','Majority','Majority']
})

# Single source of truth for the label encoding; the inverse mapping is
# derived from it so the two directions can never drift apart
label_to_id = {'Majority': 0, 'Minority': 1}
id_to_label = {code: label for label, code in label_to_id.items()}

# 1. Convert categorical labels to numeric
df['Class'] = df['Class'].map(label_to_id)
# .map() silently yields NaN for any label missing from the mapping; fail early
assert df['Class'].notna().all(), "unmapped class label in 'Class' column"

# 2. Separate features (X) and target (y)
X = df[['Age', 'Income']]  # Features
y = df['Class']  # Target variable

# 3. Apply SMOTE to generate synthetic samples for the minority class.
#    'auto' balances the classes; k_neighbors must stay below the number of
#    minority rows (4 here)
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 4. Convert numeric labels back to categorical
y_resampled = y_resampled.map(id_to_label)

# 5. Combine the resampled features and labels. Both are already pandas
#    objects (step 4 relies on Series.map), so they are concatenated directly
#    rather than re-wrapped in new DataFrames
df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

# 6. Print class distribution
print(df_balanced['Class'].value_counts())

# 7. Display the upsampled dataset
print(df_balanced)

Class
Minority    9
Majority    9
Name: count, dtype: int64
    Age  Income     Class
0    22    2000  Minority
1    25    2500  Majority
2    27    2700  Majority
3    28    3200  Majority
4    30    3500  Majority
5    35    3800  Minority
6    40    4000  Minority
7    45    4200  Minority
8    50    4300  Majority
9    55    4500  Majority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Majority
13   40    4031  Minority
14   35    3831  Minority
15   44    4176  Minority
16   35    3826  Minority
17   41    4040  Minority
