# Lab | Handling Data Imbalance in Classification Models

In [1]:
# libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# data

categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

In [3]:
data = pd.concat([categorical,numerical,target],axis=1)
data

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,11.0,10.0,9,6.812500,172556,1,4,41,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,other,27,H,M,3,L,G,C,C,2,...,25.0,25.0,9,25.000000,184568,0,1,12,0,0.0
95408,TX,24,H,M,3,L,F,A,C,1,...,20.0,20.0,9,20.000000,122706,1,1,2,0,0.0
95409,MI,30,H,M,3,L,E,B,C,3,...,10.0,10.0,3,8.285714,189641,1,3,34,0,0.0
95410,CA,24,H,F,2,L,F,A,C,1,...,21.0,18.0,4,12.146341,4693,1,4,11,1,18.0


In [4]:
# checking NaN values

columns_with_nan = data.columns[data.isnull().any()]
print("Columns with NaN values:")
print(columns_with_nan)

Columns with NaN values:
Index([], dtype='object')


In [5]:
data['FIRSTDATE_MM'].unique()

array([11, 10,  1,  2,  3,  9,  5,  4, 12,  6,  7,  8], dtype=int64)

In [6]:
data['FIRSTDATE_MM'].fillna(data['FIRSTDATE_MM'].mode()[0], inplace=True)

In [7]:
# separating features and target

X = data.drop(['TARGET_B','TARGET_D'],axis=1)
y = data['TARGET_B']  # target variable

In [8]:
X.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,IL,36,H,F,3,L,E,C,T,2,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,CA,14,H,M,3,L,G,A,S,1,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,NC,43,U,M,3,L,E,C,R,2,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,CA,44,U,F,3,L,E,C,R,2,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,FL,16,H,F,3,L,F,A,S,2,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET_B, dtype: int64

In [10]:
# splitting into train and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# splitting into numerical and categorical 

train_num = X_train.select_dtypes(include=['int64', 'float64'])
train_cat = X_train.select_dtypes(include=['object'])

test_num = X_test.select_dtypes(include=['int64', 'float64'])
test_cat = X_test.select_dtypes(include=['object'])

In [12]:
print("\nShapes after split:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("train_num shape:", train_num.shape)
print("train_cat shape:", train_cat.shape)
print("test_num shape:", test_num.shape)
print("test_cat shape:", test_cat.shape)


Shapes after split:
X_train shape: (76329, 337)
X_test shape: (19083, 337)
train_num shape: (76329, 330)
train_cat shape: (76329, 7)
test_num shape: (19083, 330)
test_cat shape: (19083, 7)


In [13]:
# scaling numerical features

scaler = StandardScaler()

train_num_scaled = scaler.fit_transform(train_num)
test_num_scaled = scaler.transform(test_num)

In [14]:
# a sample of scaled numerical features

train_num_scaled[:5]

array([[-1.82065431,  0.63372399, -1.24381765, ...,  0.99867766,
        -0.84960237, -0.24300656],
       [-1.19528599, -0.97164983, -1.24381765, ..., -1.0013241 ,
         0.08230111,  1.46478163],
       [ 0.54184824,  0.63372399,  0.11544927, ...,  0.99867766,
         0.08230111,  1.14457134],
       [-0.5004323 , -0.97164983,  1.47471619, ...,  0.99867766,
        -0.84960237, -0.50984846],
       [ 1.02824582, -0.97164983,  0.11544927, ...,  0.99867766,
        -0.84960237,  0.61088754]])

In [15]:
# encoding categorical features

encoder = OneHotEncoder(drop='first', sparse=False)

train_cat_encoded = encoder.fit_transform(train_cat)
test_cat_encoded = encoder.transform(test_cat)



In [16]:
# a sample of encoded categorical features

train_cat_encoded[:5]

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
        0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
        0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
        0., 1., 0., 0., 1., 0., 0., 0.]])

In [17]:
# scaled numerical and encoded categorical features

X_train_encoded = pd.concat([pd.DataFrame(train_num_scaled, columns=train_num.columns),
                             pd.DataFrame(train_cat_encoded, columns=encoder.get_feature_names_out(train_cat.columns))],
                            axis=1)

X_test_encoded = pd.concat([pd.DataFrame(test_num_scaled, columns=test_num.columns),
                            pd.DataFrame(test_cat_encoded, columns=encoder.get_feature_names_out(test_cat.columns))],
                           axis=1)

In [18]:
X_train_encoded.head()

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,-1.820654,0.633724,-1.243818,1.3375,-0.022289,-1.303765,-0.290967,1.321313,-1.123192,0.952189,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.195286,-0.97165,-1.243818,-0.408067,-0.022289,0.2365,-0.621583,-0.91727,1.444322,-1.351316,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.541848,0.633724,0.115449,-0.989922,-0.022289,0.281802,-0.621583,-0.171076,-1.379944,0.952189,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.500432,-0.97165,1.474716,-0.989922,-0.022289,-0.125915,-0.621583,0.575119,-1.379944,-0.199564,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.028246,-0.97165,0.115449,-0.698994,-0.022289,-1.303765,-0.290967,-1.290367,0.930819,0.376313,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [19]:
X_test_encoded.head()

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,-1.125801,-2.577024,-1.243818,1.3375,-0.022289,1.323746,-0.621583,1.321313,-1.123192,0.376313,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.639403,0.633724,0.115449,-0.117139,-0.022289,-0.261821,0.03965,-0.171076,1.187571,0.376313,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.820654,0.633724,-1.243818,0.755645,-0.022289,0.780123,-0.621583,0.575119,-1.379944,-0.199564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.125801,-0.97165,-1.243818,0.464717,-0.022289,0.689519,-0.621583,0.202022,-1.379944,-0.199564,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.153005,0.633724,0.115449,-0.117139,-0.022289,0.598915,-0.621583,-0.171076,0.674068,0.376313,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# logistic regression model on the training data

#model = LogisticRegression(random_state=42)
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_encoded, y_train)

In [21]:
# accuracy on the test data

y_pred = model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy on test data: ", accuracy)

Accuracy on test data:  0.9486977938479275


In [22]:
# checking for imbalance

# value counts of TARGET_B before resampling:
y_train.value_counts()

TARGET_B
0    72464
1     3865
Name: count, dtype: int64

In [23]:
# SMOTE for upsampling

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

In [24]:
# value counts of TARGET_B after resampling:
print(y_train_resampled.value_counts())

TARGET_B
0    72464
1    72464
Name: count, dtype: int64


In [25]:
# logistic regression model on the resampled training data

model_resampled = LogisticRegression(random_state=42, max_iter=1000)
model_resampled.fit(X_train_resampled, y_train_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# accuracy on the test data after resampling

y_pred_resampled = model_resampled.predict(X_test_encoded)
accuracy_resampled = accuracy_score(y_test, y_pred_resampled)

print("Accuracy on the test data after resampling: ", accuracy_resampled)

Accuracy on the test data after resampling:  0.6143164072734895


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

# predictions on the test data after resampling
y_pred_resampled = model_resampled.predict(X_test_encoded)

precision_resampled = precision_score(y_test, y_pred_resampled)

recall_resampled = recall_score(y_test, y_pred_resampled)

f1_resampled = f1_score(y_test, y_pred_resampled)

print("Precision:", precision_resampled)
print("Recall:", recall_resampled)
print("F1-score:", f1_resampled)

Precision: 0.06796642296236122
Recall: 0.5132924335378323
F1-score: 0.12003825920612146
