In [1]:
import pandas as pd
import numpy as np


In [8]:
df=pd.read_csv("imbalanced.csv", names=['balance','v1','v2','v3','v4'])
df.head()

Unnamed: 0,balance,v1,v2,v3,v4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [9]:
df['balance']=[1 if b=='B' else 0 for b in df['balance']]
df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
clf=LogisticRegression()
y=df['balance']
x=df.drop('balance',axis=1)
x.columns

Index(['v1', 'v2', 'v3', 'v4'], dtype='object')

In [11]:
clf.fit(x, y)
y_pred = clf.predict(x)
np.unique(y_pred)

array([0], dtype=int64)

In [12]:
accuracy_score(y, y_pred)

0.92159999999999997

In [18]:
# Approach 1: up-sample minority class
df_major=df[df['balance']==0]
df_minor=df[df['balance']==1]
print(df_major.shape[0], df_minor.shape[0])

576 49


In [20]:
from sklearn.utils import resample
sample = resample(df_minor, replace=True, n_samples=576, random_state=123)
up=pd.concat([df_major, sample])
up.shape

(1152, 5)

In [21]:
clf1=LogisticRegression()
x=up.drop('balance',axis=1)
y=up['balance']
clf1.fit(x,y)
y_pred1 = clf1.predict(x)
print(np.unique(y_pred1))
accuracy_score(y, y_pred1)


[0 1]


0.51388888888888884

In [23]:
# Approach 2: down_sample majority class
sample =resample(df_major, replace=False, n_samples=49, random_state=123)
down=pd.concat([sample, df_minor])
clf2=LogisticRegression()
x=down.drop('balance',axis=1)
y=down['balance']
clf2.fit(x,y)
accuracy_score(y, clf2.predict(x))

0.58163265306122447

In [26]:
# Approach 3: penalize algorithm
from sklearn.svm import SVC

x=df.drop('balance',axis=1)
y=df['balance']
clf4=SVC(kernel='linear', class_weight='balanced', probability=True)
clf4.fit(x,y)
accuracy_score(y, clf4.predict(x))

0.68799999999999994

In [29]:
y_prob = clf4.predict_proba(x)


In [31]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y, y_prob[:,0])

0.53052366780045357

In [32]:
# Approach 4: tree-based algorithm
from sklearn.ensemble import RandomForestClassifier
clf5=RandomForestClassifier()
clf5.fit(x,y)
accuracy_score(y, clf5.predict(x))

0.9728

In [40]:
y_prob2 = clf5.predict_proba(x)
roc_auc_score(y, y_prob2[:,1])

0.99847647392290251