In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
# load data
data = pd.read_csv('../datasets/kdd2004.csv').sample(10000)

In [4]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
74228,41.0,24.39,-0.36,-13.0,17.5,767.8,-0.71,-0.5,-8.5,-41.0,...,743.0,-1.92,-0.9,2.0,-27.0,229.6,0.1,-0.02,0.03,-1
70386,54.46,25.4,-0.24,-18.0,9.0,535.6,1.42,-0.7,-12.5,-45.0,...,1260.4,-1.24,-1.18,-4.0,-25.0,34.5,1.43,0.39,-0.1,-1
30794,84.96,23.96,0.05,12.0,-28.5,424.6,1.26,0.03,16.0,-69.0,...,710.5,-0.9,0.52,0.0,-30.0,71.5,1.24,0.34,0.36,-1
1098,40.0,31.11,-0.69,-15.5,16.5,1184.6,-1.33,-0.96,-3.5,-50.5,...,635.6,0.44,1.37,1.0,-47.0,254.5,-0.21,0.38,0.4,-1
89493,74.0,28.38,-0.5,-11.0,64.5,3054.1,-2.21,1.22,1.0,-89.5,...,2014.1,1.28,1.09,-5.0,-69.0,50.2,1.82,0.35,0.24,-1


In [5]:
# imbalanced target
data.target.value_counts(normalize=True)

target
-1    0.9916
 1    0.0084
Name: proportion, dtype: float64

In [6]:
# separate into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((7000, 74), (3000, 74))

## Using class_weight

In [8]:
# logistic regression with class_weight
def run_logit(X_train, X_test, y_train, y_test, class_weight):
    logit = LogisticRegression(
        penalty='l2',
        solver='newton-cg',
        random_state=0,
        max_iter=10,
        n_jobs=-1,
        class_weight=class_weight,
    )

    logit.fit(X_train, y_train)

    print('Train set')
    pred = logit.predict_proba(X_train)[:, 1]
    print(f'Logisitic Regression roc-auc: {roc_auc_score(y_train, pred):.4f}')

    print('Test set')
    pred = logit.predict_proba(X_test)[:, 1]
    print(f'Logisitic Regression roc-auc: {roc_auc_score(y_test, pred):.4f}')

In [9]:
run_logit(X_train, X_test, y_train, y_test, class_weight=None)

Train set
Logisitic Regression roc-auc: 0.8935
Test set
Logisitic Regression roc-auc: 0.9326


In [10]:
# evaluate performance of algorithm built cost estimated as imbalance ratio
# alternatively, we can pass a different cost in a dictionary
run_logit(X_train, X_test, y_train, y_test, class_weight={-1:1, 1:10})

Train set
Logisitic Regression roc-auc: 0.9589
Test set
Logisitic Regression roc-auc: 0.9626


In [11]:
# evaluate performance of algorithm built cost estimated as imbalance ratio
# alternatively, we can pass a different cost in a dictionary
run_logit(X_train, X_test, y_train, y_test, class_weight='balanced')

Train set
Logisitic Regression roc-auc: 0.9815
Test set
Logisitic Regression roc-auc: 0.9686


## Using sample_weight

In [12]:
# logistic regression with class_weight
def run_logit(X_train, X_test, y_train, y_test, sample_weight):
    logit = LogisticRegression(
        penalty='l2',
        solver='newton-cg',
        random_state=0,
        max_iter=10,
        n_jobs=-1,
    )

    logit.fit(X_train, y_train, sample_weight=sample_weight)

    print('Train set')
    pred = logit.predict_proba(X_train)[:, 1]
    print(f'Logisitic Regression roc-auc: {roc_auc_score(y_train, pred):.4f}')

    print('Test set')
    pred = logit.predict_proba(X_test)[:, 1]
    print(f'Logisitic Regression roc-auc: {roc_auc_score(y_test, pred):.4f}')

In [15]:
run_logit(X_train, X_test, y_train, y_test, sample_weight=None)

Train set
Logisitic Regression roc-auc: 0.8935
Test set
Logisitic Regression roc-auc: 0.9326


In [16]:
# evaluate performance of algorithm built cost estimated as imbalance ratio
# with numpy.where, we introduce a cost of 99 to each observation of the minority class
# and 1 otherwise
run_logit(X_train, X_test, y_train, y_test, sample_weight=np.where(y_train==1, 99, 1))

Train set
Logisitic Regression roc-auc: 0.9834
Test set
Logisitic Regression roc-auc: 0.9718


## Conclusion

CSL has improved the performance of the model