In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

In [1]:
# !pip install pandas
# import random
# import pandas as pd
#from imblearn.under_sampling import ClusterCentroids
import dataGenerator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score, roc_curve, confusion_matrix)

## DataGenerator package

1. **dataGenerator.splitData()**
    returns trainFeature, trainLabel, testFeature, testLabel
2. **dataGenerator.resampling(trainFeature, trainLabel, alpha = 1, method="under_sampling")**
    Returns the resampled, balanced features and their corresponding labels. `alpha` is the balance factor: the **neg:pos** ratio of the resampled data. `method` is the sampling method; it can be "under_sampling" or "over_sampling".
    
    **"under_sampling"** samples from the negative examples, which are the majority class.
    
    **"over_sampling"** samples from the positive examples, which are the minority class.

## Sampling setting

Here, we should choose the ratio, i.e. the alpha parameter of the resampling function, to be the same for the different sampling methods. All of us should choose

**alpha = [1, 10, 20, 50]**

In [5]:
# Split the data set into train/test features and labels.
trainFeature, trainLabel, testFeature, testLabel = dataGenerator.splitData()

# Resample the training split with the "over_sampling" method.
# NOTE(review): alpha = 100 is not one of the agreed values listed in the
# "Sampling setting" section ([1, 10, 20, 50]) -- confirm this is intentional.
sampleFeature, sampleLabel = dataGenerator.resampling(
    trainFeature,
    trainLabel,
    alpha=100,
    method="over_sampling",
)

# Number of resampled examples whose label is 0.
negativeMask = sampleLabel == 0
print(len(sampleLabel[negativeMask]))

# Shapes of the (un-resampled) training features and the test features.
print(trainFeature.shape, testFeature.shape)

232818
(233215, 30) (56961, 30)


In [None]:
# Fit a logistic regression on the resampled training data.
lr_sample = LogisticRegression(solver='lbfgs', C=1., max_iter=500)
lr_sample.fit(sampleFeature, sampleLabel)

# Hard class predictions on the held-out test set.
sampleLabelPredict = lr_sample.predict(testFeature)

# Column 1 of predict_proba is used as the ROC score
# (presumably the probability of the positive label 1 -- verify via lr_sample.classes_).
prob_pos_sample = lr_sample.predict_proba(testFeature)[:, 1]
fpr_sample, tpr_sample, _ = roc_curve(testLabel, prob_pos_sample)

# Test-set metrics; every scorer is called as (y_true, y_pred).
sampleConfusion = confusion_matrix(testLabel, sampleLabelPredict)
samplePrecision = precision_score(testLabel, sampleLabelPredict)
sampleRecall = recall_score(testLabel, sampleLabelPredict)
sampleF1 = f1_score(testLabel, sampleLabelPredict)

print("\tPrecision: %1.3f" % samplePrecision)
print("\tRecall: %1.3f" % sampleRecall)
print("\tF1: %1.3f\n" % sampleF1)
print(sampleConfusion)


In [None]:
# Baseline: the same logistic-regression configuration, fitted on the full
# (un-resampled) training split. fit() returns the estimator itself, so the
# construction and the fit can be chained.
lr_full = LogisticRegression(max_iter=500, solver='lbfgs', C=1.).fit(
    trainFeature, trainLabel
)

# Hard class predictions on the held-out test set.
testLabelPredict = lr_full.predict(testFeature)

# Column 1 of predict_proba is used as the ROC score
# (presumably the probability of the positive label 1 -- verify via lr_full.classes_).
prob_pos_test = lr_full.predict_proba(testFeature)[:, 1]
fpr_full, tpr_full, _ = roc_curve(testLabel, prob_pos_test)

In [None]:
# Evaluate the model trained on the full training split against the test set.
#
# fpr_full / tpr_full are already computed from the same inputs in the cell
# that fits lr_full, so the ROC curve is not recomputed here.
#
# scikit-learn scorers take (y_true, y_pred) in that order. Passing them
# reversed swaps the reported precision and recall (F1 is symmetric and is
# unaffected), so the ground-truth labels go first, matching the evaluation
# of the resampled model.
testConfusion = confusion_matrix(testLabel, testLabelPredict)
print("\tPrecision: %1.3f" % precision_score(testLabel, testLabelPredict))
print("\tRecall: %1.3f" % recall_score(testLabel, testLabelPredict))
print("\tF1: %1.3f\n" % f1_score(testLabel, testLabelPredict))
print(testConfusion)

In [None]:
# Overlay the ROC curves of both models on figure 1, using explicit Axes
# calls instead of the implicit pyplot "current axes" state.
rocAxes = plt.figure(1).gca()

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
rocAxes.plot([0, 1], [0, 1], 'k--')

# Model trained on resampled data vs. model trained on the full training split.
rocAxes.plot(fpr_sample, tpr_sample, label='sampled lr')
rocAxes.plot(fpr_full, tpr_full, label='fully lr')

rocAxes.set_xlabel('False positive rate')
rocAxes.set_ylabel('True positive rate')
rocAxes.set_title('ROC curve')
rocAxes.legend(loc='best')
plt.show()