# Logistic Regression
supervised learning - classification - discrete labels

## 二元分类问题

In [1]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

def logitRegression(data):
    """
    Split the data, fit the logit model on the training part and print
    its statistical summary.

    params:
    -------
    data: DataFrame, modelling data
    """
    # 80/20 split; the held-out 20% is not used inside this function.
    trainSet, _ = train_test_split(data, test_size=0.2)
    modelSummary(trainModel(trainSet))
    
def trainModel(data):
    """
    Fit a logit model of label_code on five numeric predictors.

    params:
    -------
    data: data passed to ``sm.Logit.from_formula``; it must provide the
        columns named in the formula below

    returns:
    --------
    the object returned by ``fit()`` on the formula-built model
    """
    spec = (
        "label_code ~ age + education_num + capital_gain"
        " + capital_loss + hours_per_week"
    )
    return sm.Logit.from_formula(spec, data=data).fit()

def modelSummary(re):
    """
    Print the model summary followed by two F-tests on its coefficients.

    params:
    -------
    re: fitted results object exposing ``summary()`` and ``f_test()``
        (here, the value returned by ``trainModel``)
    """
    # Overall statistical summary of the fit
    print(re.summary())

    # F-test: is the education_num coefficient significantly different from 0?
    print("检验假设education_num的系数等于0")
    # The constraint has to reference a term of the model formula;
    # the previous string "education_test" did not name any term.
    print(re.f_test("education_num=0"))

    # F-test: do both hypotheses hold jointly?
    print("检验假设education_num的系数等于0.32和hours_per_week的系数等于0.04同时成立")
    print(re.f_test("education_num=0.32, hours_per_week=0.04"))

In [2]:
# 理解模型结果
def interpretModel(re):
    """
    Print exponentiated coefficients with their confidence bounds, then
    the marginal effects of each variable.

    params:
    -------
    re: trained logistic-regression results object; must expose
        ``conf_int()``, ``params`` and ``get_margeff()``
    """
    # Confidence bounds plus the point estimate as a third column:
    # lower bound, upper bound, estimate.
    bounds = re.conf_int()
    bounds["OR"] = re.params
    bounds.columns = ['2.5%', '97.5%', 'OR']

    # Exponentiate so the values are on the odds-ratio scale.
    oddsRatios = np.exp(bounds)
    print("各个变量对事件发生的影响：")
    print(oddsRatios)

    # Marginal effects, computed with at='overall'.
    marginal = re.get_margeff(at='overall')
    print("各个变量的边际效应")
    print(marginal.summary())

In [3]:
# 预测结果
def makePrediction(re, testSet, alpha=0.5):
    """
    Score the test set and derive a 0/1 prediction from the probability.

    The input frame is modified in place: ``prob`` receives the value of
    ``re.predict(testSet)`` and ``pred`` receives 1 where ``prob`` is
    strictly greater than ``alpha``, else 0.

    params:
    -------
    re: fitted model object exposing ``predict()``
    testSet: DataFrame to score
    alpha: decision threshold applied to ``prob`` (default 0.5)

    returns:
    --------
    the same ``testSet`` object, with ``prob`` and ``pred`` columns set
    """
    testSet['prob'] = re.predict(testSet)
    print("事件发生概率(预测概率)大于0.6的数据个数:")
    print(testSet[testSet['prob'] > 0.6].shape[0])

    print("事件发生概率(预测概率)大于0.5的数据个数:")
    print(testSet[testSet['prob'] > 0.5].shape[0])

    # Store the final label in its own column. Writing it back to 'prob'
    # destroyed the probabilities and left no 'pred' column for
    # evaluation() to read.
    testSet['pred'] = (testSet['prob'] > alpha).astype(int)
    return testSet

In [4]:
# 评估指标
import numpy as np
def evaluation(re):
    """
    Print precision, recall and F1-score for binary predictions.

    params:
    -------
    re: mapping/DataFrame with a ``label_code`` column (true labels) and a
        ``pred`` column (predicted labels); values are expected to be 0/1
        so that they fall into the two bins below

    A metric whose denominator is zero (e.g. no predicted positives) is
    reported as 0.0 instead of nan.
    """
    # Two bins, [0, 0.5) and [0.5, 1]; the 2-D histogram of (label, pred)
    # is then the confusion matrix, flattened row by row.
    bins = np.array([0, 0.5, 1])
    label = re['label_code']
    pred = re['pred']
    tn, fp, fn, tp = np.histogram2d(label, pred, bins=bins)[0].flatten()

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0

    # "%.3f" gives three decimals; "%3f" only set a minimum field width.
    print("Precision: %.3f\nRecall: %.3f\nF1-score: %.3f" % (precision, recall, f1))

In [5]:
# ROC(Receiver Operating Characteristic Curve) 与 AUC
# TPR = TP / (TP + FN)
# FPR = FP / (FP + TN)
# ROC离左上角越近越好

## 多元分类问题

In [7]:
from sklearn.linear_model import LogisticRegression

def multiLogit(data):
    """
    Fit a multi-class logistic regression with two strategies.

    params:
    -------
    data: DataFrame with feature columns ``x1``, ``x2`` and a ``label``
        column

    returns:
    --------
    dict mapping each strategy name ('multinomial', 'ovr') to its fitted
    LogisticRegression model
    """
    features = ['x1', 'x2']
    labels = 'label'
    # scikit-learn spells the option 'multinomial'.
    methods = ['multinomial', 'ovr']

    # Model the data with both strategies and keep the fitted estimators
    # so callers can compare them.
    # NOTE(review): newer scikit-learn releases deprecate `multi_class` --
    # confirm against the installed version.
    models = {}
    for method in methods:
        model = LogisticRegression(multi_class=method,
                                   solver='sag',
                                   max_iter=1000,
                                   random_state=42)
        model.fit(data[features], data[labels])
        models[method] = model
    return models

### 多元回归(multinomial) vs One-vs.-all  
- 多元回归将原始数据按类别分出一个个数据子集，然后在这些子集的基础上训练模型，因此模型参数的协方差矩阵在这些子集内是恒定的
- One-vs.-all的方法则是在个体的基础上建模，参数的协方差矩阵在上面的数据子集内是不恒定的
- 多元回归会倾向于使所谓的决策边界(decision boundary)远离原始数据，所以会有更少的点落在两种分类的边界处

## 非均衡数据集

### 准确度悖论 Accuracy Paradox
ACC = (TP+TN)/(TP+TN+FP+FN)

### 解决方法

In [9]:
# 通过类别权重处理非均衡数据
from sklearn.linear_model import LogisticRegression

# 用权重调整占比较少的类别，使其损失增加
def balanceData(X, Y):
    """
    Fit a class-weighted logistic regression and predict on the same data.

    Each class is weighted by the inverse of its share of the samples, so
    the rarer class contributes more to the loss.

    params:
    -------
    X: feature matrix passed to ``LogisticRegression.fit``
    Y: labels; must support boolean-mask indexing and ``ravel()``
       (presumably a NumPy array -- confirm with callers). Values > 0 are
       counted as the positive class.

    returns:
    --------
    predictions of the fitted model for X

    raises:
    -------
    ValueError: if Y is empty or contains only one of the two classes,
        since the inverse-frequency weights are undefined in that case
    """
    total = len(Y)
    if total == 0:
        raise ValueError("Y must not be empty")
    # Boolean-mask indexing (Y[...]); the previous Y(...) tried to call Y.
    positiveWeight = len(Y[Y > 0]) / float(total)
    if positiveWeight == 0.0 or positiveWeight == 1.0:
        raise ValueError("Y must contain both positive and non-positive labels")
    classWeight = {1: 1. / positiveWeight, 0: 1. / (1 - positiveWeight)}

    # A very large C makes the regularisation penalty negligible so it
    # does not interfere with the effect of the class weights.
    model = LogisticRegression(class_weight=classWeight, C=1e4)
    model.fit(X, Y.ravel())
    pred = model.predict(X)
    return pred

# 更多解决方法参考Learning from Imbalanced Data