In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import roc_curve


## 数据预处理

In [None]:
# Load the admissions data. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv("Other/logistic_regression_cutoff/binary.csv")
df = df.rename(columns={'rank': 'prestige'})
# One-hot encode `prestige` and put the dummies next to the response and the
# continuous predictors. dtype=float keeps the dummies numeric: pandas >= 2.0
# produces bool columns by default, and a frame mixing bool with numeric columns
# is cast to an object array that statsmodels refuses to fit.
df = pd.concat(
    [
        df.loc[:, ['admit', 'gre', 'gpa']],
        pd.get_dummies(df['prestige'], prefix='prestige', dtype=float),
    ],
    axis=1,
)
# Keep only the prestige_1..prestige_3 dummies; any other prestige level is
# dropped here and therefore acts as the reference category.
df = df.loc[:, ['admit', 'gre', 'gpa', 'prestige_1', 'prestige_2', 'prestige_3']]


## 模型的训练

In [None]:
# Manually add a constant column so the model can estimate an intercept
# (statsmodels' Logit does not add one on its own).
df['intercept'] = 1.0
# Predictors shared by fitting and prediction. 'intercept' has to be listed
# here; otherwise the constant column created above never reaches the model.
train_cols = ['gre', 'gpa', 'prestige_1', 'prestige_2', 'prestige_3', 'intercept']
# Cast to float so bool/uint8 dummy columns cannot turn the design matrix into
# an object array.
exog = df.loc[:, train_cols].astype(float)
# Fit the logistic regression of `admit` on the predictors.
result = sm.Logit(df.loc[:, 'admit'], exog).fit()
# Store the fitted probability of admit == 1 for every row.
df['pred'] = result.predict(exog)


## cutoff 函数定义

In [None]:
def Find_Optimal_Cutoff(data_frame, actual_column_name, predict_column_name):
    """
    Find the ROC operating point where sensitivity and specificity are closest.

    The ROC curve is computed with ``roc_curve`` and the point that minimises
    ``|tpr - (1 - fpr)|`` is selected. When several points tie, the first one
    in the order returned by ``roc_curve`` is used.

    :param data_frame: DataFrame containing at least the actual and the predicted response columns
    :param actual_column_name: column name of the actual (true) response
    :param predict_column_name: column name of the predicted response (the probability of the
        Positive class produced by the logistic regression model)
    :return: one row of the ROC table as a pandas Series with the entries 'fpr', 'tpr',
        '1-fpr', 'tpr-(1-fpr)' and 'thresholds'; the 'thresholds' entry is the optimal cutoff
    """
    actual = data_frame.loc[:, actual_column_name]
    predict = data_frame.loc[:, predict_column_name]

    fpr, tpr, thresholds = roc_curve(actual, predict)
    # The three arrays are aligned element by element, so they can be placed
    # directly into one table without building intermediate Series.
    roc = pd.DataFrame({'fpr': fpr,
                        'tpr': tpr,
                        '1-fpr': 1 - fpr,
                        'tpr-(1-fpr)': tpr - (1 - fpr),
                        'thresholds': thresholds})

    # argmin returns the first position of the smallest absolute gap, which makes
    # tie-breaking deterministic and avoids label-based lookups on an argsort result.
    best_position = int(np.argmin(np.abs(roc['tpr-(1-fpr)'].to_numpy())))
    return roc.iloc[best_position, :]


## cutoff 函数输出

In [17]:
# Display the selected ROC row for the model's predictions; its 'thresholds'
# entry is the cutoff.
Find_Optimal_Cutoff(df, "admit", "pred")