In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline


## 1. 加载数据

    由于数据安全原因，当前数据已经过数据降维预处理，故其数据特征不再是现实中的特征。


In [None]:
# Load the transaction table from the CSV file and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="creditcard.csv")
data.head(n=5)


## 2. 查看数据

    数据被分为0,1两类，0代表正常，1代表欺诈。
    可以发现0类型记录比1类型记录要多出很多


In [None]:
# Count how many rows fall into each value of the 'Class' column and draw the
# counts as a bar chart, ordered by class label.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated since pandas 2.1.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")


## 3. 数据标准化

    可以发现，除amount外的属性均已进行标准化，所以我们需要对amount进行标准化。


In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D input, so the 'Amount' column is reshaped into
# a single-column matrix before being standardized into 'normAmount'.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].to_numpy().reshape(-1, 1))

# Drop 'Time' together with the raw 'Amount' that 'normAmount' now replaces.
data = data.drop(columns=['Time', 'Amount'])
data.head(n=5)


## 4. 数据下采样

由于两类型数据量不同，为了避免精度出现问题，我们需要对 0 类型数据进行下采样


In [None]:
# Split the full table into the feature columns X and the label column y
# (y stays a one-column DataFrame).
X = data.iloc[:, data.columns != 'Class']
y = data.iloc[:, data.columns == 'Class']

# Number of fraud rows (Class == 1) and their index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Index labels of the normal rows (Class == 0).
normal_indices = data[data.Class == 0].index

# Under-sample the normal class: draw, without replacement, as many normal rows
# as there are fraud rows. A seeded generator keeps the draw reproducible
# across kernel restarts.
undersample_rng = np.random.RandomState(0)
random_normal_indices = undersample_rng.choice(
    normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Combine the fraud labels with the sampled normal labels.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The collected values are index *labels*, so select with .loc; .iloc would
# only give the same rows when the frame has a default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

# Features and label column of the under-sampled table.
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

# Report the class balance of the under-sampled table.
print("Percentage of normal transactions: ", len(
    under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(
    under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))


## 5. 分割训练集和测试集

以三七分的方式来分割


In [None]:
from sklearn.model_selection import train_test_split

# Full dataset: hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)

print("Number transactions train dataset: ", X_train.shape[0])
print("Number transactions test dataset: ", X_test.shape[0])
print("Total number of transactions: ", X_train.shape[0] + X_test.shape[0])

# Under-sampled dataset: same 70/30 split with the same seed.
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(
    X_undersample, y_undersample, random_state=0, test_size=0.3)

print("")
print("Number transactions train dataset: ", X_train_undersample.shape[0])
print("Number transactions test dataset: ", X_test_undersample.shape[0])
print("Total number of transactions: ",
      X_train_undersample.shape[0] + X_test_undersample.shape[0])


## 6. 召回率

当前问题，可以使用召回率来代替精度，以达到更符合问题的效果。
| | 真实正类 | 真实负类 |
| ---- | ---- | ---- |
| 观测正类（positive） | TP | FP |
| 观测负类（negative） | FN | TN |

其中 **召回率（Recall） = TP/(TP+FN)**


In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report


## 7. k折交叉验证
为了进行参数测试，需要使用k折交叉验证

In [None]:
# Cross-validate the regularization parameter C with K-fold splits.
def printing_Kfold_scores(x_train_data, y_train_data):
    """Select the logistic-regression ``C`` with the highest mean recall.

    Each candidate ``C`` in ``[0.01, 0.1, 1, 10, 100]`` is evaluated with an
    unshuffled 5-fold split of the training data. Per-fold recall, the mean
    recall per candidate and the final results table are printed.

    Parameters
    ----------
    x_train_data : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train_data : pandas.DataFrame
        Training labels as a DataFrame (indexed with ``.iloc[rows, :]``) and
        flattened to 1-D before being passed to the model and the metric.

    Returns
    -------
    The ``C_parameter`` value of the row with the largest mean recall score
    (the first such row if several tie).
    """
    # Splitter: 5 consecutive folds, no shuffling.
    fold = KFold(5, shuffle=False)

    # Candidate regularization strengths.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C. The index must be range(len(...)); the former
    # range(len(...), 2) was an empty range that only worked because pandas
    # re-indexes an empty frame on the first list assignment.
    results_table = pd.DataFrame(index=range(len(c_param_range)), columns=[
                                 'C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    for j, c_param in enumerate(c_param_range):
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(fold.split(x_train_data)):

            # Fresh model for every fold.
            lr = LogisticRegression(C=c_param, penalty='l2', max_iter=1000)

            # Fit and predict on plain arrays alike, so the model never sees
            # feature names at fit time that are missing at predict time.
            lr.fit(x_train_data.iloc[train_idx, :].values,
                   y_train_data.iloc[train_idx, :].values.ravel())

            y_pred_undersample = lr.predict(
                x_train_data.iloc[test_idx, :].values)

            # Recall on the held-out fold (labels flattened to 1-D).
            recall_acc = recall_score(
                y_train_data.iloc[test_idx, :].values.ravel(), y_pred_undersample)

            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall score = ', recall_acc)

        # Record the mean recall for this candidate C.
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')

    print(results_table)

    # The score column has object dtype, so cast to float before idxmax.
    best_c = results_table.loc[results_table['Mean recall score'].astype(
        float).idxmax(), 'C_parameter']

    # Finally, we can check which C parameter is the best amongst the chosen.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c


In [None]:
# Choose C by cross-validation on the under-sampled training split.
best_c = printing_Kfold_scores(
    x_train_data=X_train_undersample,
    y_train_data=y_train_undersample,
)


## 8. 混淆矩阵
| | 观测正类（positive） | 观测负类（negative） |
| ---- | ---- | ---- |
| 真实正类 | TP | FN |
| 真实负类 | FP | TN |

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix; ``cm[i, j]`` is written into the cell at row
        ``i`` (labelled "True label") and column ``j`` ("Predicted label").
    classes : sequence
        Tick labels, used for both axes.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text so the number stays
    # readable on the darker background.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, col) pair of the matrix, so the function
    # does not depend on itertools having been imported elsewhere.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
import itertools

# Refit on the under-sampled training split with the cross-validated C.
# max_iter matches the value used inside printing_Kfold_scores so the final
# model is trained with the same solver budget that selected best_c.
lr = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict on the DataFrame (not .values): the model was fitted with feature
# names, and passing a bare array makes scikit-learn emit a feature-name warning.
y_pred_undersample = lr.predict(X_test_undersample)

# Build the confusion matrix (rows: true class, columns: predicted class).
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall of class 1: correctly predicted 1s over all true 1s.
print("测试数据集 负样本召回率: ",
      cnf_matrix[1, 1]/(cnf_matrix[1, 0]+cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix')
plt.show()


In [None]:
# Train on the under-sampled split, then draw the confusion matrix on the
# test split of the full dataset.
# Original author's observation: FP is large, i.e. keeping recall high on the
# fraud class comes at the cost of many wrongly flagged normal transactions.
# max_iter matches the value used inside printing_Kfold_scores.
lr = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict on the DataFrame (not .values) so the feature names seen at fit
# time are also present at predict time.
y_pred = lr.predict(X_test)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall of class 1: correctly predicted 1s over all true 1s.
print("Recall metric in the testing dataset: ",
      cnf_matrix[1, 1]/(cnf_matrix[1, 0]+cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix')
plt.show()


In [None]:
# Original author's note: on the original, un-resampled dataset the algorithm
# does not perform very well because of the gap between the class sizes.
best_c = printing_Kfold_scores(x_train_data=X_train, y_train_data=y_train)
best_c

In [None]:
# Confusion matrix for a model trained directly on the original
# (un-resampled) training split.
# max_iter matches the value used inside printing_Kfold_scores.
lr = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
lr.fit(X_train, y_train.values.ravel())
# These are predictions on the full test split, so they are named y_pred
# rather than y_pred_undersample. The DataFrame (not .values) is passed so
# the feature names seen at fit time are also present at predict time.
y_pred = lr.predict(X_test)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall of class 1: correctly predicted 1s over all true 1s.
print("Recall metric in the testing dataset: ",
      cnf_matrix[1, 1]/(cnf_matrix[1, 0]+cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix')
plt.show()


In [None]:
# Manually vary the decision threshold of the logistic regression.
# max_iter matches the value used inside printing_Kfold_scores.
lr = LogisticRegression(C=0.01, penalty='l2', max_iter=1000)
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Get class probabilities instead of hard class labels. The DataFrame (not
# .values) is passed so the feature names seen at fit time are present here too.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))
# Print options do not change between iterations, so set them once.
np.set_printoptions(precision=2)

for panel, threshold in enumerate(thresholds, start=1):
    # Predict class 1 when its probability is strictly above the threshold.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold

    # One panel of the 3x3 grid per threshold.
    plt.subplot(3, 3, panel)
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(
        y_test_undersample, y_test_predictions_high_recall)

    print("Recall metric in the testing dataset: ",
          cnf_matrix[1, 1]/(cnf_matrix[1, 0]+cnf_matrix[1, 1]))

    # Plot non-normalized confusion matrix. The title uses '>' to match the
    # strict comparison above (it previously said '>=').
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title=f'Threshold > {threshold}')


## 9. SMOTE上采样算法
根据k近邻算法，生成样本。

对于少数类的每一个样本，寻找n个近邻，然后按比例生成其与近邻之间的数据

直观上看，smote采样就是生成了样本之间的样本。

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# Reload the raw table for the SMOTE experiment.
credit_cards = pd.read_csv('creditcard.csv')

columns = credit_cards.columns
# The labels are in the 'Class' column. Remove it by name (rather than by
# position) so the feature list stays correct regardless of column order;
# Index.drop raises KeyError if the column is missing.
features_columns = columns.drop('Class')

features = credit_cards[features_columns]
labels = credit_cards['Class']


In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, random_state=0, test_size=0.2)


In [None]:
# Over-sample the training data only; the test data is left untouched.
oversampler = SMOTE(random_state=0)
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias has been removed from the library.
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)



In [None]:
# Number of entries labelled 1 after over-sampling.
is_class_one = os_labels == 1
len(os_labels[is_class_one])


In [None]:
# printing_Kfold_scores indexes both inputs with .iloc[rows, :], so wrap the
# over-sampled features and labels in DataFrames before cross-validating.
os_features, os_labels = pd.DataFrame(os_features), pd.DataFrame(os_labels)
best_c = printing_Kfold_scores(x_train_data=os_features, y_train_data=os_labels)


In [None]:
# Fit on the SMOTE over-sampled training data with the cross-validated C and
# evaluate on the untouched test split.
# max_iter matches the value used inside printing_Kfold_scores.
lr = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
# Fit and predict on plain arrays alike: the over-sampled frame may or may not
# carry the original column names, so arrays keep both calls consistent and
# avoid scikit-learn's feature-name mismatch warning.
lr.fit(os_features.values, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)

# Recall of class 1: correctly predicted 1s over all true 1s.
print("Recall metric in the testing dataset: ",
      cnf_matrix[1, 1]/(cnf_matrix[1, 0]+cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix')
plt.show()
