In [3]:
import time
import itertools
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, plot_importance
import numpy as np
from numpy import sort

# XGBoost

In [4]:
def xgboost(X_train, y_train, X_test, y_test):
    last_time = time.time()
    xgb = XGBClassifier(n_estimators=300, objective='multi:softmax', num_class=13, random_state=0)

    xgb.fit(X_train, y_train)
    middle_time = time.time()

    y_pred = xgb.predict(X_test)

    current_time = time.time()

    accuracy_score(y_test, y_pred)

    print("训练耗时： {}".format(middle_time - last_time))
    print("测试耗时： {}".format(current_time - middle_time))
    cm = confusion_matrix(y_test, y_pred)
    print('confusion matrix xgb:')
    print(cm)
    print('classification report xgb:')
    print(classification_report(y_test, y_pred))

    plot_confusion_matrix(cm, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13'],
                          normalize=True, title='Normalized confusion matrix')

    plt.show()

In [5]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.4f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="red")
        # color="red" if cm[i, j] > thresh else "black")
    # plt.set_tight_layout(True)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plot_confusion_matrix(cm, classes=['0', '1', '2'], normalize=True, title='Normalized confusion matrix')
    plt.show()

# Combine all csv data

In [6]:
    # 显示所有列
    pd.set_option('display.max_columns', None)
    # 显示所有行
    pd.set_option('display.max_rows', None)
    # 设置value的显示长度为100，默认为50
    pd.set_option('max_colwidth', 100)

    # 读取csv
    # path = r'/home/itu/datadisk/dataset/csv-for-learning/'
    path = r'/Users/xiafei/Downloads/csv-for-learning/'
    # all_n_files = glob.glob(path + "/*.n.csv")
    # all_v_files = glob.glob(path + "/*.v.csv")
    # all_p_files = glob.glob(path + "/*.p.csv")
    all_n_files = [path + x for x in
                   ['20200629.n.csv', '20200630.n.csv', '20200701.n.csv', '20200702.n.csv', '20200703.n.csv',
                    '20200704.n.csv', '20200705.n.csv', '20200706.n.csv']]
    all_v_files = [path + x for x in
                   ['20200629.v.csv', '20200630.v.csv', '20200701.v.csv', '20200702.v.csv', '20200703.v.csv',
                    '20200704.v.csv', '20200705.v.csv', '20200706.v.csv']]
    all_p_files = [path + x for x in
                   ['20200629.p.csv', '20200630.p.csv', '20200701.p.csv', '20200702.p.csv', '20200703.p.csv',
                    '20200704.p.csv', '20200705.p.csv', '20200706.p.csv']]

    li_n = []
    li_v = []
    li_p = []

    for filename in all_n_files:
        print('read_csv network:', filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        li_n.append(df)

    for filename in all_v_files:
        print('read_csv virtual:', filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        li_v.append(df)

    for filename in all_p_files:
        print('read_csv physical:', filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        li_p.append(df)

    dataset_n = pd.concat(li_n, axis=0, ignore_index=True, sort=False)
    dataset_v = pd.concat(li_v, axis=0, ignore_index=True, sort=False)
    dataset_p = pd.concat(li_p, axis=0, ignore_index=True, sort=False)

    print(dataset_n.shape)
    print(dataset_v.shape)
    print(dataset_p.shape)

    dataset_n.drop(['type', 'type_code'], axis=1, inplace=True)
    dataset_v.drop(['type', 'type_code'], axis=1, inplace=True)

    dataset_n.rename(columns=lambda x: 'n_' + x, inplace=True)
    dataset_v.rename(columns=lambda x: 'v_' + x, inplace=True)
    dataset_p.rename(columns=lambda x: 'p_' + x, inplace=True)

    dataset = pd.concat([dataset_n, dataset_v, dataset_p], axis=1, sort=False)
    # 数据集概览
    # print(dataset.describe())
    # print(dataset.head(5))

    # valid
    # print('isnan', np.isnan(dataset.any()))
    # print('isfinite', np.isfinite(dataset.all()))
    # dataset = pd.read_csv('/home/itu/datadisk/dataset/csv-for-learning/20200629.n.csv')

    print('列数:', dataset.shape[1], '行数:', dataset.shape[0])

    column = dataset.columns
    X = dataset[column[:-2]]
    # X= X.values
    Y = dataset[column[-1]]
    # Y= Y.values
    # 划分训练测试
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    ss = StandardScaler()
    std_X_train = ss.fit_transform(X_train)
    std_X_test = ss.fit_transform(X_test)

read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200629.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200630.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200701.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200702.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200703.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200704.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200705.n.csv
read_csv network: /Users/xiafei/Downloads/csv-for-learning/20200706.n.csv
read_csv virtual: /Users/xiafei/Downloads/csv-for-learning/20200629.v.csv
read_csv virtual: /Users/xiafei/Downloads/csv-for-learning/20200630.v.csv
read_csv virtual: /Users/xiafei/Downloads/csv-for-learning/20200701.v.csv
read_csv virtual: /Users/xiafei/Downloads/csv-for-learning/20200702.v.csv
read_csv virtual: /Users/xiafei/Downloads/csv-for-learning/20200703.v.csv
read_csv virtual: /Users/xiafei/Downlo

# fit model

In [7]:
model = XGBClassifier(importance_type='gain')
model.fit(X, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# show importance image

In [None]:
_, ax = plt.subplots(figsize=(20, 100))
plot_importance(model, ax=ax, max_num_features=200, importance_type='gain')
plt.show()

# write result to file

In [None]:
  # 结果写入文件
im = pd.DataFrame({'importance': model.feature_importances_, 'var': dataset.columns[:-2]})
im = im.sort_values(by='importance', ascending=False)
im.to_csv("feature_important_data_XG.csv")

In [19]:
thresholds = sort(model.feature_importances_)[-100:]
print(thresholds)

[0.00161694 0.0016192  0.00163051 0.00164602 0.00165121 0.00165458
 0.0016593  0.00166001 0.00166666 0.0016675  0.00167826 0.00168187
 0.00168245 0.00168806 0.00169543 0.00170044 0.00170201 0.00170274
 0.00170803 0.00171275 0.00171286 0.00174048 0.00174438 0.00175812
 0.00177849 0.00178315 0.00181165 0.00182481 0.00184686 0.00186827
 0.00188302 0.0018952  0.00189637 0.00189994 0.00193583 0.00200437
 0.00200541 0.00201293 0.00201563 0.0020375  0.00215802 0.00216093
 0.00216682 0.00217154 0.00222244 0.00222873 0.0022385  0.00225193
 0.00230204 0.0023082  0.00230905 0.0023281  0.00236029 0.00239628
 0.00241818 0.00246973 0.00252312 0.00253492 0.0025465  0.00261815
 0.00295379 0.00296716 0.00311144 0.00317391 0.00339135 0.00346303
 0.00359551 0.00393201 0.0041093  0.00417794 0.00444713 0.00448986
 0.00506805 0.00531637 0.00539663 0.00639367 0.00708875 0.00744147
 0.00800852 0.00884959 0.00894771 0.0089646  0.01010152 0.01024031
 0.01124802 0.0118994  0.01225778 0.01707036 0.0204329  0.0216

In [20]:
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.4f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy * 100.0))

Thresh=0.0016, n=100, Accuracy: 75.70%
Thresh=0.0016, n=99, Accuracy: 75.28%
Thresh=0.0016, n=98, Accuracy: 75.34%
Thresh=0.0016, n=97, Accuracy: 75.96%
Thresh=0.0017, n=96, Accuracy: 75.80%
Thresh=0.0017, n=95, Accuracy: 76.27%
Thresh=0.0017, n=94, Accuracy: 75.54%
Thresh=0.0017, n=93, Accuracy: 75.75%
Thresh=0.0017, n=92, Accuracy: 75.49%
Thresh=0.0017, n=91, Accuracy: 74.61%
Thresh=0.0017, n=90, Accuracy: 75.28%
Thresh=0.0017, n=89, Accuracy: 76.16%
Thresh=0.0017, n=88, Accuracy: 75.80%
Thresh=0.0017, n=87, Accuracy: 75.80%
Thresh=0.0017, n=86, Accuracy: 75.80%
Thresh=0.0017, n=85, Accuracy: 75.39%
Thresh=0.0017, n=84, Accuracy: 75.23%
Thresh=0.0017, n=83, Accuracy: 75.65%
Thresh=0.0017, n=82, Accuracy: 75.23%
Thresh=0.0017, n=81, Accuracy: 75.34%
Thresh=0.0017, n=80, Accuracy: 75.28%
Thresh=0.0017, n=79, Accuracy: 74.87%
Thresh=0.0017, n=78, Accuracy: 75.49%
Thresh=0.0018, n=77, Accuracy: 76.16%
Thresh=0.0018, n=76, Accuracy: 74.97%
Thresh=0.0018, n=75, Accuracy: 75.75%
Thresh=0.00