# 相关设置

In [315]:
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

InteractiveShell.ast_node_interactivity = "all"  # 一个cell显示多输出

pd.set_option('display.max_rows', 100)  # 设置最大显示100行
pd.set_option('display.max_columns', 100)  # 设置最大显示100列

In [316]:
EDA_data = pd.read_csv('data/after_EDA/EDA_data.csv')
print(EDA_data.shape)

(45211, 8)


In [317]:
EDA_data['is_pass'].value_counts()

1    31427
0    13784
Name: is_pass, dtype: int64

In [318]:
EDA_data.head()

Unnamed: 0,test_type,difficulty_level,gender,education,city_tier,is_handicapped,trainee_engagement_rating,is_pass
0,1,2,0,3,3.0,0,1.0,0
1,1,1,1,2,4.0,0,3.0,1
2,0,1,0,3,1.0,0,2.0,1
3,1,1,1,3,3.0,0,1.0,1
4,1,2,1,2,1.0,0,4.0,1


In [319]:
pos_data = EDA_data[EDA_data['is_pass'] == 1].values
neg_data = EDA_data[EDA_data['is_pass'] == 0].values

# SMOTE 算法

In [320]:
import random
from sklearn.neighbors import NearestNeighbors

class Smote:
    def __init__(self, samples, N=10, k=5):
        self.n_samples, self.n_attrs = samples.shape
        self.N = N
        self.k = k
        self.samples = samples
        self.new_index = 0

    def over_sampling(self):
        N_ = int(self.N / 100)
        self.synthetic = np.zeros((self.n_samples * N_, self.n_attrs))
        neighbors = NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        # print('neighbors', neighbors)
        for i in range(len(self.samples)):
            # print('samples', self.samples[i])
            # Finds the K-neighbors of a point.
            nnarray = neighbors.kneighbors(self.samples[i].reshape((1, -1)),
                                           return_distance=False)[0]
            # print('nna', nnarray)
            self._populate(N_, i, nnarray)
        return self.synthetic

    # for each minority class sample i ,choose N of the k nearest neighbors and generate N synthetic samples.
    def _populate(self, N, i, nnarray):
        for j in range(N):
            # print('j', j)
            nn = random.randint(0, self.k - 1)  # 包括end
            dif = self.samples[nnarray[nn]] - self.samples[i]
            gap = random.random()
            self.synthetic[self.new_index] = self.samples[i] + gap * dif
            self.new_index += 1
            # print(self.new_index)

In [321]:
# smote = Smote(neg_data, N=100)
# smote_neg_data = smote.over_sampling()

In [322]:
# new_neg_data = np.vstack((neg_data, smote_neg_data))

In [323]:
# len(new_neg_data)
# len(pos_data)

In [324]:
# new_data = pd.DataFrame(np.vstack((new_neg_data, pos_data)), columns=EDA_data.columns)

In [325]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(EDA_data, test_size=0.2, random_state=412)

print(train_set.shape)
print(test_set.shape)

(36168, 8)
(9043, 8)


In [326]:
X_train = train_set[train_set.columns.difference(['is_pass'])].values
y_train = train_set[['is_pass']].values
X_test = test_set[test_set.columns.difference(['is_pass'])].values
y_test = test_set[['is_pass']].values

# 评估方法

In [327]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# 随机森林

In [224]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_model.fit(X_train, y_train.ravel())
y_predict = rf_model.predict(X=X_test)

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))

RandomForestClassifier(n_estimators=10, n_jobs=1)

accuracy_score:  0.6978621787582443
confusion_matrix:  [[ 993 1691]
 [ 966 5144]]
roc_auc_score:  0.6059343603727978
classification_report:                precision    recall  f1-score   support

           0       0.51      0.37      0.43      2684
           1       0.75      0.84      0.79      6110

    accuracy                           0.70      8794
   macro avg       0.63      0.61      0.61      8794
weighted avg       0.68      0.70      0.68      8794



# LightGBM

In [225]:
import lightgbm as lgb

trn_data = lgb.Dataset(X_train, y_train)
val_data = lgb.Dataset(X_test, y_test)

params = {'num_leaves': 60, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  #提取的特征比率
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,             #l1正则
          # 'lambda_l2': 0.001,     #l2正则
          "verbosity": -1,
          "nthread": -1,                #线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},  ##评价函数选择
          "random_state": 2019, #随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          }

lgb_model = lgb.train(params, 
                      trn_data, 
                      num_boost_round = 1000,
                      valid_sets = [trn_data,val_data], 
                      verbose_eval = 100, 
                      early_stopping_rounds = 100)

y_predict = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

y_predict[y_predict > 0.5] = 1
y_predict[y_predict <= 0.5] = 0

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))



Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.732365	training's binary_logloss: 0.544026	valid_1's auc: 0.700284	valid_1's binary_logloss: 0.56074
[200]	training's auc: 0.745711	training's binary_logloss: 0.532884	valid_1's auc: 0.702849	valid_1's binary_logloss: 0.559267
[300]	training's auc: 0.754831	training's binary_logloss: 0.525873	valid_1's auc: 0.703734	valid_1's binary_logloss: 0.559172
Early stopping, best iteration is:
[235]	training's auc: 0.749223	training's binary_logloss: 0.530184	valid_1's auc: 0.703482	valid_1's binary_logloss: 0.559041
accuracy_score:  0.721287241300887
confusion_matrix:  [[ 779 1905]
 [ 546 5564]]
roc_auc_score:  0.600438373973428
classification_report:                precision    recall  f1-score   support

           0       0.59      0.29      0.39      2684
           1       0.74      0.91      0.82      6110

    accuracy                           0.72      8794
   macro avg       0.67      0.60      0.60

## XGBoost

In [None]:
from xgboost import XGBClassifier


xgb_model = XGBClassifier(learning_rate =0.05, 
                          n_estimators=1000,
                          use_label_encoder=False,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective= 'binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          random_state=412)

xgb_model.fit(X_train, y_train.ravel())
y_predict = xgb_model.predict(X_test)

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))



# A榜提交数据生成

In [None]:
test_data = pd.read_csv('data/test/test1.csv')
test_data.head()

# 去除无用列

In [None]:
delete_list = ['program_duration', 'program_type', 'id_num', 'program_id', 'test_id', 'trainee_id', 'age', 'total_programs_enrolled']

In [None]:
test_data.drop(delete_list, inplace=True, axis=1)

# 缺失值填充

In [None]:
# test_data['age'].fillna(test_data['age'].mean(), inplace=True)

# 独热编码

In [None]:
# test_data = test_data.join(pd.get_dummies(test_data[["gender"]]))

# 去除独热编码的列

In [None]:
# test_data.drop(['gender'], inplace=True, axis=1)

# 特征编码

In [None]:
test_type_map = {'online': 0, 'offline': 1}
difficulty_level_map = {'easy': 1, 'intermediate': 2, 'hard': 3, 'vary hard': 4}
education_map = {'No Qualification': 1, 'High School Diploma': 2, 'Matriculation': 3, 'Bachelors': 4, 'Masters': 5}
is_handicapped_map = {'N': 0, 'Y': 1}
gender_map = {'M': 0, 'F': 1}

In [None]:
test_data['test_type'] = test_data['test_type'].map(test_type_map)
test_data['difficulty_level'] = test_data['difficulty_level'].map(difficulty_level_map)
test_data['education'] = test_data['education'].map(education_map)
test_data['is_handicapped'] = test_data['is_handicapped'].map(is_handicapped_map)
test_data['gender'] = test_data['gender'].map(gender_map)

In [None]:
test_data.drop(['is_pass'], axis=1, inplace=True)

In [None]:
test_data

In [None]:
X_submit = test_data.values

In [None]:
y_submit = xgb_model.predict(X_submit)
y_submit[y_submit > 0.5] = 1
y_submit[y_submit <= 0.5] = 0

In [None]:
print(y_submit)

In [None]:
id_num = pd.read_csv('data/test/test1.csv')['id_num']

In [None]:
submission = pd.DataFrame({
        "id_num": id_num,
        "is_pass": y_submit
    })
submission.to_csv('data/submission_Giyn.csv', index=False)