# 相关设置

In [83]:
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

InteractiveShell.ast_node_interactivity = "all"  # 一个cell显示多输出

pd.set_option('display.max_rows', 100)  # 设置最大显示100行
pd.set_option('display.max_columns', 100)  # 设置最大显示100列

In [84]:
EDA_data = pd.read_csv('data/after_EDA/EDA_data.csv')
print(EDA_data.shape)

(45211, 9)


In [85]:
EDA_data['is_pass'].value_counts()

1    31427
0    13784
Name: is_pass, dtype: int64

In [86]:
EDA_data.head()

Unnamed: 0,test_type,difficulty_level,education,city_tier,is_handicapped,trainee_engagement_rating,is_pass,gender_F,gender_M
0,2,2,3,3.0,1,1.0,0,0,1
1,2,1,2,4.0,1,3.0,1,1,0
2,1,1,3,1.0,1,2.0,1,0,1
3,2,1,3,3.0,1,1.0,1,1,0
4,2,2,2,1.0,1,4.0,1,1,0


In [87]:
pos_data = EDA_data[EDA_data['is_pass'] == 1].values
neg_data = EDA_data[EDA_data['is_pass'] == 0].values

# SMOTE 算法

In [88]:
import random
from sklearn.neighbors import NearestNeighbors

class Smote:
    def __init__(self, samples, N=10, k=5):
        self.n_samples, self.n_attrs = samples.shape
        self.N = N
        self.k = k
        self.samples = samples
        self.new_index = 0

    def over_sampling(self):
        N_ = int(self.N / 100)
        self.synthetic = np.zeros((self.n_samples * N_, self.n_attrs))
        neighbors = NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        # print('neighbors', neighbors)
        for i in range(len(self.samples)):
            # print('samples', self.samples[i])
            # Finds the K-neighbors of a point.
            nnarray = neighbors.kneighbors(self.samples[i].reshape((1, -1)),
                                           return_distance=False)[0]
            # print('nna', nnarray)
            self._populate(N_, i, nnarray)
        return self.synthetic

    # for each minority class sample i ,choose N of the k nearest neighbors and generate N synthetic samples.
    def _populate(self, N, i, nnarray):
        for j in range(N):
            # print('j', j)
            nn = random.randint(0, self.k - 1)  # 包括end
            dif = self.samples[nnarray[nn]] - self.samples[i]
            gap = random.random()
            self.synthetic[self.new_index] = self.samples[i] + gap * dif
            self.new_index += 1
            # print(self.new_index)

In [89]:
smote = Smote(neg_data, N=100)
smote_neg_data = smote.over_sampling()

In [90]:
new_neg_data = np.vstack((neg_data, smote_neg_data))

In [91]:
len(new_neg_data)
len(pos_data)

27568

31427

In [92]:
new_data = pd.DataFrame(np.vstack((new_neg_data, pos_data)), columns=EDA_data.columns)

In [93]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(new_data, test_size=0.2, random_state=412)

print(train_set.shape)
print(test_set.shape)

(47196, 9)
(11799, 9)


In [94]:
X_train = train_set[train_set.columns.difference(['is_pass'])].values
y_train = train_set[['is_pass']].values
X_test = test_set[test_set.columns.difference(['is_pass'])].values
y_test = test_set[['is_pass']].values

# 评估方法

In [95]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# 随机森林

In [73]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_model.fit(X_train, y_train.ravel())
y_predict = rf_model.predict(X=X_test)

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))

RandomForestClassifier(n_estimators=10, n_jobs=1)

accuracy_score:  0.6966113259040255
confusion_matrix:  [[1000 1684]
 [ 984 5126]]
roc_auc_score:  0.6057653891277889
classification_report:                precision    recall  f1-score   support

           0       0.50      0.37      0.43      2684
           1       0.75      0.84      0.79      6110

    accuracy                           0.70      8794
   macro avg       0.63      0.61      0.61      8794
weighted avg       0.68      0.70      0.68      8794



# LightGBM

In [68]:
import lightgbm as lgb

trn_data = lgb.Dataset(X_train, y_train)
val_data = lgb.Dataset(X_test, y_test)

params = {'num_leaves': 60, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  #提取的特征比率
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,             #l1正则
          # 'lambda_l2': 0.001,     #l2正则
          "verbosity": -1,
          "nthread": -1,                #线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},  ##评价函数选择
          "random_state": 2019, #随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          }

lgb_model = lgb.train(params, 
                      trn_data, 
                      num_boost_round = 1000,
                      valid_sets = [trn_data,val_data], 
                      verbose_eval = 100, 
                      early_stopping_rounds = 100)

y_predict = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

y_predict[y_predict > 0.5] = 1
y_predict[y_predict <= 0.5] = 0

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))



Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.69747	training's binary_logloss: 0.629396	valid_1's auc: 0.687087	valid_1's binary_logloss: 0.635841
[200]	training's auc: 0.702152	training's binary_logloss: 0.62491	valid_1's auc: 0.688901	valid_1's binary_logloss: 0.633719
[300]	training's auc: 0.704309	training's binary_logloss: 0.622974	valid_1's auc: 0.689189	valid_1's binary_logloss: 0.633279
[400]	training's auc: 0.705683	training's binary_logloss: 0.621743	valid_1's auc: 0.689326	valid_1's binary_logloss: 0.633154
[500]	training's auc: 0.706609	training's binary_logloss: 0.620911	valid_1's auc: 0.689569	valid_1's binary_logloss: 0.632969
[600]	training's auc: 0.707351	training's binary_logloss: 0.620215	valid_1's auc: 0.689518	valid_1's binary_logloss: 0.633025
Early stopping, best iteration is:
[502]	training's auc: 0.706636	training's binary_logloss: 0.620895	valid_1's auc: 0.689557	valid_1's binary_logloss: 0.632963
accuracy_score:  0.6410

## XGBoost

In [96]:
from xgboost import XGBClassifier


xgb_model = XGBClassifier(learning_rate =0.1, 
                          n_estimators=1000,
                          use_label_encoder=False,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective= 'binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          random_state=412)

xgb_model.fit(X_train, y_train.ravel())
y_predict = xgb_model.predict(X_test)

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=412, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

accuracy_score:  0.6420035596236969
confusion_matrix:  [[3236 2323]
 [1901 4339]]
roc_auc_score:  0.6387358251345704
classification_report:                precision    recall  f1-score   support

         0.0       0.63      0.58      0.61      5559
         1.0       0.65      0.70      0.67      6240

    accuracy                           0.64     11799
   macro avg       0.64      0.64      0.64     11799
weighted avg       0.64      0.64      0.64     11799



# SVM

In [117]:
from sklearn import svm

svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train.ravel())

y_predict = svm_clf.predict(X_test)

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))

SVC()

accuracy_score:  0.6947919035706164
confusion_matrix:  [[   0 2684]
 [   0 6110]]
roc_auc_score:  0.5
classification_report:                precision    recall  f1-score   support

           0       0.00      0.00      0.00      2684
           1       0.69      1.00      0.82      6110

    accuracy                           0.69      8794
   macro avg       0.35      0.50      0.41      8794
weighted avg       0.48      0.69      0.57      8794



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# A榜提交数据生成

In [97]:
test_data = pd.read_csv('data/test/test1.csv')
test_data.head()

Unnamed: 0,id_num,program_type,program_id,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
0,1442_124,Y,Y_4,121,124,online,easy,1442,M,High School Diploma,2,,8,N,4.0,
1,12373_38,T,T_2,117,38,online,easy,12373,F,High School Diploma,4,31.0,2,N,3.0,
2,12035_21,T,T_3,134,21,offline,easy,12035,F,High School Diploma,4,,2,N,1.0,
3,11148_86,V,V_3,131,86,offline,vary hard,11148,M,Matriculation,1,,2,N,1.0,
4,1362_131,Y,Y_3,135,131,online,easy,1362,M,High School Diploma,2,45.0,2,N,4.0,


# 去除无用列

In [98]:
test_data.drop(['program_duration', 'program_type', 'id_num', 'program_id', 'test_id', 'trainee_id', 'age', 'total_programs_enrolled'], inplace=True, axis=1)

# 缺失值填充

In [99]:
# test_data['age'].fillna(test_data['age'].mean(), inplace=True)

# 独热编码

In [100]:
test_data = test_data.join(pd.get_dummies(test_data[["gender"]]))

# 去除独热编码的列

In [101]:
test_data.drop(['gender'], inplace=True, axis=1)

# 特征编码

In [102]:
test_type_map = {'online': 1, 'offline': 2}
difficulty_level_map = {'easy': 1, 'intermediate': 2, 'hard': 3, 'vary hard': 4}
education_map = {'No Qualification': 1, 'High School Diploma': 2, 'Matriculation': 3, 'Bachelors': 4, 'Masters': 5}
is_handicapped_map = {'N': 1, 'Y': 2}

In [103]:
test_data['test_type'] = test_data['test_type'].map(test_type_map)
test_data['difficulty_level'] = test_data['difficulty_level'].map(difficulty_level_map)
test_data['education'] = test_data['education'].map(education_map)
test_data['is_handicapped'] = test_data['is_handicapped'].map(is_handicapped_map)

In [104]:
test_data.drop(['is_pass'], axis=1, inplace=True)

In [105]:
test_data

Unnamed: 0,test_type,difficulty_level,education,city_tier,is_handicapped,trainee_engagement_rating,gender_F,gender_M
0,1,1,2,2,1,4.0,0,1
1,1,1,2,4,1,3.0,1,0
2,2,1,2,4,1,1.0,1,0
3,2,4,3,1,1,1.0,0,1
4,1,1,2,2,1,4.0,0,1
...,...,...,...,...,...,...,...,...
11679,2,1,2,3,1,2.0,1,0
11680,1,1,2,2,1,2.0,1,0
11681,1,1,3,2,1,4.0,0,1
11682,2,2,2,3,1,2.0,0,1


In [106]:
X_submit = test_data.values

In [110]:
y_submit = xgb_model.predict(X_submit)
y_submit[y_submit > 0.5] = 1
y_submit[y_submit <= 0.5] = 0

In [111]:
print(y_submit)

[1 1 1 ... 1 1 1]


In [112]:
id_num = pd.read_csv('data/test/test1.csv')['id_num']

In [113]:
submission = pd.DataFrame({
        "id_num": id_num,
        "is_pass": y_submit
    })
submission.to_csv('data/submission_Giyn.csv', index=False)