using traditional machine learning methods

I. load data

In [22]:
from utils import load_cora
X_train, y_train, X_test, y_test = load_cora()

II. train

In [23]:
from IPython.display import display
import seaborn as sns
def train_test(model_name, model, X, y, X_test, y_test):
    # train the model
    model.fit(X, y)
    # calculate train set metrics
    y_pred = model.predict(X_test)
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
    confusion_matrix = confusion_matrix(y_test, y_pred, labels=[1,0])
    classification_report = classification_report(y_test, y_pred)
    accuracy_score = accuracy_score(y_test, y_pred)
    print(f'【{model_name}】')
    print(f'confusion_matrix:\n{confusion_matrix}')
    print(f'accuracy_score:\n{accuracy_score}')
    print(f'classification_report:\n{classification_report}')
    # sns.heatmap(confusion_matrix,cmap='Oranges')

1. naive bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB,BernoulliNB
train_test("naive bayes-GaussianNB",GaussianNB(), X_train, y_train, X_test, y_test)
train_test("naive bayes-BernoulliNB",BernoulliNB(), X_train, y_train, X_test, y_test)

【naive bayes-GaussianNB】
confusion_matrix:
[[55  5]
 [20 51]]
accuracy_score:
0.556
classification_report:
              precision    recall  f1-score   support

           0       0.39      0.39      0.39       130
           1       0.44      0.60      0.50        92
           2       0.73      0.63      0.68       143
           3       0.72      0.59      0.65       318
           4       0.53      0.58      0.55       149
           5       0.44      0.63      0.52       105
           6       0.44      0.32      0.37        63

    accuracy                           0.56      1000
   macro avg       0.53      0.53      0.52      1000
weighted avg       0.58      0.56      0.56      1000

【naive bayes-BernoulliNB】
confusion_matrix:
[[48  6]
 [21 49]]
accuracy_score:
0.549
classification_report:
              precision    recall  f1-score   support

           0       0.38      0.38      0.38       130
           1       0.40      0.52      0.45        92
           2       0.66  

2. decision tree

In [25]:
from sklearn.tree import DecisionTreeClassifier
# CART基于基尼系数
train_test("Decision Tree Classifier",DecisionTreeClassifier(criterion='gini'), X_train, y_train, X_test, y_test)
# ID3和C4.5基于信息熵
train_test("Decision Tree Classifier",DecisionTreeClassifier(criterion='entropy'), X_train, y_train, X_test, y_test)

【Decision Tree Classifier】
confusion_matrix:
[[26 11]
 [ 9 38]]
accuracy_score:
0.375
classification_report:
              precision    recall  f1-score   support

           0       0.28      0.29      0.28       130
           1       0.29      0.28      0.28        92
           2       0.41      0.40      0.41       143
           3       0.49      0.45      0.47       318
           4       0.38      0.40      0.39       149
           5       0.35      0.38      0.37       105
           6       0.18      0.19      0.18        63

    accuracy                           0.38      1000
   macro avg       0.34      0.34      0.34      1000
weighted avg       0.38      0.38      0.38      1000

【Decision Tree Classifier】
confusion_matrix:
[[22 12]
 [16 32]]
accuracy_score:
0.346
classification_report:
              precision    recall  f1-score   support

           0       0.21      0.25      0.22       130
           1       0.26      0.24      0.25        92
           2       0.4

3. knn

In [26]:
from sklearn.neighbors import KNeighborsClassifier
train_test("KNeighbors Classifier",KNeighborsClassifier(n_neighbors=7), X_train, y_train, X_test, y_test)

【KNeighbors Classifier】
confusion_matrix:
[[47  4]
 [11 59]]
accuracy_score:
0.603
classification_report:
              precision    recall  f1-score   support

           0       0.44      0.45      0.45       130
           1       0.53      0.51      0.52        92
           2       0.62      0.70      0.66       143
           3       0.68      0.72      0.70       318
           4       0.66      0.56      0.61       149
           5       0.52      0.64      0.58       105
           6       0.74      0.27      0.40        63

    accuracy                           0.60      1000
   macro avg       0.60      0.55      0.56      1000
weighted avg       0.61      0.60      0.60      1000



4. svm

In [27]:
from sklearn.svm import SVC # "Support Vector Classifier"
# 选取线性核函数linear
train_test("SVC-linear",SVC(kernel = 'linear'), X_train, y_train, X_test, y_test)
# # 选取非线性核函数poly(多项式核函数)和 rbf(高斯核函数)
# train_test("SVC-poly",SVC(kernel = 'poly'), X_train, y_train, X_test, y_test)
# train_test("SVC-rbf",SVC(kernel = 'rbf'), X_train, y_train, X_test, y_test)

【SVC-linear】
confusion_matrix:
[[48 10]
 [ 6 58]]
accuracy_score:
0.611
classification_report:
              precision    recall  f1-score   support

           0       0.35      0.45      0.39       130
           1       0.57      0.52      0.55        92
           2       0.71      0.71      0.71       143
           3       0.72      0.71      0.71       318
           4       0.62      0.62      0.62       149
           5       0.64      0.54      0.59       105
           6       0.55      0.43      0.48        63

    accuracy                           0.61      1000
   macro avg       0.59      0.57      0.58      1000
weighted avg       0.62      0.61      0.61      1000



5. logistic regression

In [28]:
from sklearn.linear_model import LogisticRegression
train_test("Logistic Regression",LogisticRegression(solver='newton-cg'), X_train, y_train, X_test, y_test)
train_test("Logistic Regression",LogisticRegression(solver='lbfgs'), X_train, y_train, X_test, y_test)
train_test("Logistic Regression",LogisticRegression(solver='liblinear'), X_train, y_train, X_test, y_test)
train_test("Logistic Regression",LogisticRegression(solver='sag'), X_train, y_train, X_test, y_test)

【Logistic Regression】
confusion_matrix:
[[40  7]
 [ 8 55]]
accuracy_score:
0.623
classification_report:
              precision    recall  f1-score   support

           0       0.39      0.42      0.41       130
           1       0.63      0.43      0.52        92
           2       0.71      0.72      0.71       143
           3       0.68      0.76      0.72       318
           4       0.65      0.64      0.65       149
           5       0.65      0.60      0.62       105
           6       0.49      0.40      0.44        63

    accuracy                           0.62      1000
   macro avg       0.60      0.57      0.58      1000
weighted avg       0.62      0.62      0.62      1000

【Logistic Regression】
confusion_matrix:
[[40  7]
 [ 8 55]]
accuracy_score:
0.623
classification_report:
              precision    recall  f1-score   support

           0       0.39      0.42      0.41       130
           1       0.63      0.43      0.52        92
           2       0.71      0.7

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
"""
应用最小二乘法进行一维线性回归
求导直接求得系数
不加正则化项： w = 1/(X.T@X)@X.T@y
加L2正则化项： w = 1/(X.T@X+alpha*E)@X.T@y
"""
# from
# def sigmoid(z):
#     s = 1/(1+np.exp(-z))
#     s = s.reshape(s.shape[0],1)
#     return s

'\n应用最小二乘法进行一维线性回归\n求导直接求得系数\n不加正则化项： w = 1/(X.T@X)@X.T@y\n加L2正则化项： w = 1/(X.T@X+alpha*E)@X.T@y\n'

6. gbdt

In [30]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, random_state=0)
train_test("GradientBoostingClassifier",clf, X_train, y_train, X_test, y_test)

【GradientBoostingClassifier】
confusion_matrix:
[[28  8]
 [ 4 56]]
accuracy_score:
0.579
classification_report:
              precision    recall  f1-score   support

           0       0.44      0.43      0.43       130
           1       0.72      0.30      0.43        92
           2       0.68      0.63      0.65       143
           3       0.59      0.81      0.68       318
           4       0.57      0.53      0.55       149
           5       0.62      0.60      0.61       105
           6       0.20      0.06      0.10        63

    accuracy                           0.58      1000
   macro avg       0.54      0.48      0.49      1000
weighted avg       0.57      0.58      0.56      1000



7. xgb

In [31]:
from xgboost import XGBClassifier
train_test("XGBClassifier",XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=4,
 min_child_weight=6,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=0.005,
 objective= 'binary:logistic',
 nthread=4,
 seed=27), X_train, y_train, X_test, y_test)

【XGBClassifier】
confusion_matrix:
[[41  4]
 [ 5 60]]
accuracy_score:
0.63
classification_report:
              precision    recall  f1-score   support

           0       0.47      0.46      0.46       130
           1       0.72      0.45      0.55        92
           2       0.68      0.68      0.68       143
           3       0.66      0.81      0.73       318
           4       0.62      0.61      0.61       149
           5       0.65      0.63      0.64       105
           6       0.53      0.30      0.38        63

    accuracy                           0.63      1000
   macro avg       0.62      0.56      0.58      1000
weighted avg       0.63      0.63      0.62      1000



8. lgbm

In [32]:
from lightgbm import LGBMClassifier
# 构建 LGBM 分类器模型
train_test("LGBMClassifier",LGBMClassifier(
    n_estimators=700,
    learning_rate=0.01,
    lambda_l1=0.6,
    lamda_l2=0,
    cat_smooth=1,
    max_bin=25,
    min_data_in_leaf=71,
    feature_fraction=0.7,
    bagging_fraction=1.0,
    bagging_freq=6,
    num_leaves=42,
    max_depth=10), X_train, y_train, X_test, y_test)

【LGBMClassifier】
confusion_matrix:
[[32  8]
 [ 4 52]]
accuracy_score:
0.605
classification_report:
              precision    recall  f1-score   support

           0       0.42      0.40      0.41       130
           1       0.71      0.35      0.47        92
           2       0.71      0.69      0.70       143
           3       0.60      0.81      0.69       318
           4       0.63      0.60      0.62       149
           5       0.62      0.57      0.60       105
           6       0.61      0.22      0.33        63

    accuracy                           0.60      1000
   macro avg       0.61      0.52      0.54      1000
weighted avg       0.61      0.60      0.59      1000



In [33]:
# # GridSearch调参
# import pandas as pd
# import lightgbm as lgb
# from sklearn.model_selection import GridSearchCV # Perforing grid search
#
# parameters = {
#               'max_depth': [15, 20, 25, 30, 35],
#               'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
#               'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
#               'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
#               'bagging_freq': [2, 4, 5, 6, 8],
#               'lambda_l1': [0, 0.1, 0.4, 0.5, 0.6],
#               'lambda_l2': [0, 10, 15, 35, 40],
#               'cat_smooth': [1, 10, 15, 20, 35]
# }
# gbm = lgb.LGBMClassifier()
# gsearch = GridSearchCV(gbm, param_grid=parameters, scoring='accuracy', cv=3)
# gsearch.fit(X_train, y_train)
#
# print("Best score: %0.3f" % gsearch.best_score_)
# print("Best parameters set:")
# best_parameters = gsearch.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

MaxAcurate-XGB-0.63 < GCN/GAT/GraphSAGE