In [16]:
import os
import random
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

# Root folder of the dataset, given relative to the current working directory.
# load_data() reads the processed/*.npy arrays beneath it.
data_dir = "../data"

In [11]:
def load_data(train_ratio = 0.7, seed = 925):
    """Load the processed training arrays and split them into train/validation parts.

    Every sample is assigned independently: it goes to the training part when
    a uniform draw between 0 and 1 is below ``train_ratio`` and to the
    validation part otherwise, so the realised proportions only approximate
    ``train_ratio``.

    The draws come from a private ``random.Random(seed)`` instance. It yields
    the same sequence as calling ``random.seed(seed)`` and then drawing from
    the module-level generator, but it does not reset that shared generator
    for the rest of the session.

    Args:
        train_ratio: Probability that a sample lands in the training part.
        seed: Seed of the private generator; the default keeps the
            train/validation split fixed from run to run.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)`` of NumPy arrays, with
        samples kept in their original file order inside each part.

    Raises:
        ValueError: If the feature and label files hold a different number
            of samples.
    """
    features = np.load(os.path.join(data_dir, "processed/train_features.npy"))
    labels = np.load(os.path.join(data_dir, "processed/train_labels.npy"))
    if len(features) != len(labels):
        raise ValueError(
            "features and labels disagree on the number of samples: "
            "{} vs {}".format(len(features), len(labels))
        )

    # One draw per sample, in file order, so the assignment is reproducible.
    rng = random.Random(seed)
    in_train = np.array(
        [rng.uniform(0, 1) < train_ratio for _ in range(len(labels))],
        dtype=bool,
    )

    # Boolean-mask indexing copies the selected rows and keeps their order.
    return features[in_train], features[~in_train], labels[in_train], labels[~in_train]

In [12]:
# Build the fixed train/validation split, then report the shape of the
# training feature matrix and of the validation label vector, one per line.
X_train, X_valid, y_train, y_valid = load_data()
print(X_train.shape, y_valid.shape, sep="\n")

(11741, 10)
(4928,)


In [19]:
# Random forest ===============================
# Hyper-parameter grid: n_estimators and max_depth are searched, while the
# single-valued entries (criterion, class_weight, random_state) stay fixed.
params = {
    'n_estimators': [20, 40, 60, 80],
    'max_depth': [12, 14, 16],
    'criterion': ['entropy'],
    "class_weight": ['balanced'],
    "random_state": [1],
}

# 5-fold cross-validated search on the training split, ranked by macro F1,
# using all available cores.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    scoring="f1_macro",
)
clf.fit(X_train, y_train)
print("Best Params:{}".format(clf.best_params_))

# GridSearchCV uses refit=True by default, so after fit() best_estimator_ is
# already a forest with the winning parameters trained on all of
# X_train / y_train. Reusing it avoids a redundant second training run and
# keeps the final model's settings from drifting away from the grid.
# NOTE(review): the recorded run picked max_depth=12, the smallest candidate,
# and shows a large train/validation gap - consider adding shallower depths.
rdf = clf.best_estimator_

def eval(clf, X_train, X_test, y_train, y_test):
    """Print accuracy and macro-averaged F1 of a fitted model on two splits.

    The training split is reported first, then the test split. For each one
    the model predicts once and both metrics are computed from that single
    prediction. Nothing is returned; the four values are only printed.

    Note: this function's name shadows Python's built-in ``eval``.

    Args:
        clf: Object exposing ``predict(X)``.
        X_train: Inputs of the training split.
        X_test: Inputs of the test split.
        y_train: True labels of the training split.
        y_test: True labels of the test split.
    """
    # (accuracy caption, F1 caption, inputs, true labels) for each split.
    report_plan = (
        ("训练集准确率：", "训练集f1：", X_train, y_train),
        ("测试集准确率：", "测试集f1：", X_test, y_test),
    )
    for accuracy_caption, f1_caption, inputs, truth in report_plan:
        guesses = clf.predict(inputs)
        print(accuracy_caption, accuracy_score(truth, guesses))
        print(f1_caption, f1_score(truth, guesses, average="macro"))

# Score the final forest on the training split and on the held-out validation split.
eval(
    clf=rdf,
    X_train=X_train,
    X_test=X_valid,
    y_train=y_train,
    y_test=y_valid,
)

Best Params:{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 60, 'random_state': 1}
训练集准确率： 0.9354399114215144
训练集f1： 0.9261176801987003
测试集准确率： 0.6749188311688312
测试集f1： 0.49886836508529564
