In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import pickle
import datetime
import time
import sys 
sys.path.append("../../")
from E4525_ML import text

In [None]:
from sklearn.ensemble import RandomForestClassifier

# helper routine to log a timestamped message
def log_message( label_string ):
    """Print a timestamped message and append the same line to train_log.txt.

    The timestamp is the current local time formatted as
    ``YYYY-mm-dd HH:MM:SS:microseconds``. The log file is opened in append
    mode in the current working directory, so earlier entries are kept.
    """
    stamp = datetime.datetime.fromtimestamp( time.time() ).strftime( '%Y-%m-%d %H:%M:%S:%f' )
    # Build the line once so the console and the file always show the same text.
    line = "{}: {}".format(stamp, label_string)
    print(line)
    with open("train_log.txt","a") as log_file:
        log_file.write(line + "\n")

def RF_subject_model(feature="set",n=-1):
    """Grid-search a random forest on pickled subject features and report its scores.

    Loads ``subject_feature/{feature}_features.p`` and
    ``subject_feature/labels.p``, optionally keeps only the first ``n`` rows,
    holds out 15% of the rows as a test set, and runs a 5-fold
    ``GridSearchCV`` over ``n_estimators`` and ``max_depth`` that is refit on
    the combination with the best mean ROC AUC. Results are printed and also
    written through ``log_message``.

    Parameters
    ----------
    feature : str
        Prefix of the feature pickle to load.
    n : int
        Number of leading rows to keep; ``-1`` (the default) keeps every row.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as produced by
        ``train_test_split``.
    """
    # load subject features
    data_dir = "subject_feature"
    features_filename = data_dir+"/{}_features.p".format(feature)
    labels_filename = data_dir+"/labels.p"

    # NOTE(review): pickle.load can execute arbitrary code stored in the file;
    # only load pickles that were produced by this project.
    # The context managers close the file handles once loading is done.
    with open(labels_filename, "rb") as labels_file:
        labels = pickle.load(labels_file)
    with open(features_filename, "rb") as features_file:
        X_set_subject = pickle.load(features_file)

    if n!=-1:
        labels=labels[:n]
        X_set_subject=X_set_subject[:n]

    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set_subject,labels,shuffle=True,random_state=42,test_size=0.15)

    log_message("----------------------------------------------------------------")
    log_message( "Start to train RF with {} feature and {} samples.".format(feature,n) )

    # Select the best n_estimators / max_depth combination by 5-fold cross-validation.
    param_grid={"n_estimators":[150, 250,300,500,1000],"max_depth":[3000, 5000,10000,20000,30000]}
    # Seed the forest as well as the split so repeated runs give the same scores.
    model=RandomForestClassifier(random_state=42)
    g=GridSearchCV(model,param_grid,cv=5, scoring=["roc_auc","accuracy"],n_jobs=-1,refit="roc_auc")
    g.fit(x_train, y_train)

    best_params = g.best_estimator_.get_params()
    best_n_estimators = best_params["n_estimators"]
    best_max_depth = best_params["max_depth"]
    cv_auc = g.cv_results_['mean_test_roc_auc']
    best_cv_auc = cv_auc.max()
    # This is the maximum over the whole grid, which need not belong to the
    # combination that was selected by ROC AUC.
    best_cv_acc = g.cv_results_['mean_test_accuracy'].max()

    print("{} feature: ".format(feature))
    print(cv_auc)
    print("The best n_estimators is",best_n_estimators)
    print("The best max_depth is",best_max_depth)
    print("The 5-fold auc of is",best_cv_auc)
    print("The 5-fold acc of is",best_cv_acc)

    # Evaluate the refit best model on the held-out rows; each metric is
    # computed once and reused for both the console and the log file.
    Y_pred_proba=g.best_estimator_.predict_proba(x_test)
    Y_pred=g.best_estimator_.predict(x_test)
    test_auc = roc_auc_score(y_test,Y_pred_proba[:,1])
    test_acc = np.mean(Y_pred==y_test)
    print("auc on test set is",test_auc)
    print("Accuracy on test set is",test_acc)

    log_message( "The best n_estimators is {}.The best max_depth is {}.".format(best_n_estimators, best_max_depth) )
    log_message( "{}".format(cv_auc) )
    log_message( "The 5-fold auc of is {}".format(best_cv_auc) )
    log_message( "The 5-fold acc of is {}".format(best_cv_acc) )
    log_message( "auc on test set is {}".format(test_auc) )
    log_message( "Accuracy on test set is {}".format(test_acc) )

    log_message( "End to train RF with {} feature and {} samples.".format(feature,n) )

    return x_train,x_test,y_train,y_test

# Train the random forest on the first 10000 samples of each feature set.
# Calling inside a loop also keeps the returned train/test split from being
# echoed as the cell's output, which a bare final call would do.
for feature_name in ("set", "count", "tfidf"):
    print("----------------------------------------------------------------")
    RF_subject_model(feature=feature_name,n=10000)

----------------------------------------------------------------
2020-04-19 20:25:55:977009: ----------------------------------------------------------------
2020-04-19 20:25:55:977224: Start to train RF with set feature and 10000 samples.


In [None]:
from xgboost import XGBClassifier

# helper routine to log a timestamped message
def log_message( label_string ):
    """Print a timestamped message and append the same line to train_log.txt.

    The timestamp is the current local time formatted as
    ``YYYY-mm-dd HH:MM:SS:microseconds``. The log file is opened in append
    mode in the current working directory, so earlier entries are kept.
    """
    stamp = datetime.datetime.fromtimestamp( time.time() ).strftime( '%Y-%m-%d %H:%M:%S:%f' )
    # Build the line once so the console and the file always show the same text.
    line = "{}: {}".format(stamp, label_string)
    print(line)
    with open("train_log.txt","a") as log_file:
        log_file.write(line + "\n")

def XGB_subject_model(feature="set",n=-1):
    """Grid-search an XGBoost classifier on pickled subject features and report its scores.

    Loads ``subject_feature/{feature}_features.p`` and
    ``subject_feature/labels.p``, optionally keeps only the first ``n`` rows,
    holds out 15% of the rows as a test set, and runs a 5-fold
    ``GridSearchCV`` over ``n_estimators`` and ``max_depth`` that is refit on
    the combination with the best mean ROC AUC. Results are printed and also
    written through ``log_message``.

    Parameters
    ----------
    feature : str
        Prefix of the feature pickle to load.
    n : int
        Number of leading rows to keep; ``-1`` (the default) keeps every row.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as produced by
        ``train_test_split``.
    """
    # load subject features
    data_dir = "subject_feature"
    features_filename = data_dir+"/{}_features.p".format(feature)
    labels_filename = data_dir+"/labels.p"

    # NOTE(review): pickle.load can execute arbitrary code stored in the file;
    # only load pickles that were produced by this project.
    # The context managers close the file handles once loading is done.
    with open(labels_filename, "rb") as labels_file:
        labels = pickle.load(labels_file)
    with open(features_filename, "rb") as features_file:
        X_set_subject = pickle.load(features_file)

    if n!=-1:
        labels=labels[:n]
        X_set_subject=X_set_subject[:n]

    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set_subject,labels,shuffle=True,random_state=42,test_size=0.15)

    log_message("----------------------------------------------------------------")
    log_message( "Start to train XGB with {} feature and {} samples.".format(feature,n) )

    # Select the best n_estimators / max_depth combination by 5-fold cross-validation.
    # NOTE(review): depths in the thousands are far outside the usual range for
    # boosted trees (the XGBoost default is 6), so the grid uses shallow
    # depths instead. The exact values are a reviewer's choice -- adjust them
    # to the experiment you want to run.
    param_grid={"n_estimators":[150, 250,300,500,1000],"max_depth":[3, 5, 7, 9, 12]}
    # Use the XGBoost estimator this function is named for; a random forest
    # here would just repeat the RF experiment under an XGB label.
    model=XGBClassifier(random_state=42)
    g=GridSearchCV(model,param_grid,cv=5, scoring=["roc_auc","accuracy"],n_jobs=-1,refit="roc_auc")
    g.fit(x_train, y_train)

    best_params = g.best_estimator_.get_params()
    best_n_estimators = best_params["n_estimators"]
    best_max_depth = best_params["max_depth"]
    cv_auc = g.cv_results_['mean_test_roc_auc']
    best_cv_auc = cv_auc.max()
    # This is the maximum over the whole grid, which need not belong to the
    # combination that was selected by ROC AUC.
    best_cv_acc = g.cv_results_['mean_test_accuracy'].max()

    print("{} feature: ".format(feature))
    print(cv_auc)
    print("The best n_estimators is",best_n_estimators)
    print("The best max_depth is",best_max_depth)
    print("The 5-fold auc of is",best_cv_auc)
    print("The 5-fold acc of is",best_cv_acc)

    # Evaluate the refit best model on the held-out rows; each metric is
    # computed once and reused for both the console and the log file.
    Y_pred_proba=g.best_estimator_.predict_proba(x_test)
    Y_pred=g.best_estimator_.predict(x_test)
    test_auc = roc_auc_score(y_test,Y_pred_proba[:,1])
    test_acc = np.mean(Y_pred==y_test)
    print("auc on test set is",test_auc)
    print("Accuracy on test set is",test_acc)

    log_message( "The best n_estimators is {}.The best max_depth is {}.".format(best_n_estimators, best_max_depth) )
    log_message( "{}".format(cv_auc) )
    log_message( "The 5-fold auc of is {}".format(best_cv_auc) )
    log_message( "The 5-fold acc of is {}".format(best_cv_acc) )
    log_message( "auc on test set is {}".format(test_auc) )
    log_message( "Accuracy on test set is {}".format(test_acc) )

    log_message( "End to train XGB with {} feature and {} samples.".format(feature,n) )

    return x_train,x_test,y_train,y_test

# Train the XGB model on the first 10000 samples of each feature set.
# Calling inside a loop also keeps the returned train/test split from being
# echoed as the cell's output, which a bare final call would do.
for feature_name in ("set", "count", "tfidf"):
    print("----------------------------------------------------------------")
    XGB_subject_model(feature=feature_name,n=10000)