In [1]:
import pandas as pd
import numpy as np
import sklearn
from time import time
import pickle
import matplotlib.pyplot as plt
import torch
import random
import re
import nltk
from collections import Counter
from nltk.corpus import words
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV

from transformers import (
                BertConfig, BertModel, BertTokenizer,
              XLNetConfig, XLNetModel, XLNetTokenizer)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set the seed value all over the place to make this reproducible.
def setup_seed(seed):
    """Seed every random number generator this notebook touches.

    Seeds PyTorch (CPU, the current CUDA device and all CUDA devices),
    NumPy's global generator and Python's ``random`` module with the same
    value, then switches cuDNN to deterministic mode with auto-tuning off.

    :param seed: integer seed applied to all generators
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,              # NumPy global generator
        random.seed,                 # Python random module
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


setup_seed(42)

In [3]:
# Central experiment configuration: dataset paths, tokenisation length,
# pretrained checkpoint names, and each classifier together with the
# hyper-parameter grid that is searched for it.
CONFIG = {
    # Dataset locations
    'A_train_path': "datasets/train/SemEval2018-T3-train-taskA_emoji.txt",
    'A_test_path': "datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt",
    'B_train_path': "datasets/train/SemEval2018-T3-train-taskB_emoji.txt",
    'B_test_path': "datasets/goldtest_TaskB/SemEval2018-T3_gold_test_taskB_emoji.txt",

    # Maximum token sequence length handed to the tokenizers
    'max_len': 128,

    # Pretrained checkpoints used as feature extractors
    'bert_models': ['bert-base-uncased', 'bert-base-cased', 'bert-large-uncased', 'bert-large-cased'],
    'xlnet_models': ['xlnet-base-cased', 'xlnet-large-cased'],

    # (classifier key, parameter-grid key) pairs; both are keys of CONFIG
    'clfs': [('clf_RF', 'param_RF'), ('clf_NB', 'param_NB'),
             ('clf_LR', 'param_LR'), ('clf_SVM', 'param_SVM')],

    # Random Forest classifier
    'param_RF': {
        'n_estimators': [100, 1000],
        'max_depth': [10, 50, 100, None],
    },
    'clf_RF': RandomForestClassifier(oob_score=True,
                                     random_state=1, verbose=1, n_jobs=-1),

    # Gaussian Naive Bayes classifier (nothing to tune, hence the empty grid)
    'param_NB': {},
    'clf_NB': GaussianNB(),

    # Logistic Regression classifier
    'param_LR': {
        'solver': ['liblinear', 'lbfgs', 'newton-cg'],
        'C': np.arange(0.01, 0.2, 0.01),
        'penalty': ['l2'],
    },
    'clf_LR': LogisticRegression(random_state=1, verbose=1, n_jobs=-1),

    # SVM classifier
    'param_SVM': {
        'C': np.arange(5, 11, 1),
        'gamma': [0.01, 0.1, 1, 10],
        # Finer alternative grid that is currently disabled:
        #     'gamma': np.arange(0.05, 0.15, 0.02),
        'kernel': ['rbf'],
    },
    'clf_SVM': SVC(verbose=1),
}

In [4]:
# Utility functions shared by the experiment cells below; run this cell before any of them.

# get score
def print_score(true, predicted, task='A'):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    :param true: iterable of the true class labels
    :param predicted: iterable of the predicted labels
    :param task: "A" scores the positive class 1 out of labels [0, 1];
        "B" macro-averages over labels [0, 1, 2, 3]
    :raises ValueError: if ``task`` is neither "A" nor "B"
    """
    # Pick the scoring setup first so an unknown task fails with a clear
    # message instead of an UnboundLocalError when the scores are printed.
    if task == "A":
        scorer_kwargs = {"labels": [0, 1], "pos_label": 1}
    elif task == "B":
        scorer_kwargs = {"labels": [0, 1, 2, 3]}
    else:
        raise ValueError("Unknown task {!r}; expected 'A' or 'B'.".format(task))

    acc = calc_accuracy(true, predicted)
    p, r, f = precision_recall_fscore(true, predicted, beta=1, **scorer_kwargs)
    print("Accuracy:{0}\nPrecision:{1}\nRecall:{2}\nF1-score:{3}\n".format(acc, p,r,f))
            

def calc_accuracy(true, predicted):
    """Return the fraction of positions where the prediction equals the true label.

    The pairs are formed with ``zip`` and the count of matches is divided by
    ``len(true)``, so an empty ``true`` raises ZeroDivisionError.
    """
    matches = (gold == guess for gold, guess in zip(true, predicted))
    return sum(matches) / float(len(true))


def precision_recall_fscore(true, predicted, beta=1, labels=None, pos_label=None, average=None, each=None):
    """Calculates the precision, recall and F-score of a classifier.

    :param true: iterable of the true class labels
    :param predicted: iterable of the predicted labels
    :param beta: the beta value for F-score calculation
    :param labels: iterable containing the possible class labels; when None,
        the labels are collected from ``true`` and ``predicted`` in order of
        first appearance
    :param pos_label: the positive label (i.e. 1 label for binary
        classification); any value other than None, including 0, selects the
        scores of that single class
    :param average: accepted for interface compatibility but not used;
        multi-class scores are always macro-averaged
    :param each: when truthy, return only the list of per-class F-scores,
        ordered like ``labels``
    :return: ``(precision, recall, fscore)`` for ``pos_label`` if given,
        otherwise the macro average over all labels; a list of F-scores when
        ``each`` is truthy
    :raises KeyError: if a label occurring in ``true``/``predicted`` (or
        ``pos_label``) is not contained in ``labels``
    """
    # Materialise the inputs so they can be scanned for labels and then
    # zipped, even when the caller passes one-shot iterators.
    true = list(true)
    predicted = list(predicted)
    if labels is None:
        labels = list(dict.fromkeys(true + predicted))
    else:
        labels = list(labels)

    # Build contingency table as ldict
    ldict = {l: {"tp": 0., "fp": 0., "fn": 0., "support": 0.} for l in labels}

    for t, p in zip(true, predicted):
        if t == p:
            ldict[t]["tp"] += 1
        else:
            ldict[t]["fn"] += 1
            ldict[p]["fp"] += 1
        ldict[t]["support"] += 1

    # Calculate precision, recall and F-beta score per class; a zero
    # denominator yields a score of 0.0 for that class.
    beta2 = beta ** 2
    for d in ldict.values():
        predicted_positive = d["tp"] + d["fp"]
        actual_positive = d["tp"] + d["fn"]
        d["precision"] = d["tp"] / predicted_positive if predicted_positive else 0.0
        d["recall"] = d["tp"] / actual_positive if actual_positive else 0.0
        denominator = beta2 * d["precision"] + d["recall"]
        d["fscore"] = (1 + beta2) * (d["precision"] * d["recall"]) / denominator if denominator else 0.0

    if each:
        return [ldict[l]["fscore"] for l in labels]

    # If there is only 1 label of interest, return the scores. No averaging needs to be done.
    # Compare against None explicitly: label 0 is a legitimate positive label.
    if pos_label is not None:
        d = ldict[pos_label]
        return (d["precision"], d["recall"], d["fscore"])

    # If there are multiple labels of interest, macro-average scores.
    if not ldict:
        return (0.0, 0.0, 0.0)
    n_labels = len(ldict)
    avg_precision = sum(d["precision"] for d in ldict.values()) / n_labels
    avg_recall = sum(d["recall"] for d in ldict.values()) / n_labels
    avg_fscore = sum(d["fscore"] for d in ldict.values()) / n_labels
    return (avg_precision, avg_recall, avg_fscore)

# get ids and masks
def get_ids_mask(sents, tokenizer, max_len=None):
    """Encode sentences and collect their input ids and attention masks.

    Each sentence is passed to ``tokenizer.encode_plus`` with special tokens
    enabled, ``max_length=max_len`` and ``pad_to_max_length='right'``.

    :param sents: iterable of sentences to encode
    :param tokenizer: object exposing ``encode_plus`` whose result has the
        keys ``'input_ids'`` and ``'attention_mask'``
    :param max_len: value forwarded as ``max_length``
    :return: tuple ``(input_ids, attention_masks)`` of two parallel lists,
        one entry per sentence
    """
    input_ids = []
    attention_masks = []

    for sentence in sents:
        encoded = tokenizer.encode_plus(
            sentence,
            max_length=max_len,
            add_special_tokens=True,
            pad_to_max_length='right',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return input_ids, attention_masks

# format time for training time
def format_time(elapsed):
    '''
    Takes a time in seconds and returns it as a string h:mm:ss.

    The value is rounded to the nearest second and rendered through
    ``str(datetime.timedelta)``, so the hour field is not zero-padded.

    :param elapsed: duration in seconds (int or float)
    :return: the formatted duration string
    '''
    # Imported locally because the notebook's import cell only brings in
    # `time` from the time module; without this the call raises NameError.
    import datetime

    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))

    # Format as h:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

def get_embedding(model_type, ids, masks, model, batch_size=None):
    """Return one sentence vector per row, taken from the model's first output.

    The model is called as ``model(ids, attention_mask=masks)`` with gradient
    tracking disabled, and the hidden state at the classification-token
    position is extracted from element ``[0]`` of its result.

    :param model_type: 'BERT' (vector at position 0) or 'XLNet' (vector at
        the last position)
    :param ids: tensor of token ids, one row per sentence
    :param masks: attention-mask tensor matching ``ids``
    :param model: callable accepting ``(ids, attention_mask=masks)``
    :param batch_size: optional number of rows per forward pass; None (the
        default) runs all rows in a single pass
    :return: NumPy array with one row per input row
    :raises ValueError: if ``model_type`` is unsupported or ``batch_size`` is
        smaller than 1
    """
    # Validate before the (expensive) forward pass rather than returning
    # None afterwards.
    # for BERT, CLS token is at the beginning.
    # for XLNet, CLS token is at the last.
    # NOTE(review): the XLNet case assumes the tokenizer pads on the left so
    # that the final position is a real token - confirm against the tokenizer.
    if model_type == 'BERT':
        cls_index = 0
    elif model_type == 'XLNet':
        cls_index = -1
    else:
        raise ValueError(
            "Unsupported model_type {!r}; expected 'BERT' or 'XLNet'.".format(model_type))

    if batch_size is None:
        chunks = [(ids, masks)]
    else:
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None.")
        chunks = [(ids[start:start + batch_size], masks[start:start + batch_size])
                  for start in range(0, ids.shape[0], batch_size)]
        if not chunks:
            # Zero rows: fall back to a single pass over the empty input.
            chunks = [(ids, masks)]

    pooled = []
    with torch.no_grad():
        for chunk_ids, chunk_masks in chunks:
            last_hidden_states = model(chunk_ids, attention_mask=chunk_masks)[0]
            # .cpu() is a no-op on CPU tensors and makes .numpy() safe otherwise.
            pooled.append(last_hidden_states[:, cls_index, :].cpu())

    if len(pooled) == 1:
        return pooled[0].numpy()
    return torch.cat(pooled, dim=0).numpy()
    

def score_optimization(clf, params, scoring, X_f, X_l, y_f, y_l, task='A', best=None):
    """Grid-search a classifier on the training data and report test scores.

    Note on the parameter names: the ``X_*`` arguments are the training set
    and the ``y_*`` arguments are the test set; ``*_f`` are features and
    ``*_l`` are labels.

    :param clf: estimator handed to ``GridSearchCV``
    :param params: parameter grid for the search
    :param scoring: scoring name used to select the best parameters
    :param X_f: training features
    :param X_l: training labels
    :param y_f: test features
    :param y_l: test labels
    :param task: task identifier forwarded to ``print_score``
    :param best: when truthy, additionally print the per-class F-scores for
        labels [0, 1, 2, 3]
    """
    search = GridSearchCV(estimator=clf, param_grid=params,
            scoring=scoring, n_jobs=-1).fit(X_f, X_l)
    print("Classifier:", clf.__class__.__name__)
    print("Best parameters:",search.best_params_)
    print("Best score:", search.best_score_)

    # Predict the test set once and reuse it for both reports instead of
    # running the best estimator a second time when `best` is set.
    test_predictions = search.best_estimator_.predict(y_f)
    print_score(y_l, test_predictions, task=task)
    if best:
        print(precision_recall_fscore(y_l, test_predictions,
                                beta=1, labels=[0,1,2,3], each=True))

In [5]:
def run_embedding(task='A', model_type='BERT', verbose=False):
    """Evaluate every configured classifier on embeddings from each checkpoint.

    For each checkpoint listed in CONFIG for ``model_type``, the sentences are
    tokenised, embedded with the pretrained model, and every classifier in
    ``CONFIG['clfs']`` is grid-searched (scoring ``'f1_macro'``) on the
    training embeddings and scored on the test embeddings.

    :param task: 'A' or 'B'; selects the dataset paths in CONFIG and the
        scoring mode
    :param model_type: 'BERT' or 'XLNet'
    :param verbose: accepted but currently not used
    :raises ValueError: if ``model_type`` is not supported
    """
    # Resolve the checkpoint list first so an unsupported model_type fails
    # immediately instead of loading the data and then doing nothing.
    if model_type == 'BERT':
        model_names = CONFIG['bert_models']
    elif model_type == 'XLNet':
        model_names = CONFIG['xlnet_models']
    else:
        raise ValueError(
            "Unsupported model_type {!r}; expected 'BERT' or 'XLNet'.".format(model_type))

    df = pd.read_csv(CONFIG[task+'_train_path'], delimiter='\t', index_col=0)
    df_test = pd.read_csv(CONFIG[task+'_test_path'], delimiter='\t', index_col=0)
    
    print('Training Dataset has {} sentences.'.format(df.shape[0]))
    print('Test Dataset has {} sentences.'.format(df_test.shape[0]))
    print('Running word embedding  method for Task ' + task + ' on ' + model_type)
    
    # data preprocessing
    # The same pre-normalised sentence pickle is used for both tasks; only
    # the labels come from the task-specific files.
    # NOTE: pickle.load can execute arbitrary code - only load a trusted file.
    with open('normalized_sents.pickle', 'rb') as f:
        tv_sents, test_sents = pickle.load(f)
    tv_labels = df['Label'].values
    test_labels = df_test['Label'].values
        
    #initialize tokenizer
    tokenizers = {}
    for name in model_names:
        if model_type == 'BERT':
            if 'uncased' in name:
                tokenizers[name] =  BertTokenizer.from_pretrained(name, do_lower_case=True)
            else:
                tokenizers[name] =  BertTokenizer.from_pretrained(name)
        else:
            tokenizers[name] = XLNetTokenizer.from_pretrained(name)
    
    for name, tokenizer in tokenizers.items():
        tv_ids, tv_masks = get_ids_mask(tv_sents, tokenizer, max_len=CONFIG['max_len'])
        test_ids, test_masks = get_ids_mask(test_sents, tokenizer, max_len=CONFIG['max_len'])
        tv_ids = torch.tensor(tv_ids)
        tv_masks = torch.tensor(tv_masks)
        test_ids = torch.tensor(test_ids)
        test_masks = torch.tensor(test_masks)
        
        # initialize model
        if model_type == 'BERT':
            model = BertModel.from_pretrained(name)
        else:
            model = XLNetModel.from_pretrained(name)
        
        train_features = get_embedding(model_type, tv_ids, tv_masks, model)
        test_features = get_embedding(model_type, test_ids, test_masks, model)
        
        print("The {} model's result for task {} is:".format(name, task))
        # Each entry is a (classifier key, parameter-grid key) pair into CONFIG.
        for clf in CONFIG['clfs']:
            score_optimization(CONFIG[clf[0]], CONFIG[clf[1]],
                'f1_macro', train_features, tv_labels, test_features, test_labels, task=task)

In [None]:
# Task A with the BERT checkpoints (run_embedding defaults: task='A', model_type='BERT').
run_embedding()

Training Dataset has 3817 sentences.
Test Dataset has 784 sentences.
Running word embedding  method for Task A on BERT


In [7]:
# Task B with the BERT checkpoints.
run_embedding(task='B', model_type='BERT', verbose=False)

Training Dataset has 3817 sentences.
Test Dataset has 784 sentences.
Running word embedding  method for Task B on BERT
The bert-base-uncased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.6s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.3122865450515221


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.1s


Accuracy:0.6020408163265306
Precision:0.2655629139072848
Recall:0.32117258804723353
F1-score:0.29035218414631514



[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Classifier: GaussianNB
Best parameters: {}
Best score: 0.33334208691576284
Accuracy:0.3520408163265306
Precision:0.3655702935387928
Recall:0.41052297523970144
F1-score:0.3276417932849872



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.2s finished


Classifier: LogisticRegression
Best parameters: {'C': 0.15000000000000002, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.4191603574221504
Accuracy:0.6415816326530612
Precision:0.41010885709818634
Recall:0.4273687670809055
F1-score:0.4016495213645721

[LibSVM]Classifier: SVC
Best parameters: {'C': 9, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.42620390564274213
Accuracy:0.6466836734693877
Precision:0.4146476888931776
Recall:0.42534556131533213
F1-score:0.40570721002563037

The bert-base-cased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.3s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.30916635548908966


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.5778061224489796
Precision:0.25124569460390356
Recall:0.29121332439540043
F1-score:0.26578794814943196

Classifier: GaussianNB
Best parameters: {}
Best score: 0.34281433630648817
Accuracy:0.3239795918367347
Precision:0.340660249651279
Recall:0.40176063523184824
F1-score:0.3070308172184094



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.0s finished


Classifier: LogisticRegression
Best parameters: {'C': 0.19, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.41071756282784333
Accuracy:0.5931122448979592
Precision:0.37575315223830075
Recall:0.38380476126922686
F1-score:0.3685366031036099

[LibSVM]Classifier: SVC
Best parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.3944543342516971
Accuracy:0.5956632653061225
Precision:0.3898224964821764
Recall:0.3743694283261698
F1-score:0.35549642126960435

The bert-large-uncased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.1s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.31363534514288016


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.5892857142857143
Precision:0.2568472906403941
Recall:0.30991208167895634
F1-score:0.2803395874568564

Classifier: GaussianNB
Best parameters: {}
Best score: 0.3143430942289653
Accuracy:0.2780612244897959
Precision:0.29769728690635666
Recall:0.32634544037731095
F1-score:0.2543330195141147



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.3s finished


Classifier: LogisticRegression
Best parameters: {'C': 0.14, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.4290042918184061
Accuracy:0.6020408163265306
Precision:0.4763340018818403
Recall:0.42712129789979775
F1-score:0.4231064933274708

[LibSVM]Classifier: SVC
Best parameters: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.4217034373308909
Accuracy:0.6160714285714286
Precision:0.5160989552533018
Recall:0.4130050116383276
F1-score:0.39466438003992205

The bert-large-cased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.1s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.2994102925349263


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.3s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.5829081632653061
Precision:0.25223741979061126
Recall:0.2903399422472026
F1-score:0.2648541114058356

Classifier: GaussianNB
Best parameters: {}
Best score: 0.3436632341795341
Accuracy:0.34438775510204084
Precision:0.3093406010867843
Recall:0.3646877110728054
F1-score:0.30558212829457765



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   19.2s finished


Classifier: LogisticRegression
Best parameters: {'C': 0.09, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.4187757828399434
Accuracy:0.6211734693877551
Precision:0.40917812142038945
Recall:0.4193330446704541
F1-score:0.39236836403033587

[LibSVM]Classifier: SVC
Best parameters: {'C': 7, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.42410458111778737
Accuracy:0.6479591836734694
Precision:0.4721850914110152
Recall:0.4417489448010887
F1-score:0.4211106318765043



In [8]:
# Task A with the XLNet checkpoints.
run_embedding(task='A', model_type='XLNet', verbose=False)

Training Dataset has 3817 sentences.
Test Dataset has 784 sentences.
Running word embedding  method for Task A on XLNet
The xlnet-base-cased model's result for task A is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.9s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 10, 'n_estimators': 1000}
Best score: 0.6095089075139171


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.5982142857142857
Precision:0.4936708860759494
Recall:0.5016077170418006
F1-score:0.49760765550239233

Classifier: GaussianNB
Best parameters: {}
Best score: 0.5618199091246512
Accuracy:0.5178571428571429
Precision:0.37992831541218636
Recall:0.3408360128617363
F1-score:0.3593220338983051



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


Classifier: LogisticRegression
Best parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.6019408787702122
Accuracy:0.6364795918367347
Precision:0.5329949238578681
Recall:0.6752411575562701
F1-score:0.5957446808510639

[LibSVM]Classifier: SVC
Best parameters: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.3501222378412452
Accuracy:0.5994897959183674
Precision:0.42105263157894735
Recall:0.02572347266881029
F1-score:0.04848484848484848

The xlnet-large-cased model's result for task A is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.9s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.5968895475186066


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.3s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.6198979591836735
Precision:0.5173333333333333
Recall:0.6237942122186495
F1-score:0.565597667638484

Classifier: GaussianNB
Best parameters: {}
Best score: 0.5273893859028636
Accuracy:0.5076530612244898
Precision:0.33480176211453744
Recall:0.24437299035369775
F1-score:0.2825278810408922

[LibLinear]Classifier: LogisticRegression
Best parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.5923848781906138
Accuracy:0.6352040816326531
Precision:0.5318066157760815
Recall:0.6720257234726688
F1-score:0.59375

[LibSVM]Classifier: SVC
Best parameters: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.34174449416278385
Accuracy:0.610969387755102
Precision:0.7142857142857143
Recall:0.03215434083601286
F1-score:0.061538461538461535



In [9]:
# Task B with the XLNet checkpoints.
run_embedding(task='B', model_type='XLNet', verbose=False)

Training Dataset has 3817 sentences.
Test Dataset has 784 sentences.
Running word embedding  method for Task B on XLNet
The xlnet-base-cased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.0s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.27982804241095466


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.2s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.5969387755102041
Precision:0.2579940509388362
Recall:0.2991414427886351
F1-score:0.2727267267267267

Classifier: GaussianNB
Best parameters: {}
Best score: 0.3360455883158313
Accuracy:0.42346938775510207
Precision:0.3128506961066301
Recall:0.3470097169199043
F1-score:0.3153322992796677



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   15.8s finished


Classifier: LogisticRegression
Best parameters: {'C': 0.03, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.3701743885778144
Accuracy:0.5459183673469388
Precision:0.3906671117540151
Recall:0.3898471002418662
F1-score:0.38215866755481326

[LibSVM]Classifier: SVC
Best parameters: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.1671027386109298
Accuracy:0.6033163265306123
Precision:0.15082908163265307
Recall:0.25
F1-score:0.1881463802704853

The xlnet-large-cased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.2s finished


Classifier: RandomForestClassifier
Best parameters: {'max_depth': 50, 'n_estimators': 1000}
Best score: 0.26043754022936744


[Parallel(n_jobs=56)]: Using backend ThreadingBackend with 56 concurrent workers.
[Parallel(n_jobs=56)]: Done  88 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 338 tasks      | elapsed:    0.1s
[Parallel(n_jobs=56)]: Done 688 tasks      | elapsed:    0.3s
[Parallel(n_jobs=56)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Accuracy:0.5625
Precision:0.2227540500736377
Recall:0.2629621512917032
F1-score:0.23414749845105326

Classifier: GaussianNB
Best parameters: {}
Best score: 0.1957215258379293
Accuracy:0.2002551020408163
Precision:0.2339479683781307
Recall:0.26091005396125344
F1-score:0.15746581716532895

[LibLinear]Classifier: LogisticRegression
Best parameters: {'C': 0.06999999999999999, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.3492917097378613
Accuracy:0.5267857142857143
Precision:0.3704582472992329
Recall:0.35271815174023063
F1-score:0.34529889772017636

[LibSVM]Classifier: SVC
Best parameters: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.1671027386109298
Accuracy:0.6033163265306123
Precision:0.15082908163265307
Recall:0.25
F1-score:0.1881463802704853



In [6]:
def get_best_for_each(name, clf, model_type='BERT', verbose=False):
    """Report Task B scores, including per-class F-scores, for one checkpoint and classifier.

    The Task B data is embedded with the pretrained checkpoint ``name`` and
    the chosen classifier is grid-searched (scoring ``'f1_macro'``) and
    scored with ``best=True`` so the per-class F-scores are printed as well.

    :param name: pretrained checkpoint name, e.g. 'bert-large-uncased'
    :param clf: (classifier key, parameter-grid key) pair of CONFIG keys
    :param model_type: 'BERT' or 'XLNet'
    :param verbose: accepted but currently not used
    :raises ValueError: if ``model_type`` is not supported
    """
    # Fail fast with a clear message; otherwise an unsupported model_type
    # would only surface later as an unbound `tokenizer` variable.
    if model_type not in ('BERT', 'XLNet'):
        raise ValueError(
            "Unsupported model_type {!r}; expected 'BERT' or 'XLNet'.".format(model_type))

    df = pd.read_csv(CONFIG['B_train_path'], delimiter='\t', index_col=0)
    df_test = pd.read_csv(CONFIG['B_test_path'], delimiter='\t', index_col=0)
    
    print('Training Dataset has {} sentences.'.format(df.shape[0]))
    print('Test Dataset has {} sentences.'.format(df_test.shape[0]))
    
    # data preprocessing
    # NOTE: pickle.load can execute arbitrary code - only load a trusted file.
    with open('normalized_sents.pickle', 'rb') as f:
        tv_sents, test_sents = pickle.load(f)
    tv_labels = df['Label'].values
    test_labels = df_test['Label'].values
        
    #initialize tokenizer
    if model_type == 'BERT':
        if 'uncased' in name:
            tokenizer =  BertTokenizer.from_pretrained(name, do_lower_case=True)
        else:
            tokenizer =  BertTokenizer.from_pretrained(name)
    else:
        tokenizer= XLNetTokenizer.from_pretrained(name)
    
    tv_ids, tv_masks = get_ids_mask(tv_sents, tokenizer, max_len=CONFIG['max_len'])
    test_ids, test_masks = get_ids_mask(test_sents, tokenizer, max_len=CONFIG['max_len'])
    tv_ids = torch.tensor(tv_ids)
    tv_masks = torch.tensor(tv_masks)
    test_ids = torch.tensor(test_ids)
    test_masks = torch.tensor(test_masks)

    # initialize model
    if model_type == 'BERT':
        model = BertModel.from_pretrained(name)
    else:
        model = XLNetModel.from_pretrained(name)

    train_features = get_embedding(model_type, tv_ids, tv_masks, model)
    test_features = get_embedding(model_type, test_ids, test_masks, model)

    print("The {} model's result for task {} is:".format(name, 'B'))
    score_optimization(CONFIG[clf[0]], CONFIG[clf[1]],
            'f1_macro', train_features, tv_labels, test_features, test_labels, task='B', best=True)

In [7]:
# Task B scores plus per-class F-scores for logistic regression on bert-large-uncased embeddings.
get_best_for_each('bert-large-uncased', ('clf_LR', 'param_LR'))

Training Dataset has 3817 sentences.
Test Dataset has 784 sentences.
The bert-large-uncased model's result for task B is:


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.


Classifier: LogisticRegression
Best parameters: {'C': 0.14, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.4290042918184061
Accuracy:0.6020408163265306
Precision:0.4763340018818403
Recall:0.42712129789979775
F1-score:0.4231064933274708

[0.7270788912579956, 0.4784688995215311, 0.40579710144927533, 0.08108108108108107]


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.4s finished
