## BERT model
In this notebook, we format the data according to BERT's format.<br/>
It is necessary to clone Google BERT's git repository, available at: https://github.com/google-research/bert.<br/>
It is also needed to download the base BERT version : cased, 12-layer, 768-hidden, and 12-heads, 110M parameters, available at : https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_1-12.zip.
The model folder should be inside the './bert_ba' directory.

In [17]:
from bert import *
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from pandas import DataFrame
import numpy as np
import os
 
le = LabelEncoder()
 
df = pd.read_csv('./labeled_mcd.csv', sep='\t', index_col=0)
df.index = list(range(df.shape[0]))
y = list(df.pi)

"""Formatting to BERT format"""
df.rename(columns={'text':'sentence'}, inplace=True)
df['alpha'] = pd.Series(['a'] * df.shape[0], index=df.index)
df.rename(columns={'pi':'label'}, inplace=True)
df['label'] = df.label.apply(int)
df['id'] = pd.Series(list(range(df.shape[0])))
df = df[['id','label','alpha','sentence']]

print(df)

"""Retrieving the positive and negative indexes"""
neg_index = [k for k in range(len(y)) if y[k]==0]
pos_index = [k for k in range(len(y)) if y[k]==1]

"""Shuffling both the positive and negative indexes"""
np.random.seed(seed=0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

"""Computing train and test index sets for the chosen number of folds"""
cv = 10

s = len(neg_index)//cv
neg_index_list = [neg_index[k*s:(k+1)*s] for k in range(cv-1)]
neg_index_list.append(neg_index[(cv-1)*s:])
assert(np.sum([len(e) for e in neg_index_list])==len(neg_index)) # Check on negative index completion

s = len(pos_index)//cv
pos_index_list = [pos_index[k*s:(k+1)*s] for k in range(cv-1)]
pos_index_list.append(pos_index[(cv-1)*s:])
assert(np.sum([len(e) for e in pos_index_list])==len(pos_index))# Check on positive index completion

y_tests = []

for k in tqdm(range(cv)):
    """Splitting the data into train and test"""
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    neg_test = neg_index_list[k]
    pos_test = pos_index_list[k]
    neg_train = list(set(neg_index).difference(neg_index_list[k]))
    pos_train = list(set(pos_index).difference(pos_index_list[k]))
    
    pos_dev = pos_train[-1:]
    neg_dev = neg_train[-1:]
    pos_train = pos_train[:-1]
    neg_train = neg_train[:-1]
    
    """Splitting the data frames and saving to BERT format"""
    list_train = pos_train + neg_train
    list_dev = pos_dev + neg_dev
    list_test = pos_test + neg_test

    np.random.shuffle(list_train)
    np.random.shuffle(list_dev)
    np.random.shuffle(list_test)
    
    train_df = df.iloc[list_train]
    dev_df = df.iloc[list_dev]
    test_df = df.iloc[list_test]
    y_tests.append(list(test_df.label))
    print(len(y_tests[-1]))
    
    """Making the directory to save the data if it doesn't exist"""
    if not os.path.exists('./bert_mcd/data_mcd_'+str(k+1)):
        os.mkdir('./bert_mcd/data_mcd_'+str(k+1))
    
    train_df.to_csv('./bert_mcd/data_mcd_{}/train.tsv'.format(str(k+1)), sep='\t', index=False, header=False)
    dev_df.to_csv('./bert_mcd/data_mcd_{}/dev.tsv'.format(str(k+1)), sep='\t', index=False, header=False)
    test_df[['id','sentence']].to_csv('./bert_mcd/data_mcd_{}/test.tsv'.format(str(k+1)), sep='\t', index=False, header=True)

          id  label alpha                                           sentence
0          0      1     a   McDonald 's is trash, in this case I mean lit...
1          1      1     a  Here's What Eating Only  McDonald 's for 10 Da...
2          2      1     a   mcdonald 's can most definitely be good rn ev...
3          3      1     a  Nothing motivates my new diet than getting a p...
4          4      1     a  How does this  McDonald 's not have any burger...
5          5      0     a  Don't do any diettt, I just ate a  McDonald 's...
6          6      1     a  Going on a 1 month  McDonald 's free diet chal...
7          7      0     a  I just ate one fry from  McDonald 's cause I'm...
8          8      0     a  *at the  mcdonald 's drive thru* hi yes can i ...
9          9      1     a  I don't eat  McDonald 's, but they can go to h...
10        10      0     a  christine: this is how you know what casey's d...
11        11      0     a  wow. I wanted  McDonald 's McGriddles so bad s...

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

1-th fold
Splitting the data
2309
2-th fold
Splitting the data
2309
3-th fold
Splitting the data
2309
4-th fold
Splitting the data
2309
5-th fold
Splitting the data
2309
6-th fold
Splitting the data
2309
7-th fold
Splitting the data
2309
8-th fold
Splitting the data
2309
9-th fold
Splitting the data
2309
10-th fold
Splitting the data
2321



In [18]:
"""Training and harvesting performance metrics"""
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

os.chdir("./bert_mcd")
precs = []
recs = []
fs = []
aucs = []
for k in tqdm(range(cv)):
    """Instruction to train, evaluate, and predict results on test data"""
    os.system("python run_classifier.py --task_name=cola --do_train=true --do_eval=true --do-predict=true --data_dir=./data_mcd_{}/ --vocab_file=./cased_L-12_H-768_A-12/vocab.txt --bert_config_file=./cased_L-12_H-768_A-12/bert_config.json --init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt --max_seq_length=128 --train_batch_size=32 --learning_rate=2e-5 --num_train_epochs=10.0 --output_dir=./bert_output/ --do_lower_case=False".format(str(k+1)))

    test_res = pd.read_csv('./bert_output/test_results.tsv',sep='\t', header=None)
    test_res = list(test_res[1])
    test_pred = [int(y>=0.5) for y in test_res]

    y_test = y_tests[k]

    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1_ = f1_score(y_test, test_pred)
    roc_auc = roc_auc_score(y_test, test_res)
    
    precs.append(precision)
    recs.append(recall)
    fs.append(f1_)
    aucs.append(roc_auc)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [19]:
print('Precision : {}+-\{}\nRecall : {}+-{}\nF1 score : {}+-{}\nROC AUC : {}+-{}'.format(np.mean(precs),np.std(precs),
                                                                                         np.mean(recs),np.std(recs),
                                                                                         np.mean(fs),np.std(fs),
                                                                                         np.mean(aucs),np.std(aucs)))

Precision : 0.5471863556185812+-\0.05181507407202481
Recall : 0.47018419693578933+-0.041914893879407514
F1 score : 0.5048652334731056+-0.04163977921804463
ROC AUC : 0.9683533494197031+-0.003961956519343309
