In [1]:
# This notebook depends on the bert-sklearn package (imported as `bert_sklearn`).
# Setup and usage instructions: https://github.com/charles9n/bert-sklearn

In [2]:
import pathlib
import pickle

import pandas as pd
import spacy
from bert_sklearn import BertClassifier
from bert_sklearn import load_model
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from tools import processing

# spaCy's small English pipeline. `nlp` is not referenced by any other cell
# visible in this notebook.
nlp = spacy.load("en_core_web_sm")

# A single bert-sklearn classifier with default settings, shared by every
# training run below. Constructing it prints "Building sklearn text
# classifier..."; the recorded training output shows the default checkpoint is
# bert-base-uncased.
model = BertClassifier()

Building sklearn text classifier...


#### Load data

In [3]:
# Corpus sub-folder. The trailing slash matters because every path in this
# notebook is assembled by plain string concatenation.
corpus = "stories/"
train_pickle_path = "../data/" + corpus + "train.pkl"
dataset_df = pd.read_pickle(train_pickle_path)


def _both_context_from(source_column):
    """Build a row function that calls
    processing.determine_context(source_column, row, "both")."""
    def _row_to_context(row):
        return processing.determine_context(source_column, row, "both")
    return _row_to_context


# Two derived text columns, each computed row by row from a different source
# column. "both_masked" is added first, so it is already present on the rows
# passed to the second apply, exactly as in the original ordering.
dataset_df['both_masked'] = dataset_df.apply(_both_context_from("maskedSentWithDet"), axis=1)
dataset_df['both_unmasked'] = dataset_df.apply(_both_context_from("currentSentence"), axis=1)

#### Train BERT classifiers

In [4]:
# Output folder for each fine-tuned classifier, keyed by the dataset_df column
# that supplies the classifier's input text. The "_without_prp" suffix matches
# the targetIsPRP filter applied below.
dFolders = {"targetWithoutDet": "../models/classifiers/" + corpus + "targetWithoutDet_without_prp/",
            "context3wmasked": "../models/classifiers/" + corpus + "context3wmasked_without_prp/",
            "context3w": "../models/classifiers/" + corpus + "context3w_without_prp/"}
# Disabled "with_prp" variants, kept for reference. They must be trained on the
# unfiltered dataset_df. NOTE(review): the original loop list named the last
# column "maskedSentWithDet" while this key says "maskedSentence" -- reconcile
# before re-enabling, since the loop below iterates over the dict keys.
#             "both_masked": "../models/classifiers/" + corpus + "both_masked_with_prp/",
#             "both_unmasked": "../models/classifiers/" + corpus + "both_unmasked_with_prp/",
#             "currentSentence": "../models/classifiers/" + corpus + "currentSentence_with_prp/",
#             "maskedSentence": "../models/classifiers/" + corpus + "maskedSentence_with_prp/"}

# Drop the rows flagged targetIsPRP once, up front. The filter does not depend
# on the column being trained, and binding the result to a new name leaves the
# loaded dataset_df intact, so this cell can be re-run (or switched to the
# with_prp variants) without reloading the pickle.
# `== False` is kept deliberately: it also works if the column has object dtype.
dataset_without_prp_df = dataset_df[dataset_df["targetIsPRP"] == False]  # noqa: E712

# Train one classifier per configured column. Iterating over dFolders keeps the
# column list and the folder mapping from drifting apart.
for col in dFolders:
    print(col)
    model_dir = pathlib.Path(dFolders[col])
    # Path joining avoids the "//bert.bin" produced by concatenating onto a
    # folder string that already ends in "/".
    savefile = model_dir / "bert.bin"
    if savefile.exists():
        # A saved model is treated as a cache hit: skip the multi-hour fine-tune.
        continue
    model_dir.mkdir(parents=True, exist_ok=True)

    X = dataset_without_prp_df[col].tolist()
    y = dataset_without_prp_df["animated"].tolist()
    print(len(X), len(y))

    # Fixed 70/30 split; random_state makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fine-tune the shared classifier. The recorded output prints
    # "Loading bert-base-uncased model..." for every column, so each fit
    # presumably restarts from the pretrained checkpoint -- TODO confirm
    # against bert-sklearn's fit() before relying on run independence.
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)

    # The signature is (y_true, y_pred); passing them the other way round swaps
    # precision and recall. The result is printed because a bare expression
    # inside a loop is never displayed by the notebook.
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    print("macro precision: {:.4f}, recall: {:.4f}, F1: {:.4f}".format(precision, recall, f1))

    # Save the model to disk.
    model.save(str(savefile))
    print("DONE")
    print()

targetWithoutDet
8457 8457
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 5328, validation data size: 591


Training  : 100%|██████████| 167/167 [1:16:20<00:00, 24.03s/it, loss=0.34] 
Validating: 100%|██████████| 74/74 [02:19<00:00,  1.87s/it]

Epoch 1, Train loss: 0.3402, Val loss: 0.2074, Val accy: 93.23%



Training  : 100%|██████████| 167/167 [1:08:30<00:00, 24.71s/it, loss=0.158]
Validating: 100%|██████████| 74/74 [02:20<00:00,  1.81s/it]

Epoch 2, Train loss: 0.1583, Val loss: 0.1821, Val accy: 94.08%



Training  : 100%|██████████| 167/167 [1:23:03<00:00, 31.09s/it, loss=0.117]
Validating: 100%|██████████| 74/74 [02:27<00:00,  1.91s/it]

Epoch 3, Train loss: 0.1174, Val loss: 0.1812, Val accy: 94.08%



Predicting: 100%|██████████| 318/318 [10:19<00:00,  1.53s/it]


DONE

context3wmasked
8457 8457
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 5328, validation data size: 591


Training  : 100%|██████████| 167/167 [1:14:43<00:00, 23.98s/it, loss=0.538]
Validating: 100%|██████████| 74/74 [02:20<00:00,  1.96s/it]

Epoch 1, Train loss: 0.5382, Val loss: 0.4338, Val accy: 81.56%



Training  : 100%|██████████| 167/167 [1:14:41<00:00, 19.51s/it, loss=0.293]
Validating: 100%|██████████| 74/74 [01:52<00:00,  1.44s/it]

Epoch 2, Train loss: 0.2929, Val loss: 0.4820, Val accy: 79.70%



Training  : 100%|██████████| 167/167 [1:07:40<00:00, 20.90s/it, loss=0.151]
Validating: 100%|██████████| 74/74 [01:58<00:00,  1.50s/it]

Epoch 3, Train loss: 0.1509, Val loss: 0.5646, Val accy: 80.54%



Predicting: 100%|██████████| 318/318 [08:55<00:00,  1.24s/it]


DONE

context3w
8457 8457
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 5328, validation data size: 591


Training  : 100%|██████████| 167/167 [1:11:31<00:00, 21.11s/it, loss=0.412]
Validating: 100%|██████████| 74/74 [02:01<00:00,  1.54s/it]

Epoch 1, Train loss: 0.4116, Val loss: 0.2770, Val accy: 90.19%



Training  : 100%|██████████| 167/167 [1:04:52<00:00, 17.04s/it, loss=0.188]
Validating: 100%|██████████| 74/74 [01:35<00:00,  1.25s/it]

Epoch 2, Train loss: 0.1878, Val loss: 0.2363, Val accy: 91.37%



Training  : 100%|██████████| 167/167 [54:57<00:00, 16.45s/it, loss=0.117]
Validating: 100%|██████████| 74/74 [01:34<00:00,  1.19s/it]

Epoch 3, Train loss: 0.1172, Val loss: 0.2457, Val accy: 91.71%



Predicting: 100%|██████████| 318/318 [06:39<00:00,  1.01s/it]


DONE

