In [1]:
# Load the saved best model and prepare the sentence data for evaluation (no W&B hyperparameter sweep is run in the cells shown here)
import logging
import random

import pandas as pd
import sklearn
from simpletransformers.classification import (
    ClassificationArgs1,
    ClassificationModel1,
)
from sklearn.metrics import f1_score

# The logger named "transformers" only reports warnings and above ...
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# ... while the root logger is configured at INFO level.
logging.basicConfig(level=logging.INFO)

# Load all sentences. The `BIO*` columns and the original `labels` column are
# dropped, and `bi_labels` takes over the name `labels`.
df = (
    pd.read_csv('all_sent.csv')
    .drop(columns=['BIO', 'BIO_1', 'BIO_2', 'labels'])
    .rename(columns={'bi_labels': 'labels'})
)

# `title` is "<main_heading>: <heading>". When the heading is missing or just
# repeats the main heading, the main heading alone is used; any title that is
# still missing after that becomes the empty string.
heading_adds_nothing = df['heading'].isna() | (df['main_heading'] == df['heading'])
df['title'] = (
    (df['main_heading'] + ': ' + df['heading'])
    .mask(heading_adds_nothing, df['main_heading'])
    .fillna('')
)

# Start from the ClassificationArgs1 defaults and override the options below.
model_args = ClassificationArgs1()

# Reproducibility and numeric precision.
model_args.manual_seed = 1
model_args.fp16 = False

# Input handling.
model_args.do_lower_case = True  # when using uncased model
model_args.reprocess_input_data = True
model_args.use_multiprocessing = True
# NOTE(review): `normalize_ofs` is specific to this project's ClassificationArgs1;
# its meaning is not visible from this notebook — confirm against the class.
model_args.normalize_ofs = True

# Output handling.
model_args.overwrite_output_dir = True

# Build the "bert"-type classifier from "../sent/Bcoutputs/best_model"
# (presumably a previously saved checkpoint directory — confirm).
model = ClassificationModel1(
    "bert", "../sent/Bcoutputs/best_model", args=model_args
)

In [2]:
# Evaluate the model on every row of `df`. The extra keyword argument adds a
# metric; the recorded output reports it under the key 'F1_score'.
# `f1_score` is imported by name (see the import cell) instead of being reached
# as `sklearn.metrics.f1_score`: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` submodule has been loaded.
# NOTE(review): the recorded run shows tp=0 and fn=0, i.e. no positive gold
# labels in `df['labels']`, so F1/MCC are degenerate there (hence the MCC
# division warning) — presumably this cell is mainly used to obtain
# `model_outputs`; confirm.
result, model_outputs, wrong_predictions = model.eval_model(
    df,
    F1_score=f1_score,
)

INFO:simpletransformers.classification.classification_model1: Converting to features started. Cache is not used.


  0%|          | 0/33800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4225 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model1:{'mcc': 0.0, 'tp': 0, 'tn': 30147, 'fp': 3653, 'fn': 0, 'F1_score': 0.0, 'eval_loss': 0.4854330243142856}


In [3]:
# Sentences that are masked out are forced to be negative: the arg-max class
# index of each row is multiplied by the row's `mask` value
# (assumes `mask` holds 0 for masked-out rows and 1 otherwise — TODO confirm).
mask = df['mask'].values
predictions = model_outputs.argmax(axis=1) * mask

# Keep the sentences that are still predicted positive; they are the input
# for subtask 2.
pos = df[predictions == 1]
pos.to_csv('pos_sent.csv', index=False)