# Train the final model, save it, and predict on the gold standard

## Import paths


In [1]:
import os
import sys
#sys.path.append(os.path.realpath('./AugmentedSocialScientist'))

In [2]:
# Move the working directory one level up (relative to the current one).
# Presumably needed so that `PATHS` and the project packages can be imported — TODO confirm.
# NOTE: this is not idempotent; re-running the cell moves up one more level each time.
os.chdir('../')

Make sure that your current working directory (cwd) is now `ReproducingAugSS/AugmentedSocialScientist/`. Uncomment `os.getcwd()` in the next cell to check.

In [3]:
#os.getcwd()

In [4]:
from PATHS import ENDOEXO_ASS, ENDOEXO_GS, SAVED_MODELS_PATH

## Parameter definitions

In [5]:
# Hyper-parameters with the `_OFF` suffix: number of epochs, sampler, learning
# rate and batch size.
# NOTE(review): presumably these belong to a different task; confirm whether
# they are needed in this notebook.
N_EPOCHS_OFF = 5
SAMPLER_OFF = "random"
LR_OFF = 5e-5
BS_OFF = 32

# Hyper-parameters with the `_ENDOEXO` suffix: number of epochs, sampler,
# learning rate and batch size.
N_EPOCHS_ENDOEXO = 25
SAMPLER_ENDOEXO = "sequential"
LR_ENDOEXO = 1e-5
BS_ENDOEXO = 64

# Whether duplicated rows should be dropped when preparing the data.
DROP_DUPLICATES = True
# Fraction of the data to use (1 = all of it); presumably meant for the
# `percent_of_data` argument of the experiment functions — TODO confirm it is
# actually wired into the run cell.
PERCENT_OF_DATA = 1


In [6]:
import pandas as pd
import numpy as np
from functools import reduce
from operator import add
from TransferSociologist.data import Dataset
from TransferSociologist.models import BertClassifier
from sklearn.metrics import precision_recall_fscore_support


[nltk_data] Downloading package punkt to
[nltk_data]     /pbs/home/r/rshen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:

def prepare_experiment(
    train_path, gs_path, drop_duplicates=False, percent_of_data=1,
    random_state=None,
):
    """Build the training dataset and the gold-standard prediction dataset.

    Parameters
    ----------
    train_path
        Path of the CSV file holding the training annotations.
    gs_path
        Path of the CSV file holding the gold-standard annotations.
    drop_duplicates : bool, default False
        When truthy and the data has an ``is_control`` column (a column named
        ``is_control_1`` is renamed to it), keep a single randomly chosen row
        per distinct ``text`` among the non-gold-standard control rows.
    percent_of_data : float, default 1
        Fraction of the non-gold-standard rows that is kept; forwarded as
        ``frac`` to ``DataFrame.sample``. With the default of 1 every row is
        kept but the rows are shuffled.
    random_state : optional
        Forwarded to both ``DataFrame.sample`` calls so that the
        de-duplication and the subsampling can be reproduced. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(dataset, dataset_pred)``: the encoded training dataset and the
        encoded gold-standard dataset prepared in prediction mode.
    """
    dataset = Dataset()
    dataset.read(
        data_path=train_path, gold_standard_path=gs_path, data_type="csv"
    )
    dataset.df = dataset.df.rename(columns={"is_control_1": "is_control"})

    if drop_duplicates and "is_control" in dataset.df.columns:
        gs = dataset.df[dataset.df.is_gold_standard == True]
        no_gs = dataset.df[dataset.df.is_gold_standard == False]
        # Control rows can repeat the same text: keep one random row per text.
        deduplicated_controls = (
            no_gs[no_gs.is_control == True]
            .groupby(["text"])
            .apply(lambda group: group.sample(1, random_state=random_state))
            .reset_index(drop=True)
        )
        no_gs = pd.concat(
            [deduplicated_controls, no_gs[no_gs.is_control != True]]
        )
        dataset.df = pd.concat([gs, no_gs])

    dataset.task_encode(
        task_type="sentence_classification", bert_model="CamemBert"
    )

    # Subsample the non-gold-standard rows only; gold-standard rows are
    # always kept in full.
    gs = dataset.df[dataset.df.is_gold_standard == True]
    no_gs = dataset.df[dataset.df.is_gold_standard == False]
    no_gs = no_gs.sample(frac=percent_of_data, random_state=random_state)
    dataset.df = pd.concat([no_gs, gs])
    dataset.encode_torch(
        task_type="sentence_classification",
        bert_model="CamemBert",
        random_seed=2018,
    )

    # Second dataset: the gold standard alone, encoded for prediction.
    dataset_pred = Dataset()
    dataset_pred.read(data_path=gs_path, data_type="csv")
    dataset_pred.task_encode(
        task_type="sentence_classification",
        bert_model="CamemBert",
        pred_mode=True,
    )
    dataset_pred.encode_torch(
        task_type="sentence_classification",
        bert_model="CamemBert",
        pred_mode=True,
    )
    return dataset, dataset_pred


def run_experiment(dataset, dataset_pred, batch_size, lr, sampler, nepochs):
    """Train a classifier, predict the gold standard and collect the metrics.

    Parameters
    ----------
    dataset
        Encoded training dataset; its ``train``, ``test`` and
        ``conversion_dict`` attributes are read.
    dataset_pred
        Encoded prediction dataset; its ``pred`` attribute is passed to the
        classifier and its ``df`` is modified in place (the columns
        ``labels_pred``, ``logits`` and ``pred_labels`` are added).
    batch_size, lr, sampler, nepochs
        Training settings forwarded to ``BertClassifier.fit_evaluate``.

    Returns
    -------
    tuple
        ``(perf_dic, dataset_pred, clf)`` where ``perf_dic`` holds the run
        settings, the per-label metrics returned by ``fit_evaluate``
        (``*_tok_*`` keys) and the per-label character-level metrics on the
        gold standard (``*_char_*`` keys); ``clf`` is the fitted classifier.
    """
    clf = BertClassifier()
    # A new seed is drawn for every run; it is recorded in the metrics below.
    random_seed = np.random.randint(2021)

    perfs, _, epoch_best = clf.fit_evaluate(
        dataset.train,
        dataset.test,
        batch_size=batch_size,
        sampler=sampler,
        nepochs=nepochs,
        random_seed=random_seed,
        learning_rate=lr,
    )
    perf_dic = {
        "batch_size": batch_size,
        "lr": lr,
        "sampler": sampler,
        "nepochs": nepochs,
        "best epoch": int(epoch_best),
        "random_seed": int(random_seed),
        "train_size": len(dataset.train[0]),
    }
    # Encoded label id -> label name.
    inv_conv_dict = {
        label_id: name for name, label_id in dataset.conversion_dict.items()
    }
    for i in range(len(perfs[0])):
        j = inv_conv_dict[i]
        perf_dic[f"prec_tok_{j}"] = float(perfs[0][i])
        perf_dic[f"rec_tok_{j}"] = float(perfs[1][i])
        perf_dic[f"F1_tok_{j}"] = float(perfs[2][i])
        perf_dic[f"supp_tok_{j}"] = float(perfs[3][i])

    # Predict the gold standard and decode the predicted ids to label names.
    labels, logits = clf.predict(dataset_pred.pred)
    dataset_pred.df["labels_pred"] = labels
    dataset_pred.df["labels_pred"] = dataset_pred.df["labels_pred"].apply(
        lambda label_id: inv_conv_dict[label_id]
    )
    dataset_pred.df["logits"] = logits
    # One [start, stop, label] triple per row, in the format of `labels`.
    dataset_pred.df["pred_labels"] = dataset_pred.df.apply(
        lambda row: [[row.spans[0], row.spans[1], row.labels_pred]], axis=1
    )
    # Summing lists concatenates them: one list of triples per distinct text.
    cleaned_preds = (
        dataset_pred.df.groupby(dataset_pred.df.text)
        .agg({"pred_labels": "sum"})
        .reset_index()
    )
    preds = pd.merge(
        cleaned_preds,
        dataset_pred.df[["text", "labels"]].drop_duplicates(),
        left_on="text",
        right_on="text",
    )

    # Per-character label vectors, initialised to 0 and filled span by span.
    # NOTE(review): characters covered by no span keep the value 0, which is
    # also a class id if `conversion_dict` maps a label to 0 — confirm that
    # the spans always cover the whole text.
    preds["labels_str"] = preds.text.apply(lambda text: [0] * len(text))
    preds["labels_pred_str"] = preds.text.apply(lambda text: [0] * len(text))
    preds["labels_str"] = preds.apply(
        lambda row: fill_zeros(
            row.labels, row.labels_str, dataset.conversion_dict
        ),
        axis=1,
    )
    preds["labels_pred_str"] = preds.apply(
        lambda row: fill_zeros(
            row.pred_labels, row.labels_pred_str, dataset.conversion_dict
        ),
        axis=1,
    )
    # Flatten in linear time (repeated list addition is quadratic).
    true = [value for row in preds["labels_str"] for value in row]
    pred = [value for row in preds["labels_pred_str"] for value in row]

    # Pass the label ids explicitly: without `labels=` only the classes that
    # occur in `true`/`pred` are reported, so a missing class would shift the
    # rows and attach metrics to the wrong label name.
    label_ids = sorted(inv_conv_dict)
    perfs_char = precision_recall_fscore_support(
        true, pred, labels=label_ids
    )
    for i, label_id in enumerate(label_ids):
        j = inv_conv_dict[label_id]
        perf_dic[f"prec_char_{j}"] = float(perfs_char[0][i])
        perf_dic[f"rec_char_{j}"] = float(perfs_char[1][i])
        perf_dic[f"F1_char_{j}"] = float(perfs_char[2][i])
        perf_dic[f"supp_char_{j}"] = float(perfs_char[3][i])
    return perf_dic, dataset_pred, clf


def fill_zeros(labels, zeros, conv_dict):
    """Write encoded span labels into a per-character label vector.

    Parameters
    ----------
    labels
        Iterable of ``(start, stop, label)`` triples, or the string
        representation of such a list (as stored in a CSV cell).
    zeros : list
        Per-character vector to fill; it is modified in place.
    conv_dict : dict
        Mapping from label name to its encoded number.

    Returns
    -------
    list
        ``zeros``, with ``zeros[start:stop]`` set to the encoded label of
        every span. Its length is never changed: spans that run past the end
        of the vector are clamped.

    Raises
    ------
    ValueError, SyntaxError
        If ``labels`` is a string that is not a valid Python literal.
    KeyError
        If a span label is missing from ``conv_dict``.
    """
    # Local import so the function stays self-contained.
    import ast

    if isinstance(labels, str):
        # literal_eval only parses literals; unlike eval it cannot run code.
        labels = ast.literal_eval(labels)

    length = len(zeros)
    for start_span, stop_span, lab in labels:
        number = conv_dict[lab]
        # Clamp to the vector: assigning to a slice that reaches past the end
        # would otherwise lengthen the list.
        start = max(0, min(start_span, length))
        stop = max(start, min(stop_span, length))
        zeros[start:stop] = [number] * (stop - start)
    return zeros


In [8]:
import os
import json
import logging
import sys
from torch.cuda import empty_cache

def process(params, paths, percent_of_data=1):
    """Prepare the datasets for one experiment and run it.

    Parameters
    ----------
    params : dict
        Must provide the keys ``"drop_duplicates"``, ``"batch_size"``,
        ``"lr"``, ``"sampler"`` and ``"nepochs"``.
    paths
        Pair ``(train_path, gs_path)``.
    percent_of_data : default 1
        Forwarded to ``prepare_experiment``.

    Returns
    -------
    Whatever ``run_experiment`` returns.
    """
    training_csv, gold_standard_csv = paths
    train_set, pred_set = prepare_experiment(
        training_csv,
        gold_standard_csv,
        params["drop_duplicates"],
        percent_of_data,
    )
    return run_experiment(
        train_set,
        pred_set,
        params["batch_size"],
        params["lr"],
        params["sampler"],
        params["nepochs"],
    )


In [9]:
# Release cached CUDA memory before starting the training run.
empty_cache()

# Training data and gold standard of the experiment.
tpath = ENDOEXO_ASS
gs_path = ENDOEXO_GS
paths = tpath, gs_path

# Hyper-parameters come from the configuration cell.
params = {
    "batch_size": BS_ENDOEXO,
    "nepochs": N_EPOCHS_ENDOEXO,
    "lr": LR_ENDOEXO,
    "sampler": SAMPLER_ENDOEXO,
    "drop_duplicates": DROP_DUPLICATES,
}
# Use the configured fraction instead of a hard-coded 1, so that changing
# PERCENT_OF_DATA in the configuration cell takes effect here.
percent = PERCENT_OF_DATA

# Experiment name: training file name without "_train" and ".csv".
exp_name = os.path.basename(tpath).replace(
    "_train", "").replace('.csv', '')

# Train, evaluate and predict; keep the metrics, predictions and classifier.
p, dataset_pred, clf = process(params, paths, percent)
p["exp_name"] = exp_name
p["percent_of_data"] = percent

Using gold standard


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'


Training...

  Average training loss: 1.02
  Training epoch took: 0:00:30

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.88      0.46      0.61       149
         1.0       0.57      0.96      0.72       179
         2.0       0.00      0.00      0.00        49

    accuracy                           0.64       377
   macro avg       0.49      0.47      0.44       377
weighted avg       0.62      0.64      0.58       377


Training...

  Average training loss: 0.91
  Training epoch took: 0:00:30

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.65      0.83      0.73       149
         1.0       0.68      0.72      0.70       179
         2.0       0.00      0.00      0.00        49

    accuracy                           0.67       377
   macro avg       0.44      0.51      0.48       377
weighted avg       0.58      0.67      0.62       377


Training...

  Average training loss: 0.76
  Training epoch took: 0:00:30

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.67      0.86      0.75       149
         1.0       0.72      0.75      0.74       179
         2.0       0.00      0.00      0.00        49

    accuracy                           0.69       377
   macro avg       0.46      0.54      0.50       377
weighted avg       0.61      0.69      0.65       377


Training...

  Average training loss: 0.62
  Training epoch took: 0:00:30

Running Validation...
              precision    recall  f1-score   support

         0.0       0.70      0.83      0.76       149
         1.0       0.74      0.79      0.76       179
         2.0       0.83      0.20      0.33        49

    accuracy                           0.73       377
   macro avg       0.76      0.61      0.62       377
weighted avg       0.74      0.73      0.71       377


Training...

  Average training loss: 0.52
  Training epoch took: 0:00:30

Running Validation...
              precision    recall  f1-scor


  Average training loss: 0.11
  Training epoch took: 0:00:30

Running Validation...
              precision    recall  f1-score   support

         0.0       0.74      0.78      0.76       149
         1.0       0.77      0.79      0.78       179
         2.0       0.68      0.51      0.58        49

    accuracy                           0.75       377
   macro avg       0.73      0.69      0.71       377
weighted avg       0.75      0.75      0.75       377


Training...

  Average training loss: 0.10
  Training epoch took: 0:00:30

Running Validation...
              precision    recall  f1-score   support

         0.0       0.75      0.79      0.77       149
         1.0       0.78      0.80      0.79       179
         2.0       0.68      0.53      0.60        49

    accuracy                           0.76       377
   macro avg       0.74      0.70      0.72       377
weighted avg       0.76      0.76      0.76       377


Training...

  Average training loss: 0.10
  Training 

In [10]:
# Save the trained classifier under SAVED_MODELS_PATH with the name 'policy_politics_ASS'.
clf.save(os.path.join(SAVED_MODELS_PATH, 'policy_politics_ASS'))

In [11]:
# Display the metrics dictionary of the run (settings plus per-label scores).
p

{'batch_size': 64,
 'lr': 1e-05,
 'sampler': 'sequential',
 'nepochs': 25,
 'best epoch': 14,
 'random_seed': 423,
 'train_size': 2357,
 'prec_tok_endogène': 0.7283950617283951,
 'rec_tok_endogène': 0.7919463087248322,
 'F1_tok_endogène': 0.7588424437299035,
 'supp_tok_endogène': 149.0,
 'prec_tok_exogène': 0.7784090909090909,
 'rec_tok_exogène': 0.7653631284916201,
 'F1_tok_exogène': 0.771830985915493,
 'supp_tok_exogène': 179.0,
 'prec_tok_autre': 0.6666666666666666,
 'rec_tok_autre': 0.5306122448979592,
 'F1_tok_autre': 0.5909090909090909,
 'supp_tok_autre': 49.0,
 'prec_char_endogène': 0.7209515096065874,
 'rec_char_endogène': 0.7971498944405349,
 'F1_char_endogène': 0.7571383812010445,
 'supp_char_endogène': 22736.0,
 'prec_char_exogène': 0.8180124410253328,
 'rec_char_exogène': 0.8008060720169452,
 'F1_char_exogène': 0.8093178135545347,
 'supp_char_exogène': 33992.0,
 'prec_char_autre': 0.5477815699658704,
 'rec_char_autre': 0.279454439930354,
 'F1_char_autre': 0.3700999231360491