In [2]:
import pandas as pd

In [4]:
# Load the training data; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv('train.csv')

In [8]:
# Peek at the first three rows to check the column layout.
df.head(3)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25


In [10]:
# Summarise the string (object-dtype) columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [16]:
# Build the single text field that is later tokenized: context, target and
# anchor concatenated, each introduced by a fixed tag.
df['input'] = (
    'TEXT1: ' + df['context']
    + '; TEXT2: ' + df['target']
    + '; ANC1: ' + df['anchor']
)

In [18]:
# Check the first few combined input strings.
df.input.head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

In [20]:
#Tokenisation
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Convert the pandas DataFrame into a `datasets.Dataset`.
ds = Dataset.from_pandas(df)

In [24]:
# Display the dataset summary (feature names and row count).
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [60]:
#AutoTokenizer will create a tokenizer appropriate for a given model
from transformers import AutoTokenizer, AutoModel
import tiktoken

In [64]:
# Load the tokenizer and the base model from the same pretrained checkpoint.
# NOTE(review): `model` is rebound to a sequence-classification model in the
# training cell, so this instance looks unused — confirm before removing.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

In [66]:
# Try the tokenizer on a sample sentence to see how it splits text into
# (sub)word tokens.
tokenizer.tokenize("Hi, This is Mey experimenting with this stuff")

['hi',
 ',',
 'this',
 'is',
 'me',
 '##y',
 'experimenting',
 'with',
 'this',
 'stuff']

In [68]:
def tok_func(x):
    """Tokenize the ``"input"`` field of ``x`` with the notebook-level ``tokenizer``.

    ``x`` must support ``x["input"]``; whatever the tokenizer returns for that
    value is returned unchanged.
    """
    text = x["input"]
    return tokenizer(text)

In [70]:
# Tokenize the whole dataset; batched=True hands tok_func batches of rows
# instead of one row at a time.
tok_ds = ds.map(tok_func, batched=True)

Map: 100%|██████████| 36473/36473 [00:00<00:00, 38953.02 examples/s]


In [72]:
# Inspect the first tokenized example: the raw input string next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [101,
  3793,
  2487,
  1024,
  1037,
  22610,
  1025,
  3793,
  2475,
  1024,
  19557,
  18532,
  4765,
  1997,
  10796,
  1025,
  2019,
  2278,
  2487,
  1024,
  19557,
  18532,
  4765,
  102])

In [74]:
# Look up the vocabulary id of the token 'of'.
tokenizer.vocab['of']

1997

In [76]:
# Preparing data with labels: rename the target column 'score' to 'labels',
# the column name the Trainer uses for targets.
# Guarded so that re-running this cell is a no-op instead of failing because
# 'score' has already been renamed.
if 'score' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'score': 'labels'})

In [78]:
# Load the test set and summarise its columns.
eval_df = pd.read_csv('test.csv')
eval_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,hybrid bearing,inorganic photoconductor drum,G02
freq,1,2,1,3


In [84]:
# Split the tokenized training data into train and held-out parts (25% held out,
# fixed seed). The resulting 'test' split is used for evaluation during training;
# it is separate from the rows loaded from 'test.csv'.
dds = tok_ds.train_test_split(0.25, seed=42)

In [86]:
# Show the resulting DatasetDict (split names, columns and row counts).
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [88]:
# Give the test frame the same tagged text field as the training frame,
# then convert it to a Dataset and tokenize it in batches.
eval_df['input'] = (
    'TEXT1: ' + eval_df['context']
    + '; TEXT2: ' + eval_df['target']
    + '; ANC1: ' + eval_df['anchor']
)
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

Map: 100%|██████████| 36/36 [00:00<00:00, 7169.07 examples/s]


In [90]:
def corr_d(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Used as the ``compute_metrics`` callback of the ``Trainer``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, or any object that
            unpacks into exactly those two array-likes. Predictions may
            carry a trailing singleton dimension, e.g. shape ``(n, 1)``.

    Returns:
        dict: ``{'pearson': r}`` with ``r`` as a Python float. ``r`` is
        ``nan`` when either input is constant.
    """
    # Imported locally because numpy is only imported further down the notebook.
    import numpy as np

    predictions, labels = eval_pred
    # Flatten both so (n, 1) predictions line up with (n,) labels.
    predictions = np.asarray(predictions, dtype=float).ravel()
    labels = np.asarray(labels, dtype=float).ravel()
    return {'pearson': float(np.corrcoef(predictions, labels)[0, 1])}

In [102]:
from transformers import TrainingArguments,Trainer

In [104]:
# Training hyperparameters, consumed by TrainingArguments.
bs, epochs = 128, 5  # per-device training batch size, number of training epochs
lr = 8e-5            # learning rate

In [119]:
# Training configuration: output written under 'outputs', fp16 mixed precision,
# cosine learning-rate schedule with a 0.1 warmup ratio, no external reporting.
args = TrainingArguments(
    output_dir='outputs',
    # optimisation schedule
    num_train_epochs=epochs,
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    # batching (evaluation uses twice the training batch size)
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    # runtime
    fp16=True,
    report_to='none',
)

In [123]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a single-output head (num_labels=1), then wire the
# model, training arguments, data splits, tokenizer and metric into a Trainer.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokenizer,
    compute_metrics=corr_d,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model,


In [125]:
# Fine-tune the model; the trailing ';' keeps the return value out of the cell output.
trainer.train();

Step,Training Loss
500,0.0372
1000,0.0126


In [127]:
# Run the trained model on the tokenized test set and keep the raw predictions,
# cast to float.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

array([[ 0.30932617],
       [ 0.58740234],
       [ 0.51367188],
       [ 0.24621582],
       [ 0.09088135],
       [ 0.52392578],
       [ 0.48901367],
       [ 0.00737762],
       [ 0.26098633],
       [ 1.05957031],
       [ 0.16369629],
       [ 0.31298828],
       [ 0.80371094],
       [ 0.80273438],
       [ 0.71582031],
       [ 0.38256836],
       [ 0.35742188],
       [ 0.00855255],
       [ 0.57373047],
       [ 0.36914062],
       [ 0.46484375],
       [ 0.17431641],
       [ 0.18041992],
       [ 0.26049805],
       [ 0.51416016],
       [ 0.01332855],
       [ 0.0107193 ],
       [-0.00344849],
       [ 0.00248337],
       [ 0.70166016],
       [ 0.38476562],
       [ 0.00119114],
       [ 0.74609375],
       [ 0.54882812],
       [ 0.39624023],
       [ 0.27807617]])

In [131]:
# Clamp the predictions to [0, 1]: values below 0 become 0, values above 1 become 1.
import numpy as np
preds = np.clip(preds, 0, 1)
preds

array([[0.30932617],
       [0.58740234],
       [0.51367188],
       [0.24621582],
       [0.09088135],
       [0.52392578],
       [0.48901367],
       [0.00737762],
       [0.26098633],
       [1.        ],
       [0.16369629],
       [0.31298828],
       [0.80371094],
       [0.80273438],
       [0.71582031],
       [0.38256836],
       [0.35742188],
       [0.00855255],
       [0.57373047],
       [0.36914062],
       [0.46484375],
       [0.17431641],
       [0.18041992],
       [0.26049805],
       [0.51416016],
       [0.01332855],
       [0.0107193 ],
       [0.        ],
       [0.00248337],
       [0.70166016],
       [0.38476562],
       [0.00119114],
       [0.74609375],
       [0.54882812],
       [0.39624023],
       [0.27807617]])