# Experiment: ExpM5-bertram-str1

In [1]:
import pandas as pd
pd.options.display.max_colwidth=500

import os
import sys

## Experiment Setup

In [2]:
# Input locations (relative paths)
root_dir = '../../'
base_dir = f'{root_dir}data/magpie/'
data_file = f'{base_dir}processed_with_context_MAGPIE_filtered_split_typebased.csv'

# Idiom -> token table, used to swap each idiom for its single-token representation
idioms_file = f'{root_dir}data/token_files/option1_idioms_bertram.csv'

# NOTE: Ideally, this notebook writes only inside exp_dir.
exp_dir = './'
tmp_dir = f'{exp_dir}tmp/'

In [3]:
# Create the scratch directory if it is missing. exist_ok=True replaces the
# separate isdir() check, so there is no window between checking and creating.
os.makedirs(tmp_dir, exist_ok=True)

In [4]:
# Read the dataset CSV at data_file into a DataFrame
df_data = pd.read_csv(data_file)
# Bare trailing expression: the notebook renders the frame as the cell output
df_data

Unnamed: 0,sentence,idiom,confidence,label,split,variant_type,offsets
0,"Please , can we close the doggy postbag for now ! Remember that RUNNING is looking for all kinds of safety tips . For example , with fell running and mountain marathons gaining in popularity , how about some ideas for safe running <START> off the beaten track <END> ? FITNESS CLINIC DIET",off the beaten track,1.000000,i,training,identical,"[[117, 120], [125, 131], [132, 137]]"
1,"But it 's a selfish family , I 'd say . They take what they want . I 'd keep him well <START> in the running <END> . Then of course there 's Desmond 's wife — I 'd forgotten her . I did n't get much of an impression of her .",in the running,0.770109,i,training,identical,"[[19, 21], [26, 33]]"
2,"And I looked behind , and he was just sitting there staring like that . Oh my god . He <START> gives me the creeps <END> , so I looked round , hmm hmm . I mean , what is she doing ? What does she want ?",give someone the creeps,1.000000,i,training,combined-inflection,"[[3, 8], [9, 11], [16, 22]]"
3,"Especially this year . Makes me hair stand on end just thinking about it.’ ‘ He 's <START> done us proud <END> , as well,’ says Granville . ‘ He had the chance to go to the States with them but he said , ‘ No’ . Other commitments.’",do someone proud,1.000000,i,training,combined-inflection,"[[8, 12], [13, 15], [16, 21]]"
4,"Rather , what is needed most is a new way of thinking – new “ software ” ( though effective “ hard ” green technologies also are essential ) . As we saw in the postcommunist world , changing attitudes is often the hardest problem of all . People quickly embraced formal democracy , but the tolerance and compromise that is at the heart of the democratic process took time to <START> take root <END> .",take root,1.000000,i,training,identical,"[[136, 140], [141, 145]]"
...,...,...,...,...,...,...,...
48390,"The running of multiple sessions has been another means of saving which , though not always a problem , has led to truncated teaching sessions , late - coming and absenteeism on the part of teachers and pupils . Teacher absenteeism is also exacerbated by general shortages of goods and services : teachers , like other employees , can spend half their days chasing scarce consumer goods for their families , instead of being in the classroom . Many also have second or third jobs to <START> make ...",make ends meet,0.854973,i,test,identical,"[[39, 43], [44, 48], [49, 53]]"
48391,"you understand the process ? Yeah , yeah Take people to objections , take them to where you want them to be and <START> bear in mind <END> you 're always looking for an objection Yeah right , another thing , we wanna get more quotes , right",bear in mind,1.000000,i,training,identical,"[[71, 75], [76, 78], [79, 83]]"
48392,"The same implications attach to the playing of games or the membership of clubs and so on , although what is of even more interest are the ' unwritten ' rules which underwrite the more formal , quasi - legal , ones . Without unwritten rules civilised life would be impossible . Indeed we are rarely aware of them <START> as rules <END> , until they are broken , since they are typical of the settings in which we received our moral training . Many were originally instinctive and , to that limite...",as a rule,1.000000,l,training,deletion-determiner,"[[35, 37], [38, 43]]"
48393,"A manufacturer can work closely with its suppliers , co - operating on the development of new components , for instance . It is like being part of the same company , but without the drawbacks . Unlike in a firm that is a <START> jack of all trades <END> , the supplier is an independent business subject to market disciplines rather than another bit of a big bureaucracy . From the supplier 's point of view , the relationship is better than simply one based on contracts , price and open bidding...",jack of all trades,1.000000,i,training,identical,"[[27, 31], [32, 34], [35, 38], [39, 45]]"


In [5]:
# Convert the string representation of 'offsets' into a real Python list.
import ast

# Only values that are still strings are parsed, so this cell can be re-run:
# ast.literal_eval raises ValueError when handed an already-parsed list.
# Non-string values are passed through unchanged.
df_data['offsets'] = df_data['offsets'].map(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [6]:
# Column names of df_data
columns = [
    'sentence',
    'idiom',
    'confidence',
    'label',
    'split',
    'variant_type',
    'offsets',
]

## Replace idioms with their Single Token Representation

In [7]:
# Build the <idiom phrase -> token> lookup from the idioms file
df_idioms = pd.read_csv(idioms_file).set_index('idiom')
IDIOM_TOKEN_DICT = df_idioms['idiom_token'].to_dict()
IDIOM_TOKEN_DICT

{'off the beaten track': '<BERTRAM:IDoffthebeatentrackID>',
 'in the running': '<BERTRAM:IDintherunningID>',
 'give someone the creeps': '<BERTRAM:IDgivesomeonethecreepsID>',
 'do someone proud': '<BERTRAM:IDdosomeoneproudID>',
 'take root': '<BERTRAM:IDtakerootID>',
 'clean house': '<BERTRAM:IDcleanhouseID>',
 'make history': '<BERTRAM:IDmakehistoryID>',
 'go all the way': '<BERTRAM:IDgoallthewayID>',
 'chapter and verse': '<BERTRAM:IDchapterandverseID>',
 'break the bank': '<BERTRAM:IDbreakthebankID>',
 'head for the hills': '<BERTRAM:IDheadforthehillsID>',
 'in a fog': '<BERTRAM:IDinafogID>',
 'bring up the rear': '<BERTRAM:IDbringuptherearID>',
 'in the hole': '<BERTRAM:IDintheholeID>',
 'true to form': '<BERTRAM:IDtruetoformID>',
 'rags to riches': '<BERTRAM:IDragstorichesID>',
 'on the ball': '<BERTRAM:IDontheballID>',
 'stake a claim': '<BERTRAM:IDstakeaclaimID>',
 'up for grabs': '<BERTRAM:IDupforgrabsID>',
 'up and running': '<BERTRAM:IDupandrunningID>',
 'behind bars': '<BERT

In [8]:
# Map the idiom phrase to its token, then replace the phrase in the 'sentence' column
def map_and_replace_by_idiom_token(row):
    """Return the row's sentence with the marked idiom span replaced by its token.

    The text between the first ``<START>`` marker and the first ``<END>``
    marker is swapped for the token that ``IDIOM_TOKEN_DICT`` maps
    ``row['idiom']`` to. Both markers are kept, and the token is separated
    from each of them by a single space.

    Args:
        row: Record indexable by 'idiom' and 'sentence' (e.g. a DataFrame row).

    Returns:
        The rewritten sentence string.

    Raises:
        KeyError: If the idiom has no entry in ``IDIOM_TOKEN_DICT``.
        IndexError: If the sentence contains no ``<END>`` marker.
    """
    # Look up the single-token representation of this row's idiom
    token = IDIOM_TOKEN_DICT[row['idiom']]

    sentence = row['sentence']
    before = sentence.split("<START>", 1)[0]
    # maxsplit=1 keeps *everything* after the first <END>; an unbounded split
    # followed by [1] silently dropped any text after a second <END> marker.
    after = sentence.split("<END>", 1)[1]
    return before + "<START> " + token + " <END>" + after

# Row-wise apply (axis=1); overwrites the 'sentence' column with the token-substituted text
df_data['sentence'] = df_data.apply(map_and_replace_by_idiom_token, axis=1)

In [9]:
mirrorwic_cols = ['sentence']
mirrorwic_training_file = f'{tmp_dir}mirrorwic_STR1_sentences_with_context_bertram.txt'

# Write only the sentence column, without a header row or the index
tmp_mirrorwic = df_data.loc[:, mirrorwic_cols]
tmp_mirrorwic.to_csv(mirrorwic_training_file, index=False, header=False)
print(f'Saved the file to {mirrorwic_training_file}')

Saved the file to ./tmp/mirrorwic_STR1_sentences_with_context_bertram.txt


## Prepare & save the train, dev & test sets

In [10]:
# Integer class id for each value of the 'label' column
label_to_id = dict(i=0, l=1)

In [11]:
# Number of rows for each value of the 'split' column
df_data['split'].value_counts()

training       38715
test            4840
development     4840
Name: split, dtype: int64

In [12]:
# Keep only the columns needed for the train/dev/test files
df_tmp = df_data.loc[:, ['sentence', 'label', 'split']]

# One frame per value of the 'split' column
df_train = df_tmp.loc[df_tmp['split'].eq('training')]
df_dev = df_tmp.loc[df_tmp['split'].eq('development')]
df_test = df_tmp.loc[df_tmp['split'].eq('test')]

def clean_df(df):
    """Return a new frame without 'split' and with 'label' mapped through label_to_id.

    The input frame is left unmodified.
    """
    without_split = df.drop(columns=['split'])
    encoded_labels = without_split['label'].map(label_to_id)
    return without_split.assign(label=encoded_labels)

# Run clean_df over each split and rebind the three names to the cleaned frames
df_train, df_dev, df_test = map(clean_df, (df_train, df_dev, df_test))

In [13]:
# Destination files for the three splits, all inside tmp_dir
train_csv = f'{tmp_dir}train.csv'
dev_csv = f'{tmp_dir}dev.csv'
test_csv = f'{tmp_dir}test.csv'

# Write each split without the index column
for split_frame, split_path in (
    (df_train, train_csv),
    (df_dev, dev_csv),
    (df_test, test_csv),
):
    split_frame.to_csv(split_path, index=False)
print(f'Saved the files to {tmp_dir}')

Saved the files to ./tmp/
