In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict
import numpy as np

In [22]:
def load_and_rename_data(filename:str) -> pd.DataFrame:
    dat = pd.read_csv(filename)
    dat = dat[dat.lp.str.endswith('en')]
    dat.drop(['src', 'raw', 'annotators', 'domain'], inplace=True, axis=1)
    dat.rename(columns={'lp':'lang', 'ref': 'reference', 'mt':'candidate'}, inplace=True)
    return dat

data22 = load_and_rename_data('wmt_data/2022-da.csv')
data21 = load_and_rename_data('wmt_data/2021-da.csv')
data20 = load_and_rename_data('wmt_data/2020-da.csv')
data20

Unnamed: 0,lang,candidate,reference,score
19410,ps-en,Let's search for the joint points between Hind...,Let's analyse the differences between Hinduism...,-2.125683
19411,ps-en,"By coming home, the waiting room is full of Je...","Upon arrival at home, the waiting room would b...",-1.136993
19412,ps-en,"Adjustment of all the economy, and to eradicat...",The assessment of global economy and eradicati...,-0.148303
19413,ps-en,"This structure is made of group wounds, with t...",This structure is organized into a hierarchy o...,-0.108756
19414,ps-en,"On this night, they published a radio and Bara...",On the same night a Soviet radio station aired...,-1.117219
...,...,...,...,...
240084,zh-en,"On September 27, 27th, Vietnam Shield vs. US D...",The central parity rate of Vietnamese dong aga...,-0.411959
240085,zh-en,With Exchange rate fluctuations at + /-3 today...,Under the circumstance of an exchange rate flu...,0.759167
240086,zh-en,The Bank's buying price and selling bid for rm...,"The bank has a buying rate at 3,211 dong per C...",-2.022257
240087,zh-en,Techcombank of Vietnam (Techcombank) has set t...,Techcombank adjusted the buying and selling ra...,-2.754210


In [16]:
data_older = pd.read_json("bleurtMaster/wmt_data/wmt_all.json", lines=True)
data_older.drop(['source', 'year', 'n_ratings', 'system', 'segment_id', 'raw_rating'], inplace=True, axis=1)
data_older

Unnamed: 0,lang,candidate,reference,score
0,cs-en,"""That's not normal,"" Ferrell told herself.","""'This is not usual,'"" Ferrell recalled saying.",0.153301
1,cs-en,"""That's not normal,"" Ferrellová said to himself.","""'This is not usual,'"" Ferrell recalled saying.",-1.800665
2,cs-en,"""This is not the kind of murder we've gotten u...","""'This isn't the type of murder that we've bec...",1.156997
3,cs-en,"""This is not the type of murder to which we ha...","""'This isn't the type of murder that we've bec...",-0.329594
4,cs-en,"""This isn't the kind of murder we've grown acc...","""'This isn't the type of murder that we've bec...",-0.009791
...,...,...,...,...
247152,zh-en,"Continuous broadcast three years of ""Xiao Sonc...","“Xiaosong Pedia”, which has been broadcast for...",-0.637276
247153,zh-en,"For three years in a row, ""The Talk of Xiaoson...","“Xiaosong Pedia”, which has been broadcast for...",1.058468
247154,zh-en,"For three years in a row, ""Xiaosong Chance"" ha...","“Xiaosong Pedia”, which has been broadcast for...",-1.217945
247155,zh-en,"For three years in a row, Xiao Song Chi-chu ha...","“Xiaosong Pedia”, which has been broadcast for...",-0.851482


In [7]:
#data = pd.concat([data_older, data20, data21, data22])
data = pd.concat([data22, data21])
data = data.sample(frac=1).reset_index(drop=True)
data

Unnamed: 0,lang,candidate,reference,score
0,zh-en,"Hello, how long will it take for the ordered m...","Hello, how much longer will the ordered food a...",0.146530
1,ja-en,The rest is as per your note.,The rest is exactly as your notes say.,-1.484289
2,zh-en,"The computer is very good, especially the cust...",The laptop is good; the customer service staff...,0.920532
3,zh-en,According to the requirements of epidemic prev...,In accordance with the requirements related to...,0.845171
4,ja-en,I made an am radio with homemade capacitors an...,I made an AM radio with a homemade condenser a...,0.563870
...,...,...,...,...
219860,zh-en,Chinese colleges and universities should furth...,Universities and colleges in China shall furth...,0.278676
219861,ja-en,I wonder if there are many people who use redd...,Maybe a few people here use Reddit but also us...,-0.267265
219862,zh-en,"A simple white T-shirt with a short skirt, and...",Fat girls can also try simple white T-shirt wi...,-0.234670
219863,is-en,"""You have the mystery, I need Iceland,"" says a...","""You're right, I need Iceland,"" says a woman s...",-0.804000


In [8]:
frac_flipped_pairs = 0.3
frac_identical = 0.3
indices_flipped = np.random.randn(len(data)) < frac_flipped_pairs
duplicates = data[indices_flipped].copy()
duplicates = pd.DataFrame({'lang': duplicates['lang'], 'candidate': duplicates['reference'],
                           'reference': duplicates['candidate'], 'score': duplicates['score']})
# augment every sample only once -> create diff of dfs
ident = data[~indices_flipped]
indices_cand = np.random.randn(len(ident)) < frac_identical
indices_ref = np.random.randn(len(ident)) > 1 - frac_identical
ident_cand = ident[indices_cand]
ident_ref = ident[indices_ref]
ident_cand = pd.DataFrame({'lang': ['**-en'] * len(ident_cand), 'candidate': ident_cand['candidate'],
                           'reference': ident_cand['candidate'], 'score': [1.0] * len(ident_cand)})
ident_ref = pd.DataFrame({'lang': ['**-en'] * len(ident_ref), 'candidate': ident_ref['reference'],
                           'reference': ident_ref['reference'], 'score': [1.0] * len(ident_ref)})
data_wmt = pd.concat([data, duplicates, ident_cand, ident_ref])
data_wmt = data_wmt.sample(frac=1).reset_index(drop=True)

In [9]:
len(duplicates)

135867

In [10]:
cand_column = "candidate" # "mt"
ref_column = "reference" #"ref"
lab_column = "score"
#df = data_wmt[["mt", "ref", "label"]]
df = data[[cand_column, ref_column, lab_column]]
dataset_unsplit = Dataset.from_pandas(df)  # no need to shuffle (data already shuffled)

# Split dataset into train-eval-test (80-10-10)
# HuggingFace doesn't provide a straightforward way of doing this,
# see https://discuss.huggingface.co/t/how-to-split-main-dataset-into-train-dev-test-as-datasetdict
dataset_train = dataset_unsplit.train_test_split(test_size=0.2)
dataset_eval_test = dataset_train["test"].train_test_split(test_size=0.5)

dataset = DatasetDict(
    train=dataset_train["train"],
    eval=dataset_eval_test["train"],  # not a typo!
    test=dataset_eval_test["test"]
)

print(dataset)

assert dataset_unsplit.num_rows == (dataset["train"].num_rows + dataset["eval"].num_rows + dataset["test"].num_rows)
assert dataset["train"][:10] != dataset["eval"][:10] \
       and dataset["train"][:10] != dataset["test"][:10] \
       and dataset["eval"][:10] != dataset["test"][:10]

DatasetDict({
    train: Dataset({
        features: ['candidate', 'reference', 'score'],
        num_rows: 175892
    })
    eval: Dataset({
        features: ['candidate', 'reference', 'score'],
        num_rows: 21986
    })
    test: Dataset({
        features: ['candidate', 'reference', 'score'],
        num_rows: 21987
    })
})


In [20]:
dataset.save_to_disk("wmt_all_years_augmented")

Flattening the indices:   0%|          | 0/945392 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/945392 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/118174 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/118174 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/118174 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/118174 [00:00<?, ? examples/s]

In [11]:
save_path = "wmt_21_22_augmented/"
dataset['train'].to_json(save_path + "train.json")
dataset['test'].to_json(save_path + "test.json")
dataset['eval'].to_json(save_path + "eval.json")

Creating json from Arrow format:   0%|          | 0/176 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

6838267

In [None]:
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer

timestamp = "2023-02-17_15-02-13"
project_base_path = Path("Guided Research WS22")
negation_dataset = project_base_path / "data/negation_dataset_labeled.tsv"

base_model = "sentence-transformers/all-mpnet-base-v2"
output_model_name = f"{base_model.split('/')[1]}-negation"  # TODO.
model_save_path = str(project_base_path / f"finetuned-models/{timestamp}/{output_model_name}")
sentence_model = SentenceTransformer(model_save_path)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
sentence_model.to(device)

In [None]:
X_train_mt = sentence_model.encode(dataset['train'][cand_column])
X_train_ref = sentence_model.encode(dataset['train'][ref_column])
X_train_vec = np.concatenate([X_train_ref, X_train_mt], axis=1)
print(X_train_vec.shape)

X_eval_mt = sentence_model.encode(dataset['eval'][cand_column])
X_eval_ref = sentence_model.encode(dataset['eval'][ref_column])
X_eval_vec = np.concatenate([X_eval_ref, X_eval_mt], axis=1)

X_test_mt = sentence_model.encode(dataset['test'][cand_column])
X_test_ref = sentence_model.encode(dataset['test'][ref_column])
X_test_vec = np.concatenate([X_test_ref, X_test_mt], axis=1)

In [None]:
wmt_data_folder = "wmt_data_bleurt/"
np.save(wmt_data_folder+"X_train", X_train_vec)
np.save(wmt_data_folder+"X_eval", X_eval_vec)
np.save(wmt_data_folder+"X_test", X_test_vec)

### Data investigation

In [17]:
import pandas as pd

In [19]:
#wmt21 = pd.read_json("wmt_21/train_neg_-1.json", lines=True)
wmt21 = pd.read_json("wmt_21_22_augmented/train_neg_0.json", lines=True)
wmt21

Unnamed: 0,candidate,reference,score
0,I believe they are looking for a pastry chef w...,I think they are looking for a confectioner wh...,0.561256
1,Commercial timber harvesting and hydro-electri...,Commercial timber harvesting and commercial hy...,1.000000
2,He mainly buys trading cards.,He mainly buys collectible cards.,0.588547
3,The YIG filter is tunes by an electromagnet.,An electromagnet then tunes the YIG filter to ...,1.000000
4,If you have a big aquarium pumps with 2 or mor...,"If you own a large aquarium, then a pump with ...",-0.585726
...,...,...,...
198629,In 1954 the school was integrated.,The school was integrated in 1954.,1.000000
198630,"By brand, semiconductor Intel fell 16%.","Looking at individual stocks, semiconductor gi...",0.785644
198631,"If you wish, you can meet your family in Kutna...","If you wish, you can meet your family in Kutna...",0.613079
198632,Members of Bathyotica do have a postfrontal bone.,Members of Bathyotica also lack a postfrontal ...,0.000000


In [28]:
wmt21[(wmt21.score <2) & (wmt21.score > -2)]

Unnamed: 0,candidate,reference,score
0,I give it 2 thumbs down,I don't give it 2 thumbs down,-1.000000
1,He noted that today's weather situation will b...,He noted that the weather today would be deter...,0.588926
2,This is a cult classic that is worth watching ...,This is definitely a cult classic well worth v...,1.000000
3,"A later owner, Charles W. Griffith, added the ...","A later owner, Charles W. Griffith, removed th...",-1.000000
4,Not tasty and the texture wasn't just nasty.,Not tasty and the texture was just nasty.,-1.000000
...,...,...,...
140823,"It is expected that during the day today, Beij...","It is estimated that during the day, Beijing w...",1.390594
140824,The British received horses from this ranch du...,"From this ranch, he supplied the British with ...",1.000000
140825,It is ranked amongst the least successful scho...,It is ranked amongst the most successful schoo...,-1.000000
140826,"The governor added, ""We must express our grati...",The Governor added that we must express our gr...,-0.218797


In [20]:
wmt_bleurt = pd.read_json("bleurtMaster/wmt_data/train_ratings_neg_-1.json", lines=True)
wmt_bleurt

Unnamed: 0,lang,candidate,reference,source,year,n_ratings,system,segment_id,raw_rating,score
0,,"Additionally, Norwegian assimilated a consider...",Norwegian used a lot of traditional Danes expr...,,,,,,,1.000000
1,,No health effects are expected from ingestion ...,No health effects are expected by ingestion of...,,,,,,,1.000000
2,de-en,Dr. Wolfgang Schneider has consulted the Bisch...,"Dr Wolfgang Schneider has, once again, consult...",Dr. Wolfgang Schneider hat extra das Bischofsh...,2015.0,,,384.0,,-0.330765
3,fi-en,"Hull Valioliiga Club, the club has invested a ...",Premier League club Hull invested a record sum...,Valioliigaseura Hull sijoitti seuran ennätyssu...,2015.0,,,325.0,,0.250715
4,,Terrorist tactics usually discourage attacks t...,Terrorist tactics tend to favor attacks that a...,,,,,,,-1.000000
...,...,...,...,...,...,...,...,...,...,...
3797,,"A month after his release, the Seventh Circuit...","A month after his release, the Seventh Circuit...",,,,,,,-1.000000
3798,fi-en,He did an hour towards säälintunnetta Keverniä...,She felt a pang for him and for men in general...,Hän tunti säälintunnetta Keverniä ja miehiä ko...,2015.0,,,126.0,,-0.720867
3799,ru-en,"""I went for two minutes before the terrorists ...",'I left two minutes before the terrorists rush...,"""Я ушла за две минуты до того, как террористы ...",2015.0,,,370.0,,0.142791
3800,cs-en,"Campaigners for women's rights to abortion, to...","Abortion rights campaigners, along with the Am...",Bojovníci za práva žen na potrat společně se S...,2015.0,,,238.0,,-0.079020


In [21]:
print(wmt21.describe())
print(wmt_bleurt.describe())

               score
count  198634.000000
mean        0.197796
std         0.539709
min        -0.999993
25%        -0.156174
50%         0.129354
75%         0.638042
max         1.000000
         year  n_ratings  system   segment_id  raw_rating        score
count  1791.0        0.0     0.0  1791.000000         0.0  3802.000000
mean   2015.0        NaN     NaN   249.252931         NaN    -0.018409
std       0.0        NaN     NaN   143.899267         NaN     0.832993
min    2015.0        NaN     NaN     1.000000         NaN    -1.901818
25%    2015.0        NaN     NaN   126.000000         NaN    -1.000000
50%    2015.0        NaN     NaN   248.000000         NaN    -0.024450
75%    2015.0        NaN     NaN   372.000000         NaN     1.000000
max    2015.0        NaN     NaN   500.000000         NaN     1.315658


## Append negation data to wmt data

In [19]:
## Load bleurt or custom preprocess WMT data
import pandas as pd

load_path = "bleurtMaster/wmt_data/"
wmt_train = pd.read_json(load_path + 'train_ratings.json', lines=True)
wmt_test = pd.read_json(load_path + 'test_ratings.json', lines=True)
wmt_dev = pd.read_json(load_path + 'dev_ratings.json', lines=True)
#load_path = "wmt_21_22_augmented/"
#wmt_train = pd.read_json(load_path + "train.json", lines=True)
#wmt_test = pd.read_json(load_path + "test.json", lines=True)
#wmt_dev = pd.read_json(load_path + "eval.json", lines=True)
num_tr = len(wmt_train)
num_dev = len(wmt_dev)
num_test = len(wmt_test)
print(num_tr, num_test, num_dev)
wmt_data = pd.concat([wmt_dev, wmt_train, wmt_test]).sample(frac=1).reset_index(drop=True)
wmt_data = wmt_data[wmt_data.score >= -1].copy()
wmt_data.drop(columns=['source', 'n_ratings', 'system', 'segment_id', 'raw_rating', 'lang'], inplace=True)
wmt_data

1791 4132 200


Unnamed: 0,candidate,reference,year,score
0,"In previous Olympic games, Gaby Duglasa became...","Last Olympics, Gabby Douglas was a surprise Ol...",2017,-0.356171
1,I also think that the film shows that we have ...,I also think the film shows we have a great se...,2017,0.782293
3,"But, regardless, Fiji would have beaten a 15-m...","But, regardless, Fiji on this form would have ...",2017,0.241079
4,"The first day of the new school year, is tradi...",The first day back at school is traditionally ...,2015,0.049910
5,The usual trajectory of the Olympic railway is...,The most common Olympic vector is straight ahead.,2017,-0.290765
...,...,...,...,...
6118,"And on Sunday, about 150 people came to the To...","And on Sunday in Cardiff, around 150 people ca...",2015,0.206902
6119,"Sultana was 17 years old and, according to his...","Sultana was 17 years old and, according to her...",2017,0.788480
6120,"If used, for example, the'calculus' - the pape...","""Calculus"" is selling for $93.49 used on Chegg...",2015,-0.648367
6121,"Of course, that Darling will not support the m...","Of course, Darling will not support the mandat...",2015,0.868201


In [6]:
# BLEURT wmt db builder
import pandas as pd

wmt_data = pd.read_json("bleurtMaster/wmt_data/wmt_15-17.jsonl", lines=True)
wmt_data = wmt_data[wmt_data.lang.str.endswith('en')]
wmt_data.drop(columns=['source', 'n_ratings', 'system', 'segment_id', 'raw_rating', 'lang'], inplace=True)
wmt_data

Unnamed: 0,candidate,reference,year,score
0,„ the voice in a head said me before ‚ he will...,"""A little voice inside me said, 'You're going ...",2015,-0.630735
1,"A "" general meeting no problem was with it whe...","""A plenary session was held, at which no one h...",2015,-0.709421
2,"""A restaurateur should be meticulous, patient ...","""A restorer has to be diligent, patient and no...",2017,-1.288860
3,"""Although at least some growth is better than ...","""Although some growth is better than no growth...",2015,0.233937
4,"""For tourism in general, LGBT clients are very...","""As tourists, the LGBT clientele is very lucra...",2017,0.739368
...,...,...,...,...
9259,Zhan Qixian believes that so long as today als...,"Zhan Qixian said that, as long as there are pa...",2017,-0.665855
9260,Xiamen xiangan district Park East of the villa...,"Zhang Donghong, a villager of the East Park Vi...",2017,-0.416351
9261,Zhang Huizhen talked about her gold medal afte...,Zhang Huizhen talked about his gold winning af...,2017,-0.351795
9262,Zhang Xiaolong told reporters that his daughte...,Zhang Xiaolong told reporters that her daughte...,2017,0.032072


In [18]:
## Load WMT from homepage download
def load_and_rename_data(filename:str, year:int) -> pd.DataFrame:
    dat = pd.read_csv(filename)
    dat = dat[dat.lp.str.endswith('en')]
    dat.drop(['src', 'raw', 'annotators', 'domain', 'lp'], inplace=True, axis=1)
    dat.rename(columns={'ref': 'reference', 'mt':'candidate'}, inplace=True)
    dat['year'] = year
    return dat

wmt_data = load_and_rename_data('wmt_data/2021-da.csv', 2022)
wmt_data

Unnamed: 0,candidate,reference,score,year
0,Xinhua Commentary: Making Excellent Returning ...,Xinhua Commentary: Outstanding returning rural...,-0.086212,2022
1,Talent is the key to rural revitalization.,Talent is the key to rural revitalization.,0.244415,2022
2,Where can I find talent?,Where is talent found?,-0.995434,2022
3,The group of outstanding returning migrant wor...,Outstanding returning rural migrant workers ar...,0.409728,2022
4,"Hai Chunzuo, Xinhua News Agency",Published by Xinhua News Agency Author: Shang ...,0.492384,2022
...,...,...,...,...
219983,It also replaced red meat with fish and seafood.,She also replaced red meat with fish and seafood.,-0.650321,2022
219984,"Prefers vegetables (except potatoes), whole gr...","She favors vegetables (besides potatoes), whol...",0.356726,2022
219985,"Gagarin is also active in the gym, yoga and ru...","Gagarina also actively works out in the gym, d...",-0.197150,2022
219986,"Earlier, Russian singer Sergei Lazarev told ho...","Earlier, Russian singer Sergey Lazarev told ho...",-0.448912,2022


In [7]:
#neg_data = pd.read_csv('Guided Research WS22/data/negation_dataset_v1.1.tsv', sep='\t') # with antonym data
neg_data = pd.read_csv('Guided Research WS22/data/negation_dataset_labeled.tsv', sep='\t') # without antonym data
neg_data = neg_data.replace({0:1, 1:0}) # original data has target negated, but we need quality estimation
neg_data.rename(columns={'premise':'reference', 'hypothesis':'candidate', 'label':'score'}, inplace=True)
neg_data['year'] = ['neg'] * len(neg_data)
neg_data

Unnamed: 0,reference,candidate,score,year
0,"Hopefully, the director James Cox can turn the...",If James Cox can turn the short into a feature...,1,neg
1,The Model T would be started on an alternator ...,The Model T would be started on battery but th...,0,neg
2,"However, such a relationship is no longer wide...","However, such a relationship is widely accepte...",0,neg
3,"For example, despite bilingual-education laws,...","Despite bilingual education laws, the use of S...",1,neg
4,The front facade involves an already-found Ion...,The front facade features a monumental Ionic o...,0,neg
...,...,...,...,...
68775,Everything wasn't good and tasty!,Everything was good and tasty!,0,neg
68776,"From homicides alone, roughly 150,000 people d...","From all causes, roughly 150,000 people die ar...",0,neg
68777,The Oerlikon GDF Mk.5 35mm gun is currently in...,Regiment Oos Transvaal is currently equipped w...,1,neg
68778,The discovery of Williams's alterations and fo...,The discovery of Williams's alterations and fo...,0,neg


In [8]:
# Split data into train, eval and test
from sklearn.model_selection import train_test_split

def train_eval_test_split(all_data:pd.DataFrame) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    all_train, all_tmp = train_test_split(all_data, train_size=.8)
    all_test, all_eval = train_test_split(all_tmp, train_size=.5)
    return all_train, all_eval, all_test

neg_train, neg_eval, neg_test = train_eval_test_split(neg_data)
wmt_train, wmt_eval, wmt_test = train_eval_test_split(wmt_data)

In [10]:
save_path = 'wmt_negation_data/'

def combine_and_write(wmt_df, neg_df, filename):
    wmt_df = pd.concat([wmt_df, neg_df]).sample(frac=1).reset_index(drop=True)
    wmt_df.to_csv(save_path+filename+'.tsv', sep='\t')
    wmt_df.to_json(save_path+filename+'.json', lines=True, orient='records')

combine_and_write(wmt_train, neg_train, 'wmt_neg_train')
combine_and_write(wmt_eval, neg_eval, 'wmt_neg_eval')
combine_and_write(wmt_test, neg_test, 'wmt_neg_test')

In [8]:
def combine_and_write(wmt_df, neg_df, filename):
    wmt_df = pd.concat([wmt_df[(wmt_df.score > -1)], neg_df]).sample(frac=1).reset_index(drop=True)
    #wmt_df = pd.concat([wmt_df[(wmt_df.score > -1) & (wmt_df.score < 1)], neg_df]).sample(frac=1).reset_index(drop=True)
    #wmt_df.dropna(inplace=True)
    print(wmt_df)
    #wmt_df.to_json('bleurtMaster/wmt_data/'+filename, lines=True, orient='records')
    #wmt_df.to_json(save_path+filename, lines=True, orient='records')
    wmt_df.to_csv(save_path+filename, sep='\t')

combine_and_write(pd.concat([wmt_train, wmt_test, wmt_dev]), neg_data, 'train_neg_0_antonym_full.tsv')
#combine_and_write(wmt_train, neg_train, 'train_neg_0_antonym_full.json')
#combine_and_write(wmt_test, neg_test, 'test_neg_0_antonym_full.json')
#combine_and_write(wmt_dev, neg_dev, 'eval_neg_0_antonym_full.json')
#combine_and_write(wmt_train, neg_train, 'train_ratings_neg_-1_w_empty.json')
#combine_and_write(wmt_test, neg_test, 'test_ratings_neg_-1.json')
#combine_and_write(wmt_dev, neg_dev, 'dev_ratings_neg_-1.json')

      lang                                          candidate  \
0      NaN  Branches allow for parts of software to be dev...   
1      NaN                                 He is unavailable.   
2      NaN  Gutman attended Newark Academy, where he was u...   
3      NaN  He graduated from New York University School o...   
4      NaN  Some authors have compared the Kirtimukha myth...   
...    ...                                                ...   
83156  NaN  In some cases, such as Kenya, this has led to ...   
83157  NaN  The hospital was later used by the American fo...   
83158  NaN  This provided a major improvement in the fidel...   
83159  NaN  In the second quarter of 1983, Llanidloes was ...   
83160  NaN  Potentially mineable resources include granite...   

                                               reference source  year  \
0      Parts of the software can be developed in para...    NaN   NaN   
1                                   He is not available.    NaN   NaN   


In [6]:
import pandas as pd

def ratio_in_split(split):
    d = pd.read_csv(f"wmt_negation_data/wmt_neg_{split}.tsv", sep='\t')
    num_neg = len(d[d.year.str.contains("neg")])
    num_wmt = len(d[d.year.str.contains('20')])
    num_all = len(d)
    print(num_neg, num_wmt,num_all, num_neg/num_all, num_wmt/num_all)
ratio_in_split("train")
ratio_in_split("test")

55024 7411 62435 0.8813005525746777 0.11869944742532233
6878 926 7804 0.8813429010763711 0.11865709892362891
