In [None]:
!pip install transformers
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so their vocabularies match.
MODEL_NAME = 'facebook/bart-large-mnli'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
# Build the results table: the gold label and pairID come from the English
# subset, and one prediction column per language is filled in cell by cell.
gold_labels = pd.read_csv('subset_en.csv', sep='\t')
classified_data = pd.DataFrame(columns=['gold_label', 'English', 'German', 'Spanish', 'Swahili', 'Urdu', 'pairID'])
classified_data['gold_label'] = gold_labels['gold_label']
classified_data['pairID'] = gold_labels['pairID']

# Classify English
# Class names indexed by logit position, read from the checkpoint's own config
# instead of being hard-coded. The previous hard-coded order
# (entailment, contradiction, neutral) did not match the model's output order
# (contradiction, neutral, entailment per the model card), so argmax indices
# were translated to the wrong class names.
labels = [model.config.id2label[i].lower() for i in range(model.config.num_labels)]
data = gold_labels

for index, row in data.iterrows():
  # Premise / hypothesis pair for this row
  sentence1 = row['sentence1']
  sentence2 = row['sentence2']

  # Encode the pair as a single padded/truncated sequence of 200 tokens
  encoded_input = tokenizer(
    sentence1,
    sentence2,
    padding="max_length",
    truncation=True,
    max_length=200,
    return_tensors="pt")

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    raw_output = model(**encoded_input)[0]
  predicted_label = raw_output.argmax().item()

  # Add label to classified_data
  classified_data.at[index, 'English'] = labels[predicted_label]

In [None]:
# Classify German to English
# NOTE(review): rows are written into classified_data by this file's index, so
# this assumes the file has the same row order as subset_en.csv - confirm.
data = pd.read_csv('translated_subset_de.tsv', sep='\t')

for index, row in data.iterrows():
  # Translated premise / hypothesis pair for this row
  sentence1 = row['translated_sentence1']
  sentence2 = row['translated_sentence2']

  # Encode the pair as a single padded/truncated sequence of 200 tokens
  encoded_input = tokenizer(
    sentence1,
    sentence2,
    padding="max_length",
    truncation=True,
    max_length=200,
    return_tensors="pt")

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    raw_output = model(**encoded_input)[0]
  predicted_label = raw_output.argmax().item()

  # Add label to classified_data. The class name is read from the model's own
  # config so the logit index is always mapped to the right label; the shared
  # hard-coded list used before was in a different order than the model's
  # outputs.
  classified_data.at[index, 'German'] = model.config.id2label[predicted_label].lower()



In [None]:
# Classify Spanish to English
# NOTE(review): rows are written into classified_data by this file's index, so
# this assumes the file has the same row order as subset_en.csv - confirm.
data = pd.read_csv('translated_subset_es.tsv', sep='\t')

for index, row in data.iterrows():
  # Translated premise / hypothesis pair for this row
  sentence1 = row['translated_sentence1']
  sentence2 = row['translated_sentence2']

  # Encode the pair as a single padded/truncated sequence of 200 tokens
  encoded_input = tokenizer(
    sentence1,
    sentence2,
    padding="max_length",
    truncation=True,
    max_length=200,
    return_tensors="pt")

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    raw_output = model(**encoded_input)[0]
  predicted_label = raw_output.argmax().item()

  # Add label to classified_data. The class name is read from the model's own
  # config so the logit index is always mapped to the right label; the shared
  # hard-coded list used before was in a different order than the model's
  # outputs.
  classified_data.at[index, 'Spanish'] = model.config.id2label[predicted_label].lower()

In [None]:
# Classify Swahili to English
# NOTE(review): rows are written into classified_data by this file's index, so
# this assumes the file has the same row order as subset_en.csv - confirm.
data = pd.read_csv('translated_subset_sw.tsv', sep='\t')

for index, row in data.iterrows():
  # Translated premise / hypothesis pair for this row
  sentence1 = row['translated_sentence1']
  sentence2 = row['translated_sentence2']

  # Encode the pair as a single padded/truncated sequence of 200 tokens
  encoded_input = tokenizer(
    sentence1,
    sentence2,
    padding="max_length",
    truncation=True,
    max_length=200,
    return_tensors="pt")

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    raw_output = model(**encoded_input)[0]
  predicted_label = raw_output.argmax().item()

  # Add label to classified_data. The class name is read from the model's own
  # config so the logit index is always mapped to the right label; the shared
  # hard-coded list used before was in a different order than the model's
  # outputs.
  classified_data.at[index, 'Swahili'] = model.config.id2label[predicted_label].lower()



In [None]:

# Classify Urdu to English
# NOTE(review): rows are written into classified_data by this file's index, so
# this assumes the file has the same row order as subset_en.csv - confirm.
data = pd.read_csv('translated_subset_ur.tsv', sep='\t')

for index, row in data.iterrows():
  # Translated premise / hypothesis pair for this row
  sentence1 = row['translated_sentence1']
  sentence2 = row['translated_sentence2']

  # Encode the pair as a single padded/truncated sequence of 200 tokens
  encoded_input = tokenizer(
    sentence1,
    sentence2,
    padding="max_length",
    truncation=True,
    max_length=200,
    return_tensors="pt")

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    raw_output = model(**encoded_input)[0]
  predicted_label = raw_output.argmax().item()

  # Add label to classified_data. The class name is read from the model's own
  # config so the logit index is always mapped to the right label; the shared
  # hard-coded list used before was in a different order than the model's
  # outputs.
  classified_data.at[index, 'Urdu'] = model.config.id2label[predicted_label].lower()


# Write the combined gold labels and per-language predictions to a TSV file
filename = 'bart_classifications.tsv' 
classified_data.to_csv(filename, sep='\t', index=False)