In [None]:
!pip install transformers
!pip install transformers torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Load the tokenizer and the sequence-classification model from one and the
# same checkpoint name, so the two can never drift apart.
MODEL_NAME = 'microsoft/deberta-base-mnli'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
# Set up the results table: gold label and pair ID for every sentence pair,
# plus one (initially empty) prediction column per language.
#
# The class names are taken from the checkpoint's own config instead of being
# hard-coded. The model's output index i corresponds to config.id2label[i];
# a hand-written list in any other order silently attaches the wrong name to
# every prediction. Names are lower-cased to keep the spelling that the
# previous hard-coded list used.
labels = [model.config.id2label[i].lower() for i in range(model.config.num_labels)]

# The file is read as tab-separated even though its extension is .csv.
gold_labels = pd.read_csv('subset_en.csv', sep='\t')

classified_data = pd.DataFrame(columns=['gold_label', 'English', 'German', 'Spanish', 'Swahili', 'Urdu', 'pairID'])
classified_data['gold_label'] = gold_labels['gold_label']
classified_data['pairID'] = gold_labels['pairID']

# Classify the original English sentence pairs and store the predicted class
# name in the 'English' column of classified_data.
data = gold_labels
model.eval()  # switch the model to evaluation mode before inference

with torch.no_grad():  # gradients are not needed for prediction
    # itertuples(name=None) yields plain (index, sentence1, sentence2) tuples
    for index, sentence1, sentence2 in data[['sentence1', 'sentence2']].itertuples(name=None):
        # Encode the pair as one two-segment input. No padding is requested:
        # every pair is run as its own batch of one, so there is nothing to
        # pad against. Over-long inputs are still cut to 200 tokens.
        encoded_input = tokenizer(
            sentence1,
            sentence2,
            truncation=True,
            max_length=200,
            return_tensors="pt")

        # Highest-scoring class index for this single pair
        predicted_id = model(**encoded_input).logits.argmax(dim=1).item()

        # Translate the index into a class name via the checkpoint's own
        # mapping; a hand-ordered list can disagree with the model's output
        # order and mislabel every prediction.
        classified_data.at[index, 'English'] = model.config.id2label[predicted_id].lower()


In [None]:
# Classify the German subset after translation into English (read from
# translated_subset_de.tsv) and store the predicted class name in the
# 'German' column of classified_data.
data = pd.read_csv('translated_subset_de.tsv', sep='\t')

with torch.no_grad():  # gradients are not needed for prediction
    # itertuples(name=None) yields plain (index, sentence1, sentence2) tuples
    pair_columns = ['translated_sentence1', 'translated_sentence2']
    for index, sentence1, sentence2 in data[pair_columns].itertuples(name=None):
        # Encode the pair as one two-segment input. No padding is requested:
        # every pair is run as its own batch of one, so there is nothing to
        # pad against. Over-long inputs are still cut to 200 tokens.
        encoded_input = tokenizer(
            sentence1,
            sentence2,
            truncation=True,
            max_length=200,
            return_tensors="pt")

        # Highest-scoring class index for this single pair
        predicted_id = model(**encoded_input).logits.argmax(dim=1).item()

        # Translate the index into a class name via the checkpoint's own
        # mapping; a hand-ordered list can disagree with the model's output
        # order and mislabel every prediction.
        classified_data.at[index, 'German'] = model.config.id2label[predicted_id].lower()

In [None]:
# Classify the Spanish subset after translation into English (read from
# translated_subset_es.tsv) and store the predicted class name in the
# 'Spanish' column of classified_data.
data = pd.read_csv('translated_subset_es.tsv', sep='\t')

with torch.no_grad():  # gradients are not needed for prediction
    # itertuples(name=None) yields plain (index, sentence1, sentence2) tuples
    pair_columns = ['translated_sentence1', 'translated_sentence2']
    for index, sentence1, sentence2 in data[pair_columns].itertuples(name=None):
        # Encode the pair as one two-segment input. No padding is requested:
        # every pair is run as its own batch of one, so there is nothing to
        # pad against. Over-long inputs are still cut to 200 tokens.
        encoded_input = tokenizer(
            sentence1,
            sentence2,
            truncation=True,
            max_length=200,
            return_tensors="pt")

        # Highest-scoring class index for this single pair
        predicted_id = model(**encoded_input).logits.argmax(dim=1).item()

        # Translate the index into a class name via the checkpoint's own
        # mapping; a hand-ordered list can disagree with the model's output
        # order and mislabel every prediction.
        classified_data.at[index, 'Spanish'] = model.config.id2label[predicted_id].lower()

In [None]:
# Classify the Swahili subset after translation into English (read from
# translated_subset_sw.tsv) and store the predicted class name in the
# 'Swahili' column of classified_data.
data = pd.read_csv('translated_subset_sw.tsv', sep='\t')

with torch.no_grad():  # gradients are not needed for prediction
    # itertuples(name=None) yields plain (index, sentence1, sentence2) tuples
    pair_columns = ['translated_sentence1', 'translated_sentence2']
    for index, sentence1, sentence2 in data[pair_columns].itertuples(name=None):
        # Encode the pair as one two-segment input. No padding is requested:
        # every pair is run as its own batch of one, so there is nothing to
        # pad against. Over-long inputs are still cut to 200 tokens.
        encoded_input = tokenizer(
            sentence1,
            sentence2,
            truncation=True,
            max_length=200,
            return_tensors="pt")

        # Highest-scoring class index for this single pair
        predicted_id = model(**encoded_input).logits.argmax(dim=1).item()

        # Translate the index into a class name via the checkpoint's own
        # mapping; a hand-ordered list can disagree with the model's output
        # order and mislabel every prediction.
        classified_data.at[index, 'Swahili'] = model.config.id2label[predicted_id].lower()

In [None]:
# Classify the Urdu subset after translation into English (read from
# translated_subset_ur.tsv) and store the predicted class name in the
# 'Urdu' column of classified_data.
data = pd.read_csv('translated_subset_ur.tsv', sep='\t')

with torch.no_grad():  # gradients are not needed for prediction
    # itertuples(name=None) yields plain (index, sentence1, sentence2) tuples
    pair_columns = ['translated_sentence1', 'translated_sentence2']
    for index, sentence1, sentence2 in data[pair_columns].itertuples(name=None):
        # Encode the pair as one two-segment input. No padding is requested:
        # every pair is run as its own batch of one, so there is nothing to
        # pad against. Over-long inputs are still cut to 200 tokens.
        encoded_input = tokenizer(
            sentence1,
            sentence2,
            truncation=True,
            max_length=200,
            return_tensors="pt")

        # Highest-scoring class index for this single pair
        predicted_id = model(**encoded_input).logits.argmax(dim=1).item()

        # Translate the index into a class name via the checkpoint's own
        # mapping; a hand-ordered list can disagree with the model's output
        # order and mislabel every prediction.
        classified_data.at[index, 'Urdu'] = model.config.id2label[predicted_id].lower()

# Save the results table (gold labels, pair IDs and the per-language
# predictions) as a tab-separated file, leaving out the DataFrame index.
filename = 'deberta_classifications.tsv'
classified_data.to_csv(path_or_buf=filename, index=False, sep='\t')