In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [2]:
from sentence_transformers import SentenceTransformer, losses
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm_notebook
import copy

torch.manual_seed(0)

base_dir = '/content/drive/MyDrive/SemEval/'
train_data_dir = base_dir + 'train_data.csv'
val_data_dir = base_dir + 'val_data.csv'

In [3]:
from google.colab import drive
from google.colab import userdata
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
class TextTrainDataset(Dataset):
  def __init__(self, df):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    question = self.df.iloc[idx]['question']
    answer = self.df.iloc[idx]['answer']
    distractor = self.df.iloc[idx]['distractor']

    return question, answer, distractor


class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    self.embedding = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device=device)
    self.linear1 = nn.Linear(768, 64, device=device)
    self.linear2 = nn.Linear(768*2, 64, device=device)
    self.tanh = nn.Tanh()

  def embed(self, question, answer, distractor):
    question = torch.from_numpy(model.embedding.encode(question)).to(device)
    answer = torch.from_numpy(model.embedding.encode(answer)).to(device)
    distractor = torch.from_numpy(model.embedding.encode(distractor)).to(device)

    return question, answer, distractor

  def forward(self, question, answer, distractor):
    question = question.to(device)
    answer = answer.to(device)
    distractor = distractor.to(device)

    anchor = question
    positive = torch.cat((question, answer), dim=1)
    negative = torch.cat((question, distractor), dim=1)

    anchor = self.tanh(self.linear1(anchor).to(device))
    positive = self.tanh(self.linear2(positive).to(device))
    negative = self.tanh(self.linear2(negative).to(device))

    return anchor, positive, negative

In [6]:
def make_dataset(df):
  new_df = pd.DataFrame(columns=['question', 'answer', 'distractor'])
  for i, row in df.iterrows():
    q = row['question']
    ans = row['answer']
    distractor1 = row['distractor1']
    distractor2 = row['distractor2']
    temp = pd.DataFrame({
        'question': [q, q],
        'answer': [ans, ans],
        'distractor': [distractor1, distractor2]
    })
    new_df = pd.concat([new_df, temp], ignore_index=True)

  return TextTrainDataset(new_df)

In [19]:
def forward(model, batch, loss_fn, mode='train'):
  if mode == 'train':
    model.train()
  else:
    model.eval()
  question, answer, distractor = batch
  question, answer, distractor = model.embed(question, answer, distractor)
  anchor, positive, negative = model(question, answer, distractor)
  loss = loss_fn(anchor, positive, negative)
  return loss

class CosineTripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(CosineTripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)
        positive_sim = cos_sim(anchor, positive)
        negative_sim = cos_sim(anchor, negative)
        # losses = nn.functional.relu(negative_sim - positive_sim + self.margin)
        losses = negative_sim - positive_sim + self.margin
        return losses.mean()


def train(model, loader, val_loader, train_df, val_df, n_epochs):
  loss_fn = CosineTripletLoss(2)
  optimizer = optim.Adam(model.parameters())
  model = model.to(device)
  best_model = copy.deepcopy(model)
  # best_val_acc = -1000
  best_val_loss = 10000
  for epoch in tqdm_notebook(range(n_epochs)):
    model.train()
    total_loss = 0.0
    for i, batch in enumerate(loader):
      # question, answer, distractor = batch
      # question, answer, distractor = model.embed(question, answer, distractor)

      # anchor, positive, negative = model(question, answer, distractor)

      # loss = loss_fn(anchor, positive, negative)
      # total_loss += loss.item() / len(batch)
      loss = forward(model, batch, loss_fn, 'train')
      total_loss += loss.item() / len(batch)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    val_loss = 0.0
    for i, batch in enumerate(val_loader):
      val_loss += forward(model, batch, loss_fn, 'eval').item() / len(batch)
    val_acc = accuracy(model, val_df)
    # if val_acc > best_val_acc:
    #   best_val_acc = val_acc
    #   best_model = copy.deepcopy(model)
    if val_loss < best_val_loss:
      best_val_loss = val_loss
      best_model = copy.deepcopy(model)

    # print(f'Epoch epoch:{epoch+1}, Loss: {total_loss}, Val_Acc: {val_acc}')
    print(f'Epoch epoch:{epoch+1}, Loss: {total_loss}, Val_Loss: {val_loss}, Val_acc: {val_acc}')

  return best_model

In [20]:
def similarity(q, p, n):
  return torch.dist(q, p, p=2) - torch.dist(q, n, p=2)


def inference(model, question, option1, option2, option3):
  model.eval()
  samples = [model.embed(question, option1, option2), model.embed(question, option2, option1),
             model.embed(question, option1, option3), model.embed(question, option3, option1),
             model.embed(question, option2, option3), model.embed(question, option3, option2),
             ]
  options = [option1, option2, option1, option3, option2, option3]
  answers = []
  for i, s in enumerate(samples):
    q, ans, dis = model(torch.reshape(s[0], (1,) + s[0].shape), torch.reshape(s[1], (1,) + s[1].shape), torch.reshape(s[2], (1,) + s[2].shape))
    answers.append((i, similarity(q, ans, dis)))
  sorted_options = sorted(answers, key=lambda x:x[1])
  best_ans = sorted_options[0][0]
  return options[best_ans]


def convert_valdf(df):
  new_df = pd.DataFrame(columns=['question', 'answer', 'distractor1', 'distractor2'])
  for i, row in df.iterrows():
    if row['option1'] == row['answer']:
      values = [row['question'], row['answer'], row['option2'], row['option3']]
    elif row['option2'] == row['answer']:
      values = [row['question'], row['answer'], row['option1'], row['option3']]
    elif row['option3'] == row['answer']:
      values = [row['question'], row['answer'], row['option1'], row['option2']]
    new_df.loc[len(new_df)] = values
  return new_df


# def accuracy(model, df):
#   total = 0
#   correct = 0
#   for i, row in df.iterrows():
#     q = row['question']
#     ans = row['answer']
#     dis1 = row['distractor1']
#     dis2 = row['distractor2']
#     pred = inference(model, q, ans, dis1, dis2)
#     if pred == ans:
#       correct += 1
#     total += 1
#   return correct / total * 100
def accuracy(model, df):
  total = 0
  correct = 0
  for i, row in df.iterrows():
    q = row['question']
    ans = row['answer']
    dis1 = row['distractor1']
    dis2 = row['distractor2']
    options = [ans, dis1, dis2]
    random.shuffle(options)
    pred = inference(model, q, options[0], options[1], options[2])
    if pred == ans:
      correct += 1
    total += 1
  return correct / total * 100

In [21]:
df = pd.read_csv(train_data_dir)
val_df = pd.read_csv(val_data_dir)
val_df = convert_valdf(val_df)

dataset = make_dataset(df)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(make_dataset(val_df), batch_size=16)
model = Model()
print(model)

Model(
  (embedding): SentenceTransformer(
    (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
    (2): Normalize()
  )
  (linear1): Linear(in_features=768, out_features=64, bias=True)
  (linear2): Linear(in_features=1536, out_features=64, bias=True)
  (tanh): Tanh()
)


In [26]:
print(len(dataset))

1014


In [22]:
model = train(model, dataloader, val_loader, df, val_df, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch epoch:1, Loss: 26.119136055310566, Val_Loss: 3.960465431213379, Val_acc: 65.0
Epoch epoch:2, Loss: 13.830816989143687, Val_Loss: 3.8920491536458335, Val_acc: 70.0
Epoch epoch:3, Loss: 9.936877811948456, Val_Loss: 3.8363499244054156, Val_acc: 68.33333333333333
Epoch epoch:4, Loss: 8.06397553284963, Val_Loss: 3.8308546940485635, Val_acc: 68.33333333333333
Epoch epoch:5, Loss: 7.011919188002744, Val_Loss: 3.842889110247294, Val_acc: 68.33333333333333
Epoch epoch:6, Loss: 6.315824558337527, Val_Loss: 3.5977144638697305, Val_acc: 68.33333333333333
Epoch epoch:7, Loss: 5.361962966620922, Val_Loss: 3.7829339901606236, Val_acc: 66.66666666666666
Epoch epoch:8, Loss: 5.024512420097988, Val_Loss: 3.730200171470642, Val_acc: 66.66666666666666
Epoch epoch:9, Loss: 4.436069389184317, Val_Loss: 3.5299547314643864, Val_acc: 68.33333333333333
Epoch epoch:10, Loss: 4.18694300452868, Val_Loss: 3.6948737502098083, Val_acc: 63.33333333333333


In [23]:
questions = [
    'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?',
    'There are 3 apples for 2 sons and 2 fathers to eat. Each of them get their own apple. How is it numerically possible?',
    'Everyone called him "Batman," but he knew nothing about bats and thought they were disgusting. He still cherished being referred to as Batman! How is this possible?'
]

options = [
    [
        'Some daughters get married and have their own family.',
        'Some brothers were not loved by family and moved away.',
        'Each daughter shares the same brother.',
    ],
    [
        'Two sons shared the same apple.',
        'They are one son, one father and one grandfather.',
        'One father gave his apple to his son.',
    ],
    [
        'He was the star baseball player.',
        'He is afraid others will laugh at him.',
        'He tries to be friendly.'
    ]
]

for i in range(len(questions)):
  print(inference(model, questions[i], options[i][0], options[i][1], options[i][2]))

Each daughter shares the same brother.
They are one son, one father and one grandfather.
He was the star baseball player.


In [24]:
test_dir = base_dir + 'new_test_data_nolabel.csv'
test_df = pd.read_csv(test_dir)

def predict(model, df):
  for i, row in df.iterrows():
    question = row['question']
    option1 = row['option1']
    option2 = row['option2']
    option3 = row['option3']
    prediction = inference(model, question, option1, option2, option3)
    if option1 == prediction:
      id = 0
    elif option2 == prediction:
      id = 1
    elif option3 == prediction:
      id = 2
    df.loc[i, 'pred_id'] = int(id)

  return df

def write_answer_id(df, path):
  with open(path, 'w', encoding='utf-8') as f:
    for i, row in df.iterrows():
      f.write(str(int(row['pred_id']))+'\n')


In [25]:
test_df = predict(model, test_df)
test_df

Unnamed: 0.1,Unnamed: 0,question,option1,option2,option3,option4,pred_id
0,0,"In a small village, two farmers are working in...",The lazy farmer is his mother.,The lazy farmer is not a responsible father as...,The diligent farmer devoted himself to the far...,None of above.,0.0
1,1,Romeo and Juliet are discovered dead on the be...,They were sleeping and scared by the sound of ...,The rumble of the train moved the shelf which ...,Romeo and Juliet are fish. The rumble of the t...,None of above.,2.0
2,2,How many years in your life it happens that be...,In ech leap year.,In the first year of graduation.,It happens every year.,None of above.,0.0
3,3,Who would serve as the team's captain if a cru...,The first officer.,The captain.,The second officer.,None of above.,1.0
4,4,"In one city, 5% of the population has an unlis...",One hundred people.,Ninty-five people.,Five people.,None of above.,0.0
...,...,...,...,...,...,...,...
115,115,A professional fisherman caught 30 fish during...,Two.,One.,Three.,None of above.,1.0
116,116,Bob was working on a project when suddenly int...,He needed a membership to search in google.,He searched the google in a wrong way.,The internet was disconnected so he couldn't s...,None of above.,2.0
117,117,"He has wed numerous women, but never himself. ...",A teacher.,A preacher.,A laywer.,None of above.,1.0
118,118,You walk into a room and see a bed and lie on ...,"Thirty-six, as there are eighteen animals.","Six. The bed's four legs, plus your two legs.","Seventy-two, as there are eighteen animals.",None of above.,0.0


In [27]:
path = base_dir + 'answer_sen.txt'
write_answer_id(test_df, path)

In [None]:
accuracy(model, val_df)

63.33333333333333