In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/132.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m122.9/132.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [None]:
from sentence_transformers import SentenceTransformer
from transformers import GPT2Model, GPT2Tokenizer
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tqdm.notebook import tqdm_notebook

base_dir = '/content/drive/MyDrive/SemEval/'
train_data_dir = base_dir + 'train_data.csv'
val_data_dir = base_dir + 'val_data.csv'
target_train_data_dir = base_dir + 'pred_train_data.csv'
target_val_data_dir = base_dir + 'pred_val_data.csv'
x_train_tensor_dir = base_dir + 'x_train.pt'
y_train_tensor_dir = base_dir + 'y_train.pt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from google.colab import drive
from google.colab import userdata
drive.mount('/content/drive')

Mounted at /content/drive


In [42]:
# from transformers import AutoTokenizer, FalconForCausalLM


# class Embedding:
#   def __init__(self):
#     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
#     self.model = FalconForCausalLM.from_pretrained("sentence-transformers/all-mpnet-base-v2").to(self.device)

#   def encode(self, sentence):
#     inputs = self.tokenizer(sentence, return_tensors='pt').to(self.device)
#     outputs = self.model(**inputs)
#     return torch.mean(outputs[0], dim=1).reshape(768).cpu().detach().numpy()

# embedding_model = Embedding()
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [43]:
def make_dataset(data):
  x_train = []
  y_train = []
  for idx, row in tqdm_notebook(data.iterrows()):
    q_emb = torch.from_numpy(embedding_model.encode(row['question'])).to(torch.float32)
    ans_emb = torch.from_numpy(embedding_model.encode(row['answer'])).to(torch.float32)
    dis1_emb = torch.from_numpy(embedding_model.encode(row['distractor1'])).to(torch.float32)
    dis2_emb = torch.from_numpy(embedding_model.encode(row['distractor2'])).to(torch.float32)
    x = torch.stack([q_emb, ans_emb], dim=0)
    x_train.append(x)
    y_train.append(torch.tensor([1], dtype=torch.float32))

    x = torch.stack([q_emb, dis1_emb], dim=0)
    x_train.append(x)
    y_train.append(torch.tensor([0], dtype=torch.float32))

    x = torch.stack([q_emb, dis2_emb], dim=0)
    x_train.append(x)
    y_train.append(torch.tensor([0], dtype = torch.float32))

  return torch.stack(x_train, dim=0), torch.tensor(y_train)


def evaluate(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x_batch, y_batch in loader:
          x_batch = x_batch.to(device)
          y_batch = y_batch.to(device)
          y_pred = model(x_batch)
          predicted = (y_pred > 0.5).float()
          total += y_batch.size(0)
          correct += (predicted == y_batch).sum().item()
    return 100 * correct / total


def train(model, loader, epochs):
  loss_fn = nn.BCELoss()
  optimizer = optim.Adam(model.parameters())
  for epoch in tqdm_notebook(range(epochs)):
    model.train()
    total_loss = 0.0
    for x_batch, y_batch in loader:
      x_batch = x_batch.to(device)
      y_batch = y_batch.to(device)
      y_pred = model(x_batch)
      loss = loss_fn(y_pred, y_batch)
      total_loss += loss.item() / len(x_batch)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print("Epoch %d: train loss %.4f acc %.4f" % (epoch+1, total_loss, evaluate(model, loader)))


def inference(model, question, options):
  q_emb = torch.from_numpy(embedding_model.encode(question)).to(torch.float32)
  emb0 = torch.from_numpy(embedding_model.encode(options[0])).to(torch.float32)
  emb1 = torch.from_numpy(embedding_model.encode(options[1])).to(torch.float32)
  emb2 = torch.from_numpy(embedding_model.encode(options[2])).to(torch.float32)
  x = [torch.stack([q_emb, emb0], dim=0), torch.stack([q_emb, emb1], dim=0), torch.stack([q_emb, emb2], dim=0)]
  model.eval()
  x = [torch.reshape(xi, shape=(1,)+xi.shape) for xi in x]
  y = torch.stack([model(xi) for xi in x], dim=0)
  idx = torch.argmax(y)
  answer = options[idx]
  return answer


def add_pred_column(model, df, mode='train'):
  for idx, row in tqdm_notebook(df.iterrows()):
    q = row['question']
    if mode == 'train':
      options = [row['answer'], row['distractor1'], row['distractor2']]
    else:
      options = [row['option1'], row['option2'], row['option3']]
    random.shuffle(options)
    pred = inference(model, q, options)
    df.loc[idx, 'prediction'] = pred
  return df

def accuracy(model, df):
  total = 0.0
  correct = 0.0
  for idx, row in tqdm_notebook(df.iterrows()):
    pred = row['prediction']
    if pred == row['answer']:
      correct += 1
    else:
      print(idx, ': ', row['question'], ':\n')
      print('\ta. ', row['answer'])
      print('\tp. ', pred)
    total += 1
  return correct / total * 100

In [44]:
csv_data = pd.read_csv(train_data_dir)

x_train, y_train = make_dataset(csv_data)
# x_train, y_train = torch.load(x_train_tensor_dir).to(device), torch.load(y_train_tensor_dir).to(device)

0it [00:00, ?it/s]

In [45]:
y_train = torch.reshape(y_train, shape=(y_train.shape[0], 1))
x_train = torch.reshape(x_train, shape=(1521, 2, 768))
x_train.shape, y_train.shape

(torch.Size([1521, 2, 768]), torch.Size([1521, 1]))

In [None]:

torch.save(x_train, x_train_tensor_dir)
torch.save(y_train, y_train_tensor_dir)

In [46]:
class Model(nn.Module):
  def __init__(self):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.linear1 = nn.Linear((768)*2, 1, device=self.device).to(self.device)
        # self.classifier = nn.Lin
        self.sigmoid = nn.Sigmoid()
  def forward(self, x):
    x1, x2 = x[:, 0].to(self.device), x[:, 1].to(self.device)
    x = torch.cat((x1, x2), dim=1)
    x = self.linear1(x).to(self.device)
    x = self.sigmoid(x)
    return x

In [47]:
loader = data.DataLoader(data.TensorDataset(x_train, y_train), shuffle=True, batch_size=4)
len(loader)

381

In [48]:
model = Model().to(device)
train(model, loader, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1: train loss 61.0272 acc 67.6529
Epoch 2: train loss 56.9397 acc 69.7567
Epoch 3: train loss 54.2114 acc 72.0579
Epoch 4: train loss 51.6550 acc 74.0960
Epoch 5: train loss 49.8337 acc 76.7916
Epoch 6: train loss 48.1145 acc 77.9750
Epoch 7: train loss 46.5203 acc 78.6982
Epoch 8: train loss 45.0858 acc 79.1584
Epoch 9: train loss 44.1896 acc 80.1446
Epoch 10: train loss 43.2994 acc 81.7226


In [49]:
questions = [
    'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?',
    'There are 3 apples for 2 sons and 2 fathers to eat. Each of them get their own apple. How is it numerically possible?',
    'Everyone called him "Batman," but he knew nothing about bats and thought they were disgusting. He still cherished being referred to as Batman! How is this possible?'
]

options = [
    [
        'Each daughter shares the same brother.',
        'Some daughters get married and have their own family.',
        'Some brothers were not loved by family and moved away.'
    ],
    [
        'They are one son, one father and one grandfather.',
        'Two sons shared the same apple.',
        'One father gave his apple to his son.'
    ],
    [
        'He was the star baseball player.',
        'He is afraid others will laugh at him.',
        'He tries to be friendly.'
    ]
]


In [50]:
for i, q in enumerate(questions):
  print(inference(model, q, options[i]))

Each daughter shares the same brother.
They are one son, one father and one grandfather.
He was the star baseball player.


In [53]:
test_dir = base_dir + 'new_test_data_nolabel.csv'
test_df = pd.read_csv(test_dir)

def predict(model, df):
  for i, row in df.iterrows():
    question = row['question']
    option1 = row['option1']
    option2 = row['option2']
    option3 = row['option3']
    prediction = inference(model, question, [option1, option2, option3])
    if option1 == prediction:
      id = 0
    elif option2 == prediction:
      id = 1
    elif option3 == prediction:
      id = 2
    df.loc[i, 'pred_id'] = int(id)

  return df

def write_answer_id(df, path):
  with open(path, 'w', encoding='utf-8') as f:
    for i, row in df.iterrows():
      f.write(str(int(row['pred_id']))+'\n')


In [54]:
test_df = predict(model, test_df)
test_df

Unnamed: 0.1,Unnamed: 0,question,option1,option2,option3,option4,pred_id
0,0,"In a small village, two farmers are working in...",The lazy farmer is his mother.,The lazy farmer is not a responsible father as...,The diligent farmer devoted himself to the far...,None of above.,0.0
1,1,Romeo and Juliet are discovered dead on the be...,They were sleeping and scared by the sound of ...,The rumble of the train moved the shelf which ...,Romeo and Juliet are fish. The rumble of the t...,None of above.,2.0
2,2,How many years in your life it happens that be...,In ech leap year.,In the first year of graduation.,It happens every year.,None of above.,0.0
3,3,Who would serve as the team's captain if a cru...,The first officer.,The captain.,The second officer.,None of above.,1.0
4,4,"In one city, 5% of the population has an unlis...",One hundred people.,Ninty-five people.,Five people.,None of above.,0.0
...,...,...,...,...,...,...,...
115,115,A professional fisherman caught 30 fish during...,Two.,One.,Three.,None of above.,1.0
116,116,Bob was working on a project when suddenly int...,He needed a membership to search in google.,He searched the google in a wrong way.,The internet was disconnected so he couldn't s...,None of above.,0.0
117,117,"He has wed numerous women, but never himself. ...",A teacher.,A preacher.,A laywer.,None of above.,0.0
118,118,You walk into a room and see a bed and lie on ...,"Thirty-six, as there are eighteen animals.","Six. The bed's four legs, plus your two legs.","Seventy-two, as there are eighteen animals.",None of above.,1.0


In [55]:
path = base_dir + 'answer_sen.txt'
write_answer_id(test_df, path)

In [None]:
df = csv_data[['question', 'answer', 'distractor1', 'distractor2']]
df = add_pred_column(model, df)
acc = accuracy(model, df)
df.to_csv(target_train_data_dir)
acc

In [None]:
val_df = pd.read_csv(val_data_dir)
df = add_pred_column(model, val_df, 'val')
acc = accuracy(model, df)
df.to_csv(target_val_data_dir)
acc