In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
train_path = '/content/drive/MyDrive/SMU_MITB_NLP/assignment2/train.jsonl'
test_path =  '/content/drive/MyDrive/SMU_MITB_NLP/assignment2/eval.jsonl'

import pandas as pd

# Read the file into a pandas DataFrame
df_train = pd.read_json(train_path,lines = True)
df_test = pd.read_json(test_path,lines = True)

Mounted at /content/drive


In [5]:
df_train.shape

(14912, 5)

In [6]:
df_test.shape

(4261, 4)

In [None]:
df_train.head()

Unnamed: 0,Id,Question,Alternative1,Alternative2,Answer
0,train-1,There is a light rain today. What happened as ...,The roots of many plants are not moistened by ...,Tourists have seen many ripples.,1
1,train-2,Susan wants to buy a restricted pesticide. Wha...,She bought rotenone.,He decided to buy amylase.,1
2,train-3,His parents stopped him. What was the cause of...,The child ran towards hippos.,Mike wanted to have a bird who has the lest nu...,1
3,train-4,The shamans were to start their sacred ceremon...,They disappear immediately after finishing eat...,They moved to a cave.,2
4,train-5,She mainly studied drugs. What was the cause o...,Lily likes the art form of truth.,Susan majored in pharmacology at university.,2


In [None]:
df_test.head()

Unnamed: 0,Id,Question,Alternative1,Alternative2
0,dev-1,The child brought psycho-physical phenomena on...,The woman gave birth to a child.,The baby feels the awareness through physical ...
1,dev-2,Otters enter their new habitat. What happened ...,Otters start looking for abalone for food.,They always live by the water so that they can...
2,dev-3,Lila can find what she wants quickly. What was...,Lila bought several kinds of textbooks.,Lila loves classification of her things.
3,dev-4,The chance she gets flu becomes smaller. What ...,The fish has suffered from the infection.,She has been vaccined.
4,dev-5,He got some rum. What was the cause of this?,The worker fremented some sugar cane with yeast.,Tom went out and want to hunt some cottontails.


In [None]:
df_test.tail()

Unnamed: 0,Id,Question,Alternative1,Alternative2
4256,test-2126,The keeper kept some cougars. What happened as...,Keepers occasionally mistake them for cats.,They drank water similarly in continuous draug...
4257,test-2127,Tom played the quadrille. What happened as a r...,Tom danced with his friends. .,Tom touches the vertebrae.
4258,test-2128,Tom has been having trouble sleeping. What hap...,"Yesterday, the doctor diagnosed him with epile...",He asked the doctor to gave him some depressants.
4259,test-2129,She puts different metals which are in molten ...,Lucy wants to get some alloys.,The experimenter wants to extract some barium.
4260,test-2130,Tom has signed a protocol with the agency. Wha...,The protection of the environment would be eas...,His passing rate in the exam is guaranteed.


In [None]:
print(df_train['Question'][:4])

0    There is a light rain today. What happened as ...
1    Susan wants to buy a restricted pesticide. Wha...
2    His parents stopped him. What was the cause of...
3    The shamans were to start their sacred ceremon...
Name: Question, dtype: object


In [None]:
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.vectors = []

    def add_sentence(self, sentence):
        for word in sentence.split():
            if word not in self.word2idx:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
                self.vectors.append(np.random.randn(100))  # 随机初始化词向量

    def sentence_to_tensor(self, sentence):
        indices = [self.word2idx[word] for word in sentence.split()]
        return torch.tensor(indices, dtype=torch.long)

    def __len__(self):
        return len(self.idx2word)

vocab = Vocabulary()
for _, row in df_train.iterrows():
    vocab.add_sentence(row['Question'])
    vocab.add_sentence(row['Alternative1'])
    vocab.add_sentence(row['Alternative2'])

for _,row in df_test.iterrows():
    vocab.add_sentence(row['Question'])
    vocab.add_sentence(row['Alternative1'])
    vocab.add_sentence(row['Alternative2'])

class SentenceDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab
        self.is_train = 'Answer' in df.columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        question = self.df.iloc[idx]['Question']
        alt1 = self.df.iloc[idx]['Alternative1']
        alt2 = self.df.iloc[idx]['Alternative2']
        id = self.df.iloc[idx]['Id']
        if self.is_train:
          answer = self.df.iloc[idx]['Answer']
        else:
          answer = 0
        question_tensor = self.vocab.sentence_to_tensor(question)
        alt1_tensor = self.vocab.sentence_to_tensor(alt1)
        alt2_tensor = self.vocab.sentence_to_tensor(alt2)

        return id,question_tensor, alt1_tensor, alt2_tensor, answer

train_dataset = SentenceDataset(df_train, vocab)
eval_dataset = SentenceDataset(df_test, vocab)

def collate_fn(batch):
    ids,question_tensors, alt1_tensors, alt2_tensors, answers = [],[], [], [], []
    for id, question_tensor, alt1_tensor, alt2_tensor, answer in batch:
        ids.append(id)
        question_tensors.append(question_tensor)
        alt1_tensors.append(alt1_tensor)
        alt2_tensors.append(alt2_tensor)
        answers.append(answer)

    return ids, \
          torch.nn.utils.rnn.pad_sequence(question_tensors, batch_first=True), \
          torch.nn.utils.rnn.pad_sequence(alt1_tensors, batch_first=True), \
          torch.nn.utils.rnn.pad_sequence(alt2_tensors, batch_first=True), \
          torch.tensor(answers, dtype=torch.long)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
eval_dataloader = DataLoader(eval_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

In [None]:
print(vocab.idx2word[:10])

['There', 'is', 'a', 'light', 'rain', 'today.', 'What', 'happened', 'as', 'result?']


In [None]:
class SimpleRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, h_n = self.rnn(x)
        return h_n.squeeze(0)

embedding_dim = 200
hidden_dim = 256
output_dim = 1
vocab_size = len(vocab)

model = SimpleRNNModel(vocab_size, embedding_dim, hidden_dim, output_dim)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def calculate_accuracy(predictions, labels):
    _, preds = torch.max(predictions, 1)
    correct = (preds == labels).float()
    accuracy = correct.sum() / len(correct)
    return accuracy

for epoch in range(10):  # 训练10个epoch
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    for id,question_tensor, alt1_tensor, alt2_tensor, label in train_dataloader:
        optimizer.zero_grad()

        question_vec = model(question_tensor)
        alt1_vec = model(alt1_tensor)
        alt2_vec = model(alt2_tensor)

        sim1 = torch.cosine_similarity(question_vec.unsqueeze(1), alt1_vec.unsqueeze(1), dim=2).squeeze(1)
        sim2 = torch.cosine_similarity(question_vec.unsqueeze(1), alt2_vec.unsqueeze(1), dim=2).squeeze(1)

        output = torch.stack((sim1, sim2), dim=1)
        label = label - 1  # 标签从1,2转换为0,1

        loss = criterion(output, label)
        acc = calculate_accuracy(output, label)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    epoch_loss /= len(train_dataloader)
    epoch_acc /= len(train_dataloader)

    print(f'Epoch {epoch+1}, Loss: {epoch_loss}, Accuracy: {epoch_acc}')



Epoch 1, Loss: 0.7111449525311845, Accuracy: 0.5028165236051502
Epoch 2, Loss: 0.702076150084418, Accuracy: 0.5203862660944206
Epoch 3, Loss: 0.6888591731286932, Accuracy: 0.5357430257510729
Epoch 4, Loss: 0.6698723162440463, Accuracy: 0.5938841201716738
Epoch 5, Loss: 0.6502413179429624, Accuracy: 0.6174222103004292
Epoch 6, Loss: 0.6123773708795657, Accuracy: 0.6737526824034334
Epoch 7, Loss: 0.5800621869430234, Accuracy: 0.7052038626609443
Epoch 8, Loss: 0.5524639855358797, Accuracy: 0.7314243562231759
Epoch 9, Loss: 0.5298314379456217, Accuracy: 0.7459763948497854
Epoch 10, Loss: 0.5146348428065077, Accuracy: 0.7587848712446352


In [None]:
model.eval()
results = []

with torch.no_grad():
    for ids, question_tensor, alt1_tensor, alt2_tensor,_ in eval_dataloader:
        question_vec = model(question_tensor)
        alt1_vec = model(alt1_tensor)
        alt2_vec = model(alt2_tensor)

        sim1 = torch.cosine_similarity(question_vec.unsqueeze(1), alt1_vec.unsqueeze(1), dim=2).squeeze(1)
        sim2 = torch.cosine_similarity(question_vec.unsqueeze(1), alt2_vec.unsqueeze(1), dim=2).squeeze(1)

        output = torch.stack((sim1, sim2), dim=1)
        _, preds = torch.max(output, 1)

        for id, pred in zip(ids, preds):
            selected_alt = 1 if pred.item() == 0 else 2
            results.append({'ID': id, 'Target': selected_alt})

results_df = pd.DataFrame(results)
results_df.to_csv('test_results.csv', index=False)


In [None]:
print(results_df.shape)

(4261, 2)


In [None]:
from google.colab import files
files.download('test_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>