In [12]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [48]:
train = pd.read_csv('../dataset/test_answer.csv')
train.head()

Unnamed: 0,context,Answer,Label
0,In the planning of a new district in a townshi...,A.Civic Park is north of the administrative se...,0
1,The company sent three young staff members to ...,"A.20-year-old accountant, 20-year-old salesper...",0
2,"In a traditional Chinese medicine preparation,...",A No dangshen,1
3,"In recent years, graduate entrance examination...","A.If you take an English tutoring class, you c...",1
4,"Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...",A Chen Rui can't speak the Central Plains Mand...,1


In [57]:
def get_prompt(df):
    context = df.context
    answer = df.Answer
    return f"""Given that "{context}", do you think this is logically correct: "{answer}".
Answer 1 if it is correct, otherwise answer 0. Your answer:"""

In [50]:
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [56]:
prompt = get_prompt(train.iloc[0])
print(prompt)
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(**inputs, max_length=100)
output = tokenizer.decode(outputs[0])
print(output)

Given that "In the planning of a new district in a township, it was decided to build a special community in the southeast, northwest, centered on the citizen park.These four communities are designated as cultural area, leisure area, commercial area and administrative service area.It is known that the administrative service area is southwest of the cultural area, and the cultural area is southeast of the leisure area.", do you think this is logically correct: "A.Civic Park is north of the administrative service area".
<pad> no</s>


In [58]:
def get_answer(output):
    for char in output:
        if char in ['0', '1']:
            return int(char)

In [60]:
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')
predictions = []
for i in range(len(train)):
    prompt = get_prompt(train.iloc[i])
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(**inputs)
    output = tokenizer.decode(outputs[0])
    predictions.append(get_answer(output))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [61]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
y_true = train['Label']
y_pred = predictions
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
print(f'F1: {f1}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')

F1: 0.27411167512690354, Accuracy: 0.506896551724138, Precision: 0.5192307692307693, Recall: 0.18620689655172415
