In [49]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset, load_dataset, concatenate_datasets, load_from_disk
import pandas as pd

In [50]:
raw_dataset = load_dataset("squad_v2")
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [51]:
raw_dataset["train"][0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [52]:
raw_dataset["train"]["context"][:5]

['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead s

In [100]:
merged_data = concatenate_datasets([raw_dataset["train"], raw_dataset["validation"]])
merged_data

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 142192
})

In [101]:
merged_data = merged_data.remove_columns(["id", "title"])

In [102]:
merged_data.num_rows

142192

In [111]:
df = pd.DataFrame(merged_data["answers"])
df['text'] = df['text'].apply(lambda lst: ' '.join(lst))
df = df.drop(columns=['answer_start'])
df

Unnamed: 0,text
0,in the late 1990s
1,singing and dancing
2,2003
3,"Houston, Texas"
4,late 1990s
...,...
142187,sthène sthène sthène sthène sthène
142188,
142189,
142190,


In [114]:
df['label'] = df['text'].apply(lambda lst: 'SUPPORTED' if lst else 'NEI')
df['question'] = merged_data["question"]
df['document'] = merged_data["context"]
df['claim'] = df['question'] + df['text']
df = df.drop(columns=['question'])
df

Unnamed: 0,text,label,document,claim
0,in the late 1990s,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?in the...
1,singing and dancing,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...
2,2003,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...
3,"Houston, Texas",SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up? H...
4,late 1990s,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?late...
...,...,...,...,...
142187,sthène sthène sthène sthène sthène,SUPPORTED,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...
142188,,NEI,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?
142189,,NEI,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...
142190,,NEI,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?


In [115]:
df[df['label'] == 'SUPPORTED'].count()

text        92749
label       92749
document    92749
claim       92749
dtype: int64

In [116]:
df[df['text'].str.contains(r'\d')].count() # text with number

text        20154
label       20154
document    20154
claim       20154
dtype: int64

In [127]:
import re
from random import randint
def replace_number(match):
    number = int(match.group(0))
    return str(number + randint(1,5))

changed_df = pd.DataFrame(df['text'].apply(lambda text: re.sub(r'\d+', replace_number, text)))
changed_df['question'] = merged_data["question"]
changed_df['document'] = df['document']
changed_df

Unnamed: 0,text,question,document
0,in the late 1991s,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,singing and dancing,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2,2006,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3,"Houston, Texas",In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4,late 1993s,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
...,...,...,...
142187,sthène sthène sthène sthène sthène,What is the seldom used force unit equal to on...,"The pound-force has a metric counterpart, less..."
142188,,What does not have a metric counterpart?,"The pound-force has a metric counterpart, less..."
142189,,What is the force exerted by standard gravity ...,"The pound-force has a metric counterpart, less..."
142190,,What force leads to a commonly used unit of mass?,"The pound-force has a metric counterpart, less..."


In [128]:
print("changed columns:")
df[df['text'] != changed_df['text']].count()

changed columns:


text        20154
label       20154
document    20154
claim       20154
dtype: int64

In [63]:
# import nltk
# nltk.download('stopwords')

In [129]:
from collections import Counter
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    words = re.findall(r'\w+', text.lower()) # Tách từ và chuyển thành chữ thường
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

# Tạo một danh sách các từ từ cột "text" chưa được chỉnh sửa
all_words = df[df['text'] == changed_df['text']]['text'].apply(tokenize_and_remove_stopwords).tolist()

# Đếm tần suất xuất hiện của từ
word_counts = Counter(word for words in all_words for word in words)

# In ra các từ phổ biến
most_common_words = word_counts.most_common(20)
print(pd.DataFrame(most_common_words, columns=('word', 'count')))

          word  count
0          two    726
1          new    683
2          one    629
3        three    564
4       united    506
5          war    486
6        world    447
7       states    442
8         four    441
9        state    417
10          de    407
11         law    395
12        john    394
13      system    385
14     british    384
15        city    369
16  university    365
17       north    360
18    national    358
19  government    352


In [130]:
replacement_word_map = {
    'two': 'three',
    'new': 'last',
    'one': 'two',
    'three': 'four',
    'united': 'united',
    'war': 'peace',
    'world': 'earth',
    'states': 'states',
    'four': 'five',
    'state': 'state',
    'de': 'deeee',
    'law': 'rules',
    'john': 'leo',
    'system': 'structure',
    'british': 'usa',
    'city': 'country',
    'university': 'school',
    'north': 'south',
    'national': 'international',
    'government': 'crown',
}
for common_word,_ in most_common_words:
    changed_df['text'] = changed_df['text'].apply(lambda text: ' '.join([replacement_word_map[word] if word == common_word else word for word in text.split()]))

In [131]:
print("changed columns:")
df[df['text'] != changed_df['text']].count()

changed columns:


text        23369
label       23369
document    23369
claim       23369
dtype: int64

In [132]:
changed_df

Unnamed: 0,text,question,document
0,in the late 1991s,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,singing and dancing,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2,2006,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3,"Houston, Texas",In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4,late 1993s,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
...,...,...,...
142187,sthène sthène sthène sthène sthène,What is the seldom used force unit equal to on...,"The pound-force has a metric counterpart, less..."
142188,,What does not have a metric counterpart?,"The pound-force has a metric counterpart, less..."
142189,,What is the force exerted by standard gravity ...,"The pound-force has a metric counterpart, less..."
142190,,What force leads to a commonly used unit of mass?,"The pound-force has a metric counterpart, less..."


In [137]:
refuted_df = pd.DataFrame(changed_df[df['text'] != changed_df['text']], columns=('document', 'label', 'question', 'text'))
refuted_df['label'] = 'REFUTED'
refuted_df

Unnamed: 0,document,label,question,text
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyonce start becoming popular?,in the late 1991s
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyonce leave Destiny's Child and bec...,2006
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,In which decade did Beyonce become famous?,late 1993s
8,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyoncé rise to fame?,late 1991s
11,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyoncé release Dangerously in Love?,2008
...,...,...,...,...
142103,The weak force is due to the exchange of the h...,REFUTED,At what temperature do weak and electromagneti...,approximately 1016 kelvins in excess of approx...
142128,Newton's laws and Newtonian mechanics in gener...,REFUTED,What does matter actually have that Newtonian ...,extended structure extended structure extended...
142160,A conservative force that acts on a closed sys...,REFUTED,What is the force between two locations relate...,difference in potential energy the difference ...
142175,The connection between macroscopic nonconserva...,REFUTED,What changes macroscopic closed system energies?,nonconservative forces internal energies of th...


In [138]:
refuted_df['claim'] = refuted_df['question'] + refuted_df['text']
refuted_df = refuted_df.drop(columns=['text', 'question'])
refuted_df

Unnamed: 0,document,label,claim
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyonce start becoming popular?in the...
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyonce leave Destiny's Child and bec...
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,In which decade did Beyonce become famous?late...
8,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyoncé rise to fame?late 1991s
11,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,REFUTED,When did Beyoncé release Dangerously in Love?2008
...,...,...,...
142103,The weak force is due to the exchange of the h...,REFUTED,At what temperature do weak and electromagneti...
142128,Newton's laws and Newtonian mechanics in gener...,REFUTED,What does matter actually have that Newtonian ...
142160,A conservative force that acts on a closed sys...,REFUTED,What is the force between two locations relate...
142175,The connection between macroscopic nonconserva...,REFUTED,What changes macroscopic closed system energie...


In [140]:
final_df = pd.concat([df, refuted_df], ignore_index=True)
final_df = final_df.drop(columns=['text'])
final_df

Unnamed: 0,label,document,claim
0,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?in the...
1,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...
2,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...
3,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up? H...
4,SUPPORTED,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?late...
...,...,...,...
165556,REFUTED,The weak force is due to the exchange of the h...,At what temperature do weak and electromagneti...
165557,REFUTED,Newton's laws and Newtonian mechanics in gener...,What does matter actually have that Newtonian ...
165558,REFUTED,A conservative force that acts on a closed sys...,What is the force between two locations relate...
165559,REFUTED,The connection between macroscopic nonconserva...,What changes macroscopic closed system energie...


In [141]:
final_df['label'].unique()

array(['SUPPORTED', 'NEI', 'REFUTED'], dtype=object)

In [142]:
ds = Dataset.from_dict({"document": final_df['document'],
                        "claim": final_df['claim'],
                        "label": final_df['label']
                        })
ds = ds.with_format("torch")
ds

Dataset({
    features: ['document', 'claim', 'label'],
    num_rows: 165561
})

In [143]:
ds[:5]

{'document': ['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1

In [144]:
ds.to_json('train.jsonl')

Creating json from Arrow format:   0%|          | 0/166 [00:00<?, ?ba/s]

Creating json from Arrow format: 100%|██████████| 166/166 [00:01<00:00, 141.24ba/s]


146619068

In [145]:
test_dataset = load_dataset("json", data_files='train.jsonl')
test_dataset

Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 111.40it/s]
Generating train split: 165561 examples [00:00, 399046.97 examples/s]


DatasetDict({
    train: Dataset({
        features: ['document', 'claim', 'label'],
        num_rows: 165561
    })
})