# Prepare dataset (jsonl file)

- Prepare datasets for fine-tuning GPT-3.5-turbo with **features as text** and **all-in-one strategy**.

- Here, the prompt provides the argument component (AC), context from its essay (the essay topic and the complete sentence containing the AC), and its structural (positional) features.

- We create the data files: `data_train_v2.jsonl`, `data_val_v2.jsonl`, `data_test_v2.jsonl`

## Libraries

In [1]:
import os
import json
import pandas as pd
import random

## Load csv file

In [2]:
# The input CSV is read from, and the generated jsonl files are written to,
# the "data" folder under the current working directory.
data_dir = os.path.join(os.getcwd(), "data")

In [3]:
# Load the persuasive-essays table; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv(os.path.join(data_dir, "persuasive_essays_dataset.csv"), index_col=0)

In [4]:
# df.isna().sum()

In [5]:
df.head()

Unnamed: 0,tag,label,start,end,argument_component,essay_file,essay_title,essay_text,sentence,nr_essay_paragraphs,paragraph_nr,paragraph,is_component_in_intro_paragraph,is_component_in_conclusion_paragraph,is_component_first_in_paragraph,is_component_last_in_paragraph,split,structral_featxt,argument_counter
0,T1,MajorClaim,503,575,we should attach more importance to cooperatio...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"From this point of view, I firmly believe that...",4,1,It is always said that competition can effecti...,1,0,1,1,TRAIN,Topic: Should students be taught to compete or...,1
1,T3,Claim,591,714,"through cooperation, children can learn about ...",essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"First of all, through cooperation, children ca...",4,2,"First of all, through cooperation, children ca...",0,0,1,0,TRAIN,Topic: Should students be taught to compete or...,2
2,T4,Premise,716,851,What we acquired from team work is not only ho...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,What we acquired from team work is not only ho...,4,2,"First of all, through cooperation, children ca...",0,0,0,0,TRAIN,Topic: Should students be taught to compete or...,3
3,T5,Premise,853,1086,"During the process of cooperation, children ca...",essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"During the process of cooperation, children ca...",4,2,"First of all, through cooperation, children ca...",0,0,0,0,TRAIN,Topic: Should students be taught to compete or...,4
4,T6,Premise,1088,1191,All of these skills help them to get on well w...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,All of these skills help them to get on well w...,4,2,"First of all, through cooperation, children ca...",0,0,0,1,TRAIN,Topic: Should students be taught to compete or...,5


In [6]:
len(df)

6089

In [7]:
df.split.value_counts()

split
TRAIN    4823
TEST     1266
Name: count, dtype: int64

In [8]:
# Essay files that belong to the TRAIN split. The list keeps the order produced
# by value_counts(); the seeded random.sample() further down picks by position,
# so this order must stay stable.
is_train_row = df["split"] == "TRAIN"
train_essays_l = df.loc[is_train_row, "essay_file"].value_counts().index.tolist()
len(train_essays_l)

322

In [9]:
# validation set: 10% of the training essays (rounded down)

# Derive the size from the essay list instead of hard-coding its length, so the
# split stays at 10% if the dataset changes. Integer arithmetic avoids float
# rounding. NOTE: run this before the cell that removes the validation essays
# from train_essays_l, otherwise the base count is already reduced.
val_size = len(train_essays_l) * 10 // 100
val_size

32

In [10]:
# Seed the global RNG so the validation essays are drawn reproducibly.
# The draw also depends on the order of train_essays_l.
random.seed(42)
val_essays_l = random.sample(train_essays_l, val_size)

In [11]:
len(val_essays_l)
# val_essays_l

32

In [12]:
# Remove the validation essays from the training essays.
# Filtering the list (instead of converting a set difference back to a list)
# keeps a reproducible order: iteration order of a set of strings can change
# between interpreter runs. dict.fromkeys() drops duplicates while preserving
# order, so the resulting elements match the original set difference.
val_essays_lookup = set(val_essays_l)
train_essays_l = [
    essay for essay in dict.fromkeys(train_essays_l)
    if essay not in val_essays_lookup
]
len(train_essays_l)

290

## Prepare prompt

In [13]:
# Dataset in chat completion format

def formatting_fct(task_description="", question="", answer="", mode="train"):
    """Build one example in the chat-completion ``messages`` format.

    Args:
        task_description: Text placed in the system message.
        question: Text placed in the user message.
        answer: Text placed in the assistant message when ``mode`` is "train".
        mode: "train" keeps ``answer``; any other value leaves the assistant
            message content empty.

    Returns:
        A dict with the single key "messages" holding the system, user and
        assistant messages, in that order.
    """
    # Only training-mode examples carry the answer.
    assistant_content = answer if mode == 'train' else ''

    system_msg = {"role": "system", "content": f"{task_description}"}
    user_msg = {"role": "user", "content": f"{question}"}
    assistant_msg = {"role": "assistant", "content": f"{assistant_content}"}

    return {"messages": [system_msg, user_msg, assistant_msg]}

In [14]:
# System prompt used for every example built in this notebook.
# NOTE(review): "arguement" is misspelled. The text is left untouched because it
# is written verbatim into the generated jsonl files — confirm before correcting.
my_task_description = """### Your task is to classify an arguement component from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:
1. The argument component itself
2. The topic of the essay
3. The complete sentence of this argument component
4. Positional information of this argument component
"""

In [15]:
print(my_task_description)

### Your task is to classify an arguement component from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:
1. The argument component itself
2. The topic of the essay
3. The complete sentence of this argument component
4. Positional information of this argument component



In [16]:
def bool_to_text(b):
    """Return "yes" when ``b`` equals 1, and "no" for any other value."""
    if b == 1:
        return "yes"
    return "no"

In [26]:
def build_question(argument_component, contextual_fts, structural_fts):
    """Compose the user prompt describing one argument component.

    Args:
        argument_component: Text of the argument component.
        contextual_fts: Indexable whose item 0 is rendered as the topic and
            item 1 as the complete sentence.
        structural_fts: Indexable whose item 0 is rendered as the paragraph
            number; items 1-4 are passed through ``bool_to_text`` and rendered
            as the answers to the introduction / conclusion /
            first-in-paragraph / last-in-paragraph questions.

    Returns:
        The multi-line prompt string, ending with the classification question.
    """
    topic = contextual_fts[0]
    sentence = contextual_fts[1]

    paragraph_nr = structural_fts[0]
    in_intro = bool_to_text(structural_fts[1])
    in_conclusion = bool_to_text(structural_fts[2])
    is_first = bool_to_text(structural_fts[3])
    is_last = bool_to_text(structural_fts[4])

    return f"""### Here is the information about the argument component:
1. Argument component: {argument_component}
2. Topic: {topic}
3. Complete sentence: {sentence}
4. Positional information: Paragraph number: {paragraph_nr}. Is it in the introduction paragraph: {in_intro}. Is it in the conclusion paragraph: {in_conclusion}. Is it the first component of its paragraph: {is_first}. Is it the last component of its paragraph: {is_last}.
Is the argument compoment given in point 1 a major claim, a claim, or a premise? No other answer besides these three is accepted.
"""

In [27]:
df.columns

Index(['tag', 'label', 'start', 'end', 'argument_component', 'essay_file',
       'essay_title', 'essay_text', 'sentence', 'nr_essay_paragraphs',
       'paragraph_nr', 'paragraph', 'is_component_in_intro_paragraph',
       'is_component_in_conclusion_paragraph',
       'is_component_first_in_paragraph', 'is_component_last_in_paragraph',
       'split', 'structral_featxt', 'argument_counter'],
      dtype='object')

In [28]:
# Build the prompt for the first row of the table as a quick sanity check.
first_row = df.iloc[0]

argument_component = first_row.argument_component
contextual_fts = (first_row.essay_title, first_row.sentence)
structural_fts = tuple(
    first_row[column]
    for column in (
        "paragraph_nr",
        "is_component_in_intro_paragraph",
        "is_component_in_conclusion_paragraph",
        "is_component_first_in_paragraph",
        "is_component_last_in_paragraph",
    )
)

question = build_question(argument_component, contextual_fts, structural_fts)
print(question)

### Here is the information about the argument component:
1. Argument component: we should attach more importance to cooperation during primary education
2. Topic: Should students be taught to compete or to cooperate?
3. Complete sentence: From this point of view, I firmly believe that we should attach more importance to cooperation during primary education.
4. Positional information: Paragraph number: 1. Is it in the introduction paragraph: yes. Is it in the conclusion paragraph: no. Is it the first component of its paragraph: yes. Is it the last component of its paragraph: yes.
Is the argument compoment given in point 1 a major claim, a claim, or a premise? No other answer besides these three is accepted.



In [29]:
def build_answer(x):
    """Map a dataset label to the answer text used in the prompts.

    "MajorClaim" -> "major claim", "Claim" -> "claim", "Premise" -> "premise".
    Any other value yields None.
    """
    label_to_answer = (
        ("MajorClaim", "major claim"),
        ("Claim", "claim"),
        ("Premise", "premise"),
    )

    for label, answer_text in label_to_answer:
        if x == label:
            return answer_text

    # Unrecognised labels fall through without an answer.
    return None

In [30]:
answer = build_answer(df.iloc[0].label)
print(answer)

major claim


In [31]:
print(formatting_fct(my_task_description, question, answer, mode="train"))

{'messages': [{'role': 'system', 'content': '### Your task is to classify an arguement component from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\n1. The argument component itself\n2. The topic of the essay\n3. The complete sentence of this argument component\n4. Positional information of this argument component\n'}, {'role': 'user', 'content': '### Here is the information about the argument component:\n1. Argument component: we should attach more importance to cooperation during primary education\n2. Topic: Should students be taught to compete or to cooperate?\n3. Complete sentence: From this point of view, I firmly believe that we should attach more importance to cooperation during primary education.\n4. Positional information: Paragraph number: 1. Is it in the introduction paragraph: yes. Is it in the conclusion paragraph: no. Is it the first component of its paragraph: yes. Is it the last co

## Prepare data files

### Train set

In [32]:
data_file_train = []

# Rows of the essays kept for training.
train_rows = df[df["essay_file"].isin(train_essays_l)]

# iterrows() yields (index label, row). Read every field from the yielded row
# rather than passing the label to the positional df.iloc, which is only
# correct while the index is exactly 0..n-1 and rebuilds the row on each access.
for _, row in train_rows.iterrows():

    argument_component = row["argument_component"]
    contextual_fts = (row["essay_title"], row["sentence"])
    structural_fts = (row["paragraph_nr"],
                      row["is_component_in_intro_paragraph"],
                      row["is_component_in_conclusion_paragraph"],
                      row["is_component_first_in_paragraph"],
                      row["is_component_last_in_paragraph"])

    question = build_question(argument_component, contextual_fts, structural_fts)
    answer = build_answer(row["label"])

    # mode="train" keeps the answer in the assistant message.
    data_file_train.append(formatting_fct(my_task_description, question, answer, mode="train"))

In [33]:
len(data_file_train)

4307

In [34]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'messages': [{'role': 'system', 'content': '### Your task is to classify an arguement component from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\n1. The argument component itself\n2. The topic of the essay\n3. The complete sentence of this argument component\n4. Positional information of this argument component\n'}, {'role': 'user', 'content': '### Here is the information about the argument component:\n1. Argument component: we should attach more importance to cooperation during primary education\n2. Topic: Should students be taught to compete or to cooperate?\n3. Complete sentence: From this point of view, I firmly believe that we should attach more importance to cooperation during primary education.\n4. Positional information: Paragraph number: 1. Is it in the introduction paragraph: yes. Is it in the conclusion paragraph: no. Is it the first component of its paragraph: yes. Is it the last co

### Validation set

In [37]:
data_file_val = []

# Rows of the essays sampled for validation.
val_rows = df[df["essay_file"].isin(val_essays_l)]

# iterrows() yields (index label, row). Read every field from the yielded row
# rather than passing the label to the positional df.iloc, which is only
# correct while the index is exactly 0..n-1 and rebuilds the row on each access.
for _, row in val_rows.iterrows():

    argument_component = row["argument_component"]
    contextual_fts = (row["essay_title"], row["sentence"])
    structural_fts = (row["paragraph_nr"],
                      row["is_component_in_intro_paragraph"],
                      row["is_component_in_conclusion_paragraph"],
                      row["is_component_first_in_paragraph"],
                      row["is_component_last_in_paragraph"])

    question = build_question(argument_component, contextual_fts, structural_fts)
    answer = build_answer(row["label"])

    # Validation examples keep the answer as well, hence mode="train".
    data_file_val.append(formatting_fct(my_task_description, question, answer, mode="train"))

In [38]:
len(data_file_val)

516

In [39]:
for i in range(3):
    
    print(data_file_val[i])
    print()

{'messages': [{'role': 'system', 'content': '### Your task is to classify an arguement component from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\n1. The argument component itself\n2. The topic of the essay\n3. The complete sentence of this argument component\n4. Positional information of this argument component\n'}, {'role': 'user', 'content': '### Here is the information about the argument component:\n1. Argument component: it has contributed to the economic development as well as preserved the culture and environment of the tourist destinations\n2. Topic: International tourism is now more common than ever before\n3. Complete sentence: While some people might think that this international tourism has negative effects on the destination countries, I would contend that it has contributed to the economic development as well as preserved the culture and environment of the tourist destinations.\n4.

### Test set

In [40]:
data_file_test = []

# Rows of the TEST split.
test_rows = df[df["split"] == "TEST"]

# iterrows() yields (index label, row). Read every field from the yielded row
# rather than passing the label to the positional df.iloc, which is only
# correct while the index is exactly 0..n-1 and rebuilds the row on each access.
for _, row in test_rows.iterrows():

    argument_component = row["argument_component"]
    contextual_fts = (row["essay_title"], row["sentence"])
    structural_fts = (row["paragraph_nr"],
                      row["is_component_in_intro_paragraph"],
                      row["is_component_in_conclusion_paragraph"],
                      row["is_component_first_in_paragraph"],
                      row["is_component_last_in_paragraph"])

    question = build_question(argument_component, contextual_fts, structural_fts)
    answer = build_answer(row["label"])

    # mode="test" leaves the assistant message content empty.
    data_file_test.append(formatting_fct(my_task_description, question, answer, mode="test"))

In [41]:
len(data_file_test)

1266

In [42]:
for i in range(3):
    
    print(data_file_test[i])
    print()

{'messages': [{'role': 'system', 'content': '### Your task is to classify an arguement component from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\n1. The argument component itself\n2. The topic of the essay\n3. The complete sentence of this argument component\n4. Positional information of this argument component\n'}, {'role': 'user', 'content': '### Here is the information about the argument component:\n1. Argument component: the tourism bring large profit for the destination countries\n2. Topic: International tourism is now more common than ever before\n3. Complete sentence: While some might think the tourism bring large profit for the destination countries, I would contend that this industry has affected the cultural attributes and damaged the natural environment of the tourist destinations.\n4. Positional information: Paragraph number: 1. Is it in the introduction paragraph: yes. Is it in the

## Save `jsonl` files

In [43]:
file_name = "data_train_v2.jsonl"
train_jsonl_path = os.path.join(data_dir, file_name)

# jsonl layout: one JSON-serialised example per line.
with open(train_jsonl_path, 'w') as out_file:
    for example in data_file_train:
        out_file.write(json.dumps(example) + '\n')

In [44]:
file_name = "data_val_v2.jsonl"
val_jsonl_path = os.path.join(data_dir, file_name)

# jsonl layout: one JSON-serialised example per line.
with open(val_jsonl_path, 'w') as out_file:
    for example in data_file_val:
        out_file.write(json.dumps(example) + '\n')

In [45]:
file_name = "data_test_v2.jsonl"
test_jsonl_path = os.path.join(data_dir, file_name)

# jsonl layout: one JSON-serialised example per line.
with open(test_jsonl_path, 'w') as out_file:
    for example in data_file_test:
        out_file.write(json.dumps(example) + '\n')