In [1]:
from pathlib import Path

import pandas as pd
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

In [2]:
# Never truncate cell contents: show the full text of every column
pd.options.display.max_colwidth = None

# Add incremental ID to records

In [3]:
# Load validation.csv from the dataset folder one level above the notebook
df = pd.read_csv('../dataset/validation.csv')

# Preview the loaded frame to check its columns. The whole frame is handed to
# display(), so pandas decides how many rows are actually rendered.
display(df)

Unnamed: 0,text,label,label_name
0,im feeling quite sad and sorry for myself but ill snap out of it soon,0,sadness
1,i feel like i am still looking at a blank canvas blank pieces of paper,0,sadness
2,i feel like a faithful servant,2,love
3,i am just feeling cranky and blue,3,anger
4,i can have for a treat or if i am feeling festive,1,joy
...,...,...,...
1995,im having ssa examination tomorrow in the morning im quite well prepared for the coming exam and somehow i feel numb towards exam because in life there is much more important things than exam,0,sadness
1996,i constantly worry about their fight against nature as they push the limits of their inner bodies for the determination of their outer existence but i somehow feel reassured,1,joy
1997,i feel its important to share this info for those that experience the same thing,1,joy
1998,i truly feel that if you are passionate enough about something and stay true to yourself you will succeed,1,joy


In [4]:
# Number the records 1..N in their current row order and store that as 'ID'
df = df.assign(ID=range(1, len(df) + 1))

# Preview the frame to confirm the new column is present
display(df)

Unnamed: 0,text,label,label_name,ID
0,im feeling quite sad and sorry for myself but ill snap out of it soon,0,sadness,1
1,i feel like i am still looking at a blank canvas blank pieces of paper,0,sadness,2
2,i feel like a faithful servant,2,love,3
3,i am just feeling cranky and blue,3,anger,4
4,i can have for a treat or if i am feeling festive,1,joy,5
...,...,...,...,...
1995,im having ssa examination tomorrow in the morning im quite well prepared for the coming exam and somehow i feel numb towards exam because in life there is much more important things than exam,0,sadness,1996
1996,i constantly worry about their fight against nature as they push the limits of their inner bodies for the determination of their outer existence but i somehow feel reassured,1,joy,1997
1997,i feel its important to share this info for those that experience the same thing,1,joy,1998
1998,i truly feel that if you are passionate enough about something and stay true to yourself you will succeed,1,joy,1999


In [5]:
# Store the ID-augmented DataFrame in a new CSV file so it can be compared with
# the answers given by human participants once the research is completed.
# index=False keeps the pandas row index out of the file.
df.to_csv('human_vs_machine_experiment.csv', index=False)

# Generate research sheets

In [6]:
# Reload the experiment CSV (records plus their 'ID' column) from disk
df = pd.read_csv('human_vs_machine_experiment.csv')

In [7]:
# Collect each distinct 'label_name' value once, in order of first appearance
possible_emotions = df['label_name'].drop_duplicates().tolist()

print(possible_emotions)

['sadness', 'love', 'anger', 'joy', 'fear', 'surprise']


In [8]:
# Select just the two columns used on the research sheets: 'ID' and 'text'
df_filtered = df.loc[:, ['ID', 'text']]

In [9]:
# Preview the ID/text frame to check that only those two columns remain
display(df_filtered)

Unnamed: 0,ID,text
0,1,im feeling quite sad and sorry for myself but ill snap out of it soon
1,2,i feel like i am still looking at a blank canvas blank pieces of paper
2,3,i feel like a faithful servant
3,4,i am just feeling cranky and blue
4,5,i can have for a treat or if i am feeling festive
...,...,...
1995,1996,im having ssa examination tomorrow in the morning im quite well prepared for the coming exam and somehow i feel numb towards exam because in life there is much more important things than exam
1996,1997,i constantly worry about their fight against nature as they push the limits of their inner bodies for the determination of their outer existence but i somehow feel reassured
1997,1998,i feel its important to share this info for those that experience the same thing
1998,1999,i truly feel that if you are passionate enough about something and stay true to yourself you will succeed


In [10]:
# Number of groups (one research sheet is generated per group)
num_groups = 40

# Split the DataFrame into at most `num_groups` consecutive groups of near-equal size.
# Plain floor division would create an extra trailing group whenever the row count
# is not a multiple of `num_groups`, and would fail with a zero range step when
# there are fewer rows than groups. Instead, the first `leftover` groups take one
# extra record each, and groups that would be empty are skipped.
group_size, leftover = divmod(len(df_filtered), num_groups)
group_bounds = [n * group_size + min(n, leftover) for n in range(num_groups + 1)]
groups = [
    df_filtered.iloc[start:stop]
    for start, stop in zip(group_bounds, group_bounds[1:])
    if stop > start
]

# Define the helpers and the function that create a Word document for each group
def _shade_cell(cell, fill_hex: str) -> None:
    """Give a table cell a solid background colour (hex RGB without '#')."""
    shd = OxmlElement('w:shd')
    # w:val is a required attribute of w:shd; "clear" means no pattern,
    # so only the fill colour is visible.
    shd.set(qn('w:val'), "clear")
    shd.set(qn('w:color'), "auto")
    shd.set(qn('w:fill'), fill_hex)
    cell._tc.get_or_add_tcPr().append(shd)


def _repeat_as_header(row) -> None:
    """Set a table row to repeat as header row on each page."""
    tbl_header = OxmlElement('w:tblHeader')
    tbl_header.set(qn('w:val'), "true")
    row._tr.get_or_add_trPr().append(tbl_header)


def _keep_row_together(row) -> None:
    """Set a table row to not split across pages."""
    row._tr.get_or_add_trPr().append(OxmlElement('w:cantSplit'))


def create_word_file(group_number: int, group_data: pd.DataFrame, possible_emotions: list,
                     output_dir: str = 'research_sheets') -> None:
    """Write one research sheet (.docx) for a group of records.

    The document contains a title, instructions in the first-page header, a table
    with one row per record ('ID', 'Text' and one empty answer column per emotion),
    and closing notes for the participant.

    Args:
        group_number: Sheet number, used in the title and in the file name.
        group_data: Records for this sheet; must have 'ID' and 'text' columns.
        possible_emotions: Emotion names, one answer column is added for each.
        output_dir: Folder the file is written to; created if it does not exist.

    Returns:
        None. The document is saved as
        ``<output_dir>/research_sheet__<group_number>.docx``.
    """
    # Create a Word document
    doc = Document()

    # Add Research Sheet nr.
    doc.add_heading(f"Research Sheet nr. {group_number}", 0)

    # Add instructions on how to contribute to this research
    section = doc.sections[0]
    section.different_first_page_header_footer = True
    first_page_header = section.first_page_header
    instructions = [
        # Report the real number of records instead of a hard-coded count
        f"This research sheet contains {len(group_data)} records each with its own ID and text.",
        "Your task is to classify the emotion that the text might trigger in you by choosing ONLY 1 of the specified feelings.",
        "Please use X in the cell corresponding to your classification.",
    ]
    # The `in` test on doc.styles matches the style *name* ("List Bullet"), not the
    # style id ("ListBullet"), so the name must be used for the bullets to apply.
    bullet_style = 'List Bullet' if 'List Bullet' in doc.styles else None
    for instruction in instructions:
        first_page_header.add_paragraph(instruction, style=bullet_style)

    # Add a table with a header row: ID, Text, then one column per emotion
    headers = ['ID', 'Text'] + list(possible_emotions)
    table = doc.add_table(rows=1, cols=len(headers))
    table.style = 'Table Grid'

    header_row = table.rows[0]
    for cell, title in zip(header_row.cells, headers):
        cell.paragraphs[0].add_run(title).bold = True  # Make text bold
        _shade_cell(cell, "D9D9D9")  # Light gray background (hex color)

    # Repeat header on each page
    _repeat_as_header(header_row)

    # Add each record in the group to the table and prevent rows from splitting across pages
    for record_id, record_text in zip(group_data['ID'], group_data['text']):
        table_row = table.add_row()
        row_cells = table_row.cells
        row_cells[0].text = str(record_id)
        # Cell text must be a string; a missing text value becomes an empty cell
        row_cells[1].text = '' if pd.isna(record_text) else str(record_text)
        _keep_row_together(table_row)

    # Add participant notes on the research, and optional e-mail (or any other contact information) for further questions.
    doc.add_heading("\nThank you for participating in this research!", 0)
    doc.add_heading("\nIf you have any suggestions on improving this research, feel free to let us know by writing your thoughts below and we sure value your opinion.", 0)

    # Save the Word document, creating the output folder on a fresh checkout
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    doc.save(str(target_dir / f'research_sheet__{group_number}.docx'))

# Generate one research sheet per group, numbering the sheets from 1
for sheet_number, sheet_records in enumerate(groups, start=1):
    create_word_file(sheet_number, sheet_records, possible_emotions)

print("All Word files have been created.")

All Word files have been created.
