## Make connectives corpus

- load and preprocess the stimuli from the tab-separated `.json` file
- create stimuli for causal and concessive connective minimal pairs
- export to `.csv`

In [2]:
import re
import pandas as pd

In [3]:
# The stimuli file is parsed as tab-separated text even though it carries a .json extension.
df = pd.read_csv("english_stimuli_connector_2014_drenhaus_et_al.json", sep='\t')

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,sentence,target,connective_type
0,0,John is thinking about going to see the latest...,Therefore,causal
1,1,John is thinking about going to see the latest...,Nevertheless,concessive
2,2,Mr. Brown was planning to look for new glasses...,Therefore,causal
3,3,Mr. Brown was planning to look for new glasses...,Nevertheless,concessive
4,4,Stan is thinking about jobs; he would like to ...,Nevertheless,concessive


`target` = 'Therefore'

In [11]:
# Show the complete, untruncated sentence of the first row.
df["sentence"].iloc[0]

'John is thinking about going to see the latest movie or to listen to some famous arias. He would like to hear some great tenors and sopranos.   [MASK] he buys tickets for  an opera in the city center. '

`target` = 'Nevertheless'

In [12]:
# Show the complete, untruncated sentence of the second row.
df["sentence"].iloc[1]

'John is thinking about going to see the latest movie or to listen to some famous arias. He would like to hear some great tenors and sopranos.   [MASK] he buys tickets for  a cinema in the city center. '

### Preprocessing

Function to clean redundant whitespace

In [19]:
def clean_whitespace(text):
    """Collapse each run of whitespace in `text` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

Iterate through pairs of rows and create stimuli

In [14]:
def _build_stimulus(row, partner):
    """Build one minimal-pair record from a corpus row.

    The row's sentence is split at its '[MASK]' placeholder: the text before
    the placeholder becomes the prefix, and the text after it is prepended
    with a connective to form the two continuations.

    Parameters
    ----------
    row : pandas.Series
        Row providing 'id', 'sentence', 'target' and 'connective_type'.
    partner : pandas.Series or None
        The other row of the pair; its 'target' is the connective used for
        the bad continuation. None when the row has no partner.

    Returns
    -------
    dict
        Keys 'item_id', 'prefix', 'good_continuation', 'bad_continuation'
        and 'category'. 'bad_continuation' is "" when `partner` is None.

    Raises
    ------
    ValueError
        If the sentence does not contain exactly one '[MASK]'.
    """
    sentence = clean_whitespace(row['sentence'])
    parts = sentence.split('[MASK]')
    if len(parts) != 2:
        # Fail with a message that names the offending item instead of an
        # opaque tuple-unpacking error.
        raise ValueError(
            f"item {row['id']}: expected exactly one '[MASK]' in sentence, "
            f"found {len(parts) - 1}"
        )
    prefix, suffix = (clean_whitespace(part) for part in parts)

    def continuation(connective):
        # Prepend the connective, then drop any trailing period(s).
        return f"{connective} {suffix}".strip().rstrip('.')

    return {
        'item_id': row['id'],
        'prefix': prefix,
        'good_continuation': continuation(row['target']),
        'bad_continuation': continuation(partner['target']) if partner is not None else "",
        'category': row['connective_type'],
    }


collected_data = []

# Consecutive rows (0 and 1, 2 and 3, ...) are treated as a pair; within a pair
# each row's own target gives the good continuation and the partner's target
# gives the bad one.
for start in range(0, len(df), 2):
    first = df.iloc[start]
    # An odd number of rows leaves the last row without a partner.
    second = df.iloc[start + 1] if start + 1 < len(df) else None

    collected_data.append(_build_stimulus(first, second))
    if second is not None:
        collected_data.append(_build_stimulus(second, first))


Convert the collected data into a DataFrame

In [17]:
# One row per collected record; the dict keys become the column names.
processed_df = pd.DataFrame(data=collected_data)

In [18]:
# Preview the first five generated stimuli.
processed_df.head(n=5)

Unnamed: 0,item_id,prefix,good_continuation,bad_continuation,category
0,0,John is thinking about going to see the latest...,Therefore he buys tickets for an opera in the ...,Nevertheless he buys tickets for an opera in t...,causal
1,1,John is thinking about going to see the latest...,Nevertheless he buys tickets for a cinema in t...,Therefore he buys tickets for a cinema in the ...,concessive
2,2,Mr. Brown was planning to look for new glasses...,Therefore he now heads towards an optician tha...,Nevertheless he now heads towards an optician ...,causal
3,3,Mr. Brown was planning to look for new glasses...,Nevertheless he now heads towards a shoe shop ...,Therefore he now heads towards a shoe shop tha...,concessive
4,4,Stan is thinking about jobs; he would like to ...,Nevertheless he decides that he wants to becom...,Therefore he decides that he wants to become a...,concessive


Export the dataframe to `corpus.csv`

In [None]:
# index=False keeps the frame's positional index out of the exported file.
processed_df.to_csv(path_or_buf="corpus.csv", index=False)