In [18]:
import pandas as pd

# Pick the data directory: the mounted Google Drive folder when running on
# Colab, the local ``data/`` folder otherwise.
try:
    from google.colab import drive
except ImportError:
    # google.colab could not be imported: fall back to the local files.
    data_dir = 'data'
else:
    # Mount outside the ``try`` so a genuine mount failure is reported instead
    # of being silently replaced by (possibly missing) local paths.
    drive.mount('/content/gdrive')
    data_dir = '/content/gdrive/MyDrive/advanced-ml-project/data'

# Input splits (tab-separated) and the output file for the NLI-style dataset.
train_path = f'{data_dir}/train.tsv'
test_path = f'{data_dir}/test.tsv'
dev_path = f'{data_dir}/dev.tsv'

transformed_data_path = f'{data_dir}/nli.csv'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load Data

In [19]:
# Fixed seed so the per-split shuffles below give the same order on every run.
LOAD_SEED = 42

# Read the train, test and dev TSV files (first row is the header), shuffle
# each one independently, then stack them in that order into a single frame.
shuffled_splits = [
    pd.read_csv(split_path, sep='\t', header=0).sample(frac=1, random_state=LOAD_SEED)
    for split_path in (train_path, test_path, dev_path)
]
# ignore_index=True gives the combined frame a unique 0..N-1 index instead of
# repeating each split's own 0-based index.
data = pd.concat(shuffled_splits, axis=0, ignore_index=True)

print("Length:", len(data))
print(data.label.value_counts())
data.head(10)

Length: 16632
moderate          10494
not depression     4649
severe             1489
Name: label, dtype: int64


Unnamed: 0,PID,text,label
0,train_pid_3966,I don’t care anymore : Basically I just feel e...,moderate
1,train_pid_2260,I've rationalized suicide : A little about me ...,moderate
2,train_pid_1285,Losing hope : I feel like such a burden...\n\n...,moderate
3,train_pid_7571,It should be a joke : No matter how great you ...,not depression
4,train_pid_473,Getting back to normal life : I've been gettin...,moderate
5,train_pid_1999,"hate being so lonely : shit sucks, new decade ...",moderate
6,train_pid_2650,I don't have real problems : I know I don't ha...,moderate
7,train_pid_4151,Why does no one realize that depression starts...,moderate
8,train_pid_5863,Wasted 5 years of my life in a pointless degre...,moderate
9,train_pid_6643,How can I (non-religious) help my girlfriend (...,not depression


## Create split

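For each label, the first half of the items become `entailment` examples (the hypothesis states the item's true label); the remaining half become `not_entailment` examples (the hypothesis states one of the other two labels).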

In [20]:
# Partition the combined frame into one sub-frame per depression label.
moderate = data.loc[data['label'].eq('moderate')]
not_depression = data.loc[data['label'].eq('not depression')]
severe = data.loc[data['label'].eq('severe')]

# Report the size of each class, in the order moderate / not depression / severe.
for class_frame in (moderate, not_depression, severe):
    print(len(class_frame))

10494
4649
1489


In [21]:
# Cut the 'moderate' rows in two: the leading half yields entailment pairs,
# the trailing half yields not_entailment pairs.
split_index = len(moderate) // 2
first_half, second_half = moderate.iloc[:split_index], moderate.iloc[split_index:]

print(len(first_half))
print(len(second_half))

prompt = 'The level of depression is {}.'

# Entailment: the hypothesis states each row's own label.
entailed_hypotheses = [prompt.format(label) for label in first_half['label']]

# Not-entailment: rows whose index label is even get 'severe', odd ones get
# 'not depression'.
negative_hypotheses = [
    prompt.format('severe' if idx % 2 == 0 else 'not depression')
    for idx in second_half.index
]

transformed_moderate = {
    'premise': list(first_half['text']) + list(second_half['text']),
    'hypothesis': entailed_hypotheses + negative_hypotheses,
    'label': ['entailment'] * len(first_half) + ['not_entailment'] * len(second_half),
}

print(len(transformed_moderate['label']))

5247
5247
10494


In [22]:
# Cut the 'not depression' rows in two: the leading half yields entailment
# pairs, the trailing half yields not_entailment pairs.
split_index = len(not_depression) // 2
first_half, second_half = not_depression.iloc[:split_index], not_depression.iloc[split_index:]

print(len(first_half))
print(len(second_half))

prompt = 'The level of depression is {}.'

# Entailment: the hypothesis states each row's own label.
entailed_hypotheses = [prompt.format(label) for label in first_half['label']]

# Not-entailment: rows whose index label is even get 'severe', odd ones get
# 'moderate'.
negative_hypotheses = [
    prompt.format('severe' if idx % 2 == 0 else 'moderate')
    for idx in second_half.index
]

transformed_not_depression = {
    'premise': list(first_half['text']) + list(second_half['text']),
    'hypothesis': entailed_hypotheses + negative_hypotheses,
    'label': ['entailment'] * len(first_half) + ['not_entailment'] * len(second_half),
}

print(len(transformed_not_depression['label']))

2324
2325
4649


In [23]:
# Cut the 'severe' rows in two: the leading half yields entailment pairs,
# the trailing half yields not_entailment pairs.
split_index = len(severe) // 2
first_half, second_half = severe.iloc[:split_index], severe.iloc[split_index:]

print(len(first_half))
print(len(second_half))

prompt = 'The level of depression is {}.'

# Entailment: the hypothesis states each row's own label.
entailed_hypotheses = [prompt.format(label) for label in first_half['label']]

# Not-entailment: rows whose index label is even get 'not depression', odd
# ones get 'moderate'.
negative_hypotheses = [
    prompt.format('not depression' if idx % 2 == 0 else 'moderate')
    for idx in second_half.index
]

transformed_severe = {
    'premise': list(first_half['text']) + list(second_half['text']),
    'hypothesis': entailed_hypotheses + negative_hypotheses,
    'label': ['entailment'] * len(first_half) + ['not_entailment'] * len(second_half),
}

print(len(transformed_severe['label']))

744
745
1489


In [24]:
# Fixed seed for the final shuffle so the exported dataset is the same on
# every run.
NLI_SHUFFLE_SEED = 42

# Stack the three per-label example sets (moderate, not depression, severe)
# into one frame with the columns premise / hypothesis / label.
transformed_parts = [
    pd.DataFrame(part)
    for part in (transformed_moderate, transformed_not_depression, transformed_severe)
]
transformed_data = pd.concat(transformed_parts, ignore_index=True)

# Shuffle the rows so the per-label groups are interleaved, then renumber
# them 0..N-1.
transformed_data = (
    transformed_data
    .sample(frac=1, random_state=NLI_SHUFFLE_SEED)
    .reset_index(drop=True)
)
print(len(transformed_data))
print(transformed_data.label.value_counts())
transformed_data.head(10)

16632
not_entailment    8317
entailment        8315
Name: label, dtype: int64


Unnamed: 0,premise,hypothesis,label
0,Is anyone just tired of crying or just feeling...,The level of depression is not depression.,not_entailment
1,Everything I do feels pointless. Help me overc...,The level of depression is severe.,not_entailment
2,I wish I was never born : All people on my own...,The level of depression is not depression.,entailment
3,I have way too many mental (and other) problem...,The level of depression is severe.,entailment
4,"I should be an actor, and for the wrong reason...",The level of depression is severe.,not_entailment
5,Ever wonder when your family will give up on y...,The level of depression is moderate.,entailment
6,Everybody has left me : I don't know why I am ...,The level of depression is moderate.,entailment
7,My illness is screwing my life up : My depress...,The level of depression is severe.,not_entailment
8,I wish : I wish I could meet all of you and ju...,The level of depression is moderate.,not_entailment
9,How to cope with nostalgia : I don’t really fe...,The level of depression is severe.,not_entailment


In [25]:
# Write the premise / hypothesis / label table to disk without the row index.
transformed_data.to_csv(path_or_buf=transformed_data_path, index=False)