In [2]:
import json
import pandas as pd
import random
import os
from sklearn.model_selection import train_test_split

TRAIN_LABELS_PATH = '../data/persona_labels/interim/train.jsonl'
VALID_LABELS_PATH = '../data/persona_labels/interim/val.jsonl'

In [3]:
train = [json.loads(line) for line in open(TRAIN_LABELS_PATH).readlines()]
valid = [json.loads(line) for line in open(VALID_LABELS_PATH).readlines()]

In [8]:
train = pd.DataFrame(train)
valid = pd.DataFrame(valid)

train = train.explode('labels')
valid = valid.explode('labels')

In [9]:
def get_stats(df):
    cnt = df['labels'].value_counts()
    distr = cnt / cnt.sum()
    distr = 1 - distr
    return distr.to_dict()

def get_samples(x, df=None, n_samples=10000):
    labels = x.unique()
    filtered_df = df[~df['labels'].isin(labels)]
    distr = get_stats(filtered_df)
    weights = filtered_df['labels'].map(distr)
    weights = weights / weights.sum()
    samples = []
    for label in labels:
        # indicies = random.choices(filtered_df.index, weights.tolist(), k=n_samples)
        # negative_samples = filtered_df.loc[indicies]
        negative_samples = filtered_df.copy()
        negative_samples['labels'] = label
        negative_samples['target'] = 0
        negative_samples = negative_samples.to_dict(orient='records')
        samples += negative_samples
    positive_samples = pd.DataFrame({'text': [x.name]*len(labels), 'labels': labels, 'target': 1})
    positive_samples = positive_samples.to_dict(orient='records')
    samples += positive_samples
    return samples

samples = train.groupby('text')['labels'].apply(lambda x: get_samples(x, train)).tolist()
samples = [sample for samples in samples for sample in samples]
samples = pd.DataFrame(samples)
samples = samples.drop_duplicates(['text', 'labels', 'target'])
samples = samples.to_dict(orient='records')
len(samples)

15148

In [10]:
valid['target'] = 1
valid_samples = valid.to_dict(orient='records')

In [13]:
pd.DataFrame(samples)['labels'].value_counts()

labels
Characteristics       2619
Experiences           2583
Routines or Habits    2559
Relationship          2531
Goals or Plans        2440
None                  2416
Name: count, dtype: int64

In [16]:
pd.DataFrame(valid_samples)['labels'].value_counts()

labels
Experiences           263
Characteristics       218
Routines or Habits     87
Goals or Plans         76
Relationship           32
None                    9
Name: count, dtype: int64

In [18]:
NLI_TRAIN_SAVE_PATH = '../data/persona_labels/interim/nli_train.jsonl'
NLI_VALID_SAVE_PATH = '../data/persona_labels/interim/nli_val.jsonl'

if os.path.exists(NLI_TRAIN_SAVE_PATH):
    os.remove(NLI_TRAIN_SAVE_PATH)

with open(NLI_TRAIN_SAVE_PATH, 'a+') as f:
    for line in samples:
        f.write(json.dumps(line, ensure_ascii=False) + '\n')

if os.path.exists(NLI_VALID_SAVE_PATH):
    os.remove(NLI_VALID_SAVE_PATH)

with open(NLI_VALID_SAVE_PATH, 'a+') as f:
    for line in valid_samples:
        f.write(json.dumps(line, ensure_ascii=False) + '\n')