In [144]:
import json
import pandas as pd
import random
import os

LABELS_PATH = '../data/persona_labels/raw/exported_labels.json'
SAVE_LABELS_PATH = '../data/persona_labels/interim/labels.jsonl'

In [145]:
labels = json.load(open(LABELS_PATH))

In [146]:
labels_data = []
for label in labels:
    labels_data.append({'text': label['data']['text'], 
                        'labels': label['annotations'][0]['result'][0]['value']['choices']})
labels_data = pd.DataFrame(labels_data)
labels_data = labels_data.explode('labels')
labels_data = labels_data.groupby('text')['labels'].apply(set).apply(list)
labels_data = labels_data.reset_index()
labels_data = labels_data.explode('labels')
labels_data

Unnamed: 0,text,labels
0,A band I like mentioned him in their lyrics.,Experiences
1,A lot of my family members are teachers.,Relationship
1,A lot of my family members are teachers.,Experiences
2,Alfred Hitchcock is my favorite old movie dire...,Characteristics
3,Animals markings inspire me.,Characteristics
...,...,...
3019,red is my favorite color.,Characteristics
3020,thinks cousin would like to meet speaker two.,Relationship
3020,thinks cousin would like to meet speaker two.,Goals or Plans
3021,volunteers at animal shelters often,


In [147]:
def get_stats(df):
    cnt = df['labels'].value_counts()
    distr = cnt / cnt.sum()
    distr = 1 - distr
    return distr.to_dict()

def get_samples(x, df=None, n_samples=10000):
    labels = x.unique()
    filtered_df = df[~df['labels'].isin(labels)]
    distr = get_stats(filtered_df)
    weights = filtered_df['labels'].map(distr)
    weights = weights / weights.sum()
    samples = []
    for label in labels:
        # indicies = random.choices(filtered_df.index, weights.tolist(), k=n_samples)
        # negative_samples = filtered_df.loc[indicies]
        negative_samples = filtered_df.copy()
        negative_samples['labels'] = label
        negative_samples['target'] = 0
        negative_samples = negative_samples.to_dict(orient='records')
        samples += negative_samples
    positive_samples = pd.DataFrame({'text': [x.name]*len(labels), 'labels': labels, 'target': 1})
    positive_samples = positive_samples.to_dict(orient='records')
    samples += positive_samples
    return samples

samples = labels_data.groupby('text')['labels'].apply(lambda x: get_samples(x, labels_data)).tolist()
samples = [sample for samples in samples for sample in samples]
samples = pd.DataFrame(samples)
samples = samples.drop_duplicates(['text', 'labels', 'target'])
samples = samples.to_dict(orient='records')
len(samples)

18954

In [148]:
labels_data[labels_data['text'] == "I play football."]

Unnamed: 0,text,labels
1907,I play football.,Routines or Habits


In [149]:
pd.DataFrame(samples)['labels'].value_counts()

labels
Characteristics       3276
Experiences           3231
Routines or Habits    3203
Relationship          3167
Goals or Plans        3054
None                  3023
Name: count, dtype: int64

In [150]:
if os.path.exists(SAVE_LABELS_PATH):
    os.remove(SAVE_LABELS_PATH)

with open(SAVE_LABELS_PATH, 'a+') as f:
    for line in samples:
        f.write(json.dumps(line, ensure_ascii=False) + '\n')