In [None]:
import os
import pandas as pd
import numpy as np
import random

In [None]:
# Seed every RNG this notebook draws from: NumPy's global state drives
# pandas `.sample()` and `np.random.choice`, while the train/test article
# split uses the stdlib `random` module, which NumPy's seed does not touch.
np.random.seed(1895)
random.seed(1895)

In [None]:
# Load the raw stance CSV into a DataFrame; all later cells derive from it.
stance = pd.read_csv(filepath_or_buffer="data/raw/ukraine/stanced.csv")

In [None]:
# Split at the article level: 294 randomly drawn ids form the training pool,
# every remaining id goes to the test set.
# `random.sample` requires a sequence (passing a set was deprecated in
# Python 3.9 and raises TypeError from 3.11); sorting also gives the
# population a stable order so a seeded draw is repeatable.
articles_train = random.sample(sorted(set(stance.id)), 294)
# Boolean row mask: True for rows whose article id is in the training pool.
idx = stance.id.isin(articles_train)

In [None]:
# Missing context is replaced by an empty string so the string
# concatenation in the following cells does not propagate NaN.
for context_col in ("context_before", "context_after"):
    stance[context_col] = stance[context_col].fillna("")

In [None]:
# Training frame: rows selected by `idx`. The model input is the target
# sentence followed by its preceding and following context, each separated
# by " </s> ".
train_rows = stance[idx]
train_text = (
    train_rows.target_sentence
    + " </s> "
    + train_rows.context_before
    + " </s> "
    + train_rows.context_after
)
train = pd.DataFrame({"text": train_text, "label": train_rows.gold_label})

In [None]:
# Test frame: the complement of `idx`, built with the same
# "target </s> before </s> after" text layout as the training frame.
test_rows = stance[~idx]
test_text = (
    test_rows.target_sentence
    + " </s> "
    + test_rows.context_before
    + " </s> "
    + test_rows.context_after
)
test = pd.DataFrame({"text": test_text, "label": test_rows.gold_label})

In [None]:
# Persist the split as headerless two-column CSVs (text, label).
# `to_csv` does not create missing folders, so make sure the test output
# directory exists first; exist_ok keeps the cell re-runnable.
os.makedirs("data/test/ukraine", exist_ok=True)
train.to_csv("data/raw/ukraine/stanced_train.csv", header = False, index = False)
test.to_csv("data/raw/ukraine/stanced_test.csv", header = False, index = False)
# The test set is additionally written to the location of the evaluation data.
test.to_csv("data/test/ukraine/test.csv", header = False, index = False)

# First Setting ("equal")
### sample 5 subsamples with n = 10, 100, 250 and equal inclusion probability for each label

In [None]:
# "Equal" setting: for each of 5 repetitions draw nested training sets of
# size 10, 100 and 250 (each larger set contains the smaller one) with the
# same number of rows per label, up to one row of rounding.
#
# Each stage is (cumulative size, rows added per regular label, rows added
# per oversampled label). The totals only add up to 10/100/250 when there
# are exactly four labels -- assumed from the numbers, TODO confirm.
EQUAL_STAGES = ((10, 2, 3), (100, 23, 22), (250, 38, 37))

for s in range(1, 6):
    for size in (stage[0] for stage in EQUAL_STAGES):
        # exist_ok: re-running the notebook must not fail on existing folders.
        os.makedirs(f"data/train/ukraine/equal/{size}/{s}", exist_ok=True)

    # Fresh pool per repetition; drawn rows are removed from it below so the
    # later stages only add rows that are not yet part of the sample.
    train = pd.read_csv("data/raw/ukraine/stanced_train.csv", header=None, names=("text", "label"))

    # Two randomly chosen labels receive the extra row in the 10-shot set
    # and, to compensate, one row fewer in each later stage.
    oversample = np.random.choice(train.label.unique(), 2, False)
    oversample_labels = list(oversample)

    drawn = []  # per-label draws, in drawing order
    for size, n_regular, n_over in EQUAL_STAGES:
        # Ordered list instead of a set: set iteration order (hash-dependent
        # for string labels) would change the order in which the RNG is
        # consumed and make the seeded samples non-reproducible.
        regular_labels = [label for label in train.label.unique()
                          if label not in oversample_labels]
        quotas = ([(label, n_regular) for label in regular_labels]
                  + [(label, n_over) for label in oversample_labels])
        for label, n in quotas:
            train_add = train[train.label == label].sample(n=n)
            drawn.append(train_add)
            train = train.drop(train_add.index)
        # DataFrame.append was removed in pandas 2.0; pd.concat yields the
        # same rows in the same order.
        train_shot = pd.concat(drawn)
        train_shot.to_csv(f"data/train/ukraine/equal/{size}/{s}/train.csv", header = False, index = False)

# Second Setting ("stratified")
### sample 5 stratified(!) subsamples with n = 10, 100, 250 and inclusion probability equal to true label distribution

In [None]:
# Per-label row counts for the stratified setting, derived from the label
# distribution of the full dataset (train and test together).
train = pd.read_csv("data/raw/ukraine/stanced_train.csv", header=None, names=("text", "label"))
tab = pd.read_csv("data/raw/ukraine/stanced.csv").gold_label.value_counts()
rel = tab / tab.sum()  # relative label frequencies
# The counts are incremental: tab10 rows for the 10-shot set, tab90 more to
# reach 100, tab150 more to reach 250.
# NOTE(review): each share is rounded independently, so the counts are not
# guaranteed to sum to exactly 10/100/250, and a rare label may round to 0
# in tab10 -- confirm against the actual label distribution.
tab10 = (rel * 10).round()
tab90 = (rel * 100).round() - tab10
tab150 = (rel * 250).round() - tab90 - tab10

In [None]:
# "Stratified" setting: for each of 5 repetitions draw nested training sets
# of size 10, 100 and 250 whose per-label counts follow tab10/tab90/tab150
# (the incremental per-label quotas computed in the previous cell).
STRATIFIED_STAGES = ((10, tab10), (100, tab90), (250, tab150))

for s in range(1, 6):
    for size in (stage[0] for stage in STRATIFIED_STAGES):
        # exist_ok: re-running the notebook must not fail on existing folders.
        os.makedirs(f"data/train/ukraine/stratified/{size}/{s}", exist_ok=True)

    # Fresh pool per repetition; drawn rows are removed from it so each
    # stage only adds rows that are not yet part of the sample.
    train = pd.read_csv("data/raw/ukraine/stanced_train.csv", header=None, names=("text", "label"))

    drawn = []  # per-label draws, in drawing order
    for size, quota in STRATIFIED_STAGES:
        # Labels are taken from the remaining pool at the start of the stage.
        for label in train.label.unique():
            train_add = train[train.label == label].sample(n=int(quota[label]))
            drawn.append(train_add)
            train = train.drop(train_add.index)
        # DataFrame.append was removed in pandas 2.0; pd.concat yields the
        # same rows in the same order.
        train_shot = pd.concat(drawn)
        train_shot.to_csv(f"data/train/ukraine/stratified/{size}/{s}/train.csv", header = False, index = False)

# Third Setting ("random")
### sample 5 random subsamples with n = 100, 250 (inclusion probability equal to true label distribution)
We leave out n=10 because we want to make sure that each label occurs at least once in the training data.

In [None]:
# "Random" setting: for each of 5 repetitions draw a simple random sample of
# 100 rows, then extend it with 150 further rows to a nested sample of 250.
for s in range(1, 6):
    for size in (100, 250):
        # exist_ok: re-running the notebook must not fail on existing folders.
        os.makedirs(f"data/train/ukraine/random/{size}/{s}", exist_ok=True)

    train = pd.read_csv("data/raw/ukraine/stanced_train.csv", header=None, names=("text", "label"))

    train_shot = train.sample(n=100)
    # Remove the drawn rows so the extension cannot pick them a second time.
    train = train.drop(train_shot.index)
    train_shot.to_csv(f"data/train/ukraine/random/100/{s}/train.csv", header = False, index = False)

    # DataFrame.append was removed in pandas 2.0; pd.concat yields the same
    # rows in the same order.
    train_shot = pd.concat([train_shot, train.sample(n=150)])
    train_shot.to_csv(f"data/train/ukraine/random/250/{s}/train.csv", header = False, index = False)