# Random Sampling for Annotation

In [22]:
import pandas as pd
import os
import requests
import numpy as np

# Seed passed as `random_state` to every sampling call so the draws are reproducible.
SEED = 42
# Total number of overlapping tweets; divided evenly across the player files.
MAX_SAMPLE_SIZE = 500
N_ANNOTATION_ROUNDS = 2

N_ANNOTATORS = 4
# Number of individual (non-overlapping) tweets per annotator, summed over all rounds.
N_SAMPLES_PER_ANNOTATOR = 100
# Derived from N_ANNOTATORS instead of repeating the literal 4, so the two cannot drift apart.
TOTAL_N_INDIVIDUAL_SAMPLES = N_SAMPLES_PER_ANNOTATOR * N_ANNOTATORS

Set up helper functions for sampling tweets, building the URL of each tweet, and splitting a sample into equal parts.

In [23]:
def get_url(tweet_id: str) -> str:
    """Wrap the status link of a tweet in a minimal HTML page.

    Args:
        tweet_id: Identifier of the tweet; it is interpolated into the link as-is.

    Returns:
        An HTML snippet containing a single anchor labelled ``URL`` whose
        ``href`` is the tweet's status link.
    """
    status_link = 'https://twitter.com/anyuser/status/' + f'{tweet_id}'
    return ''.join(['<html><body><a href=', status_link, '>URL</a></body></html>'])


def sample_tweet(file_path: str, n_samples: int, random_state: int, filter_tweets=None) -> pd.DataFrame:
    """Draw a reproducible random sample of tweets from one CSV file.

    Args:
        file_path: Path to a comma-separated file that contains at least the
            columns ``tweet_id`` and ``player_name``.
        n_samples: Number of rows to draw. ``DataFrame.sample`` raises a
            ``ValueError`` if fewer rows are available.
        random_state: Seed forwarded to ``DataFrame.sample``.
        filter_tweets: Optional collection of tweet IDs that are removed from
            the file's rows before sampling.

    Returns:
        A frame with the columns ``url``, ``tweet_id`` (int64) and
        ``player_name``, indexed by the sampled rows' original index labels.
    """
    df = pd.read_csv(file_path, sep=',', lineterminator='\n')
    if filter_tweets is not None:
        df = df[~df['tweet_id'].isin(filter_tweets)]
    sampled = df.sample(n=n_samples, random_state=random_state)
    # Build the result column-wise instead of with a row-wise `apply`: it avoids
    # creating one Series per row and still yields all three columns when the
    # sample is empty (n_samples == 0), so the `astype` below cannot fail on a
    # missing `url` column.
    result = pd.DataFrame({
        'url': sampled['tweet_id'].map(get_url),
        'tweet_id': sampled['tweet_id'],
        'player_name': sampled['player_name'],
    })
    return result.astype({'url': str, 'tweet_id': 'int64', 'player_name': str})


def split_group(df, N_splits, prefix):
    """Partition the index labels of ``df`` into ``N_splits`` equal, contiguous parts.

    Args:
        df: Frame whose index labels are partitioned; row order is kept.
        N_splits: Number of parts; must divide ``len(df)`` evenly.
        prefix: Column-name prefix; the columns are named ``{prefix}_1`` up to
            ``{prefix}_{N_splits}``.

    Returns:
        A DataFrame with one column per part, each holding that part's index labels.

    Raises:
        ValueError: If ``len(df)`` is not divisible by ``N_splits`` (raised by ``np.split``).
    """
    # Only the index labels are needed, so split them directly. Passing the
    # whole DataFrame to np.split goes through DataFrame.swapaxes, which pandas
    # has deprecated.
    parts = np.split(df.index.to_numpy(), N_splits)
    return pd.DataFrame({f'{prefix}_{i + 1}': part for i, part in enumerate(parts)})


Sample the overlapping set from every player file and concatenate the samples into one DataFrame.

In [24]:
ROOT = '../../data/collected_with_some_processing/tweets/all'
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the concatenated sample is the same on every machine and every run.
PLAYER_FILES = [os.path.join(ROOT, file_name) for file_name in sorted(os.listdir(ROOT))]
# Split the overlapping budget evenly across the player files (any remainder is dropped).
OVERLAPPING_SAMPLES_PER_PLAYER = MAX_SAMPLE_SIZE // len(PLAYER_FILES)

# reset_index() moves each row's label from its source file into an `index`
# column and gives the combined frame a fresh 0..n-1 index.
df_overlapping = pd.concat([
    sample_tweet(file, n_samples=OVERLAPPING_SAMPLES_PER_PLAYER, random_state=SEED)
    for file in PLAYER_FILES
]).reset_index()

Sample the unique set from every player file, excluding tweets that are already in the overlapping set, and concatenate the samples into one DataFrame.

In [25]:
# Even share of the individual-sample budget for each player file.
UNIQUE_SAMPLES_PER_PLAYER = TOTAL_N_INDIVIDUAL_SAMPLES // len(PLAYER_FILES)

# Tweets already drawn for the overlapping set are filtered out of every file
# before sampling, so the two sets cannot share a tweet.
df_unique = pd.concat(
    [
        sample_tweet(
            file,
            n_samples=UNIQUE_SAMPLES_PER_PLAYER,
            random_state=SEED,
            filter_tweets=df_overlapping['tweet_id'],
        )
        for file in PLAYER_FILES
    ]
).reset_index()

# Sanity check: no sampled unique tweet is part of the overlapping set.
assert not df_unique['tweet_id'].isin(df_overlapping['tweet_id']).any()

Split the overlapping data into the two groups used for the two rounds. We do that by grouping by the player and splitting each group into two.

In [26]:
# split_group returns index labels. df_overlapping has a fresh 0..n-1 index
# (from reset_index), so those labels are also valid positions for .iloc.
groups = df_overlapping.groupby(by='player_name').apply(lambda df: split_group(df, N_splits=2, prefix='round'))
group_1 = df_overlapping.iloc[groups['round_1'].values]
group_2 = df_overlapping.iloc[groups['round_2'].values]

# Assert that the grouping was done correctly: every player contributes the
# same number of tweets to each round, and the rounds share no tweet.
expected_per_player = OVERLAPPING_SAMPLES_PER_PLAYER // N_ANNOTATION_ROUNDS
assert (group_1.groupby('player_name')['index'].count() == expected_per_player).all()
assert (group_2.groupby('player_name')['index'].count() == expected_per_player).all()
assert not group_1['tweet_id'].isin(group_2['tweet_id']).any()
group_1_overlapping = group_1
group_2_overlapping = group_2

Now split the unique data into the two rounds, and then split each round into 4 parts, one per annotator.

In [27]:
# First split each player's unique tweets into the two rounds ...
groups = df_unique.groupby(by='player_name').apply(lambda df: split_group(df, N_splits=2, prefix='round'))
group_1 = df_unique.iloc[groups['round_1'].values]
group_2 = df_unique.iloc[groups['round_2'].values]

# ... then split every round, again per player, into one part per annotator.
annotators_group_1 = group_1.groupby(by='player_name').apply(lambda df: split_group(df, N_splits=N_ANNOTATORS, prefix='annotator'))
annotators_group_2 = group_2.groupby(by='player_name').apply(lambda df: split_group(df, N_splits=N_ANNOTATORS, prefix='annotator'))

# The stored values are index labels of df_unique, which equal its row
# positions because df_unique was built with reset_index().
annotators_1 = [df_unique.iloc[annotators_group_1[f'annotator_{i + 1}'].values] for i in range(N_ANNOTATORS)]
annotators_2 = [df_unique.iloc[annotators_group_2[f'annotator_{i + 1}'].values] for i in range(N_ANNOTATORS)]

# Ensure no tweet ID occurs more than once across all annotator samples and
# both overlapping rounds. Collecting the IDs once and comparing against a set
# avoids the quadratic list-membership scan of a row-by-row check.
all_frames = annotators_1 + annotators_2 + [group_1_overlapping, group_2_overlapping]
unique_ids = pd.concat([frame['tweet_id'] for frame in all_frames]).tolist()
assert len(unique_ids) == len(set(unique_ids)), 'a tweet ID was assigned more than once'

Save the samples of both annotation rounds: first the overlapping samples, then the unique samples of each annotator.

In [28]:
# to_csv does not create missing directories, so make sure the round folders
# exist; otherwise this cell fails on a fresh checkout.
for round_dir in ('round-1', 'round-2'):
    os.makedirs(round_dir, exist_ok=True)

group_1_overlapping.to_csv(f'round-1/samples_seed-{SEED}.csv', sep=',', index=False)
group_2_overlapping.to_csv(f'round-2/samples_seed-{SEED}.csv', sep=',', index=False)

In [29]:
initials = ['daen', 'miim', 'beke', 'toap']
# zip() stops at the shorter input, so a length mismatch would silently skip
# an annotator's samples; fail loudly instead.
assert len(initials) == N_ANNOTATORS

# Round 1 and round 2 are written the same way, one file per annotator.
for round_number, annotator_frames in enumerate((annotators_1, annotators_2), start=1):
    for initial, df in zip(initials, annotator_frames):
        # to_csv does not create missing directories.
        os.makedirs(os.path.join(f'round-{round_number}', initial), exist_ok=True)
        df.to_csv(f'round-{round_number}/{initial}/samples_seed-{SEED}.csv', sep=',', index=False)
