# Test sampler

Sample a big dataset to create small(er) datasets. Outputs $n$ pairs of {train, test} datasets

Each dataset (train or test) is stored as a folder with parts of a parquet file, located on HDFS at `OUTPUT_PATH`.

To merge the parts into a single file and bring it to the local filesystem, run the last cell of the notebook.

Run this notebook on the cluster.

In [6]:
from pyspark.sql.functions import lit
import os

In [7]:
features = [
    # Tweet features
    "text_tokens",      # List[long]    Ordered list of Bert ids corresponding to Bert tokenization of Tweet text
    "hashtags",         # List[string]  Tab separated list of hastags (identifiers) present in the tweet
    "tweet_id",         # String        Tweet identifier (unique)
    "present_media",    # List[String]  Tab separated list of media types. Media type can be in (Photo, Video, Gif)
    "present_links",    # List[string]  Tab separated list of links (identifiers) included in the Tweet
    "present_domains",  # List[string]  Tab separated list of domains included in the Tweet (twitter.com, dogs.com)
    "tweet_type",       # String        Tweet type, can be either Retweet, Quote, Reply, or Toplevel
    "language",         # String        Identifier corresponding to the inferred language of the Tweet
    "tweet_timestamp",  # Long          Unix timestamp, in sec of the creation time of the Tweet
    
    # Engaged-with User (i.e., Engagee) Features
    "engaged_with_user_id",                 # String    User identifier
    "engaged_with_user_follower_count",     # Long      Number of followers of the user
    "engaged_with_user_following_count",    # Long      Number of accounts the user is following
    "engaged_with_user_is_verified",        # Bool      Is the account verified?
    "engaged_with_user_account_creation",   # Long      Unix timestamp, in seconds, of the creation time of the account
    
    # Engaging User (i.e., Engager) Features
    "engaging_user_id",                     # String    User identifier   
    "engaging_user_follower_count",         # Long      Number of followers of the user
    "engaging_user_following_count",        # Long      Number of accounts the user is following
    "engaging_user_is_verified",            # Bool      Is the account verified?
    "engaging_user_account_creation",       # Long      Unix timestamp, in seconds, of the creation time of the account
    
    # Engagement features
    "engagee_follows_engager"   # Bool  Engagee follows engager?
]

features_idx = dict(zip(features, range(len(features))))

labels_idx = {
    # Engagement features (cont.)
    "reply_timestamp": 20,                  # Long      Unix timestamp (in seconds) of one of the replies, if there is at least one
    "retweet_timestamp": 21,                # Long      Unix timestamp (in seconds) of the retweet by the engaging user, if there is at least one
    "retweet_with_comment_timestamp": 22,   # Long      Unix timestamp (in seconds) of one of the retweet with comment by the engaging user, if there is at least one
    "like_timestamp": 23                    # Long      Unix timestamp (in seconds) of the like by the engaging user, if they liked the tweet
}

In [8]:
INPUT_PATH = "hdfs://BigDataHA/user/s277309/recsys_data/"

OUTPUT_PATH = "recsys_data_sample_generated"

# Filter dataset Full for tweets before a given time in the middle,
# then from it sample one dataset A, save it. Extract list of
# engaging users from dataset A. Filter Full data for tweets
# AFTER the given time in the middle. Use this filtered dataset
# to further apply a filter and only get rows where engaging user
# is inside engaging users list. Then sample from this dataset
# the test set. End of story
# 5x 1mil train, 100k test

# Timestamp to split between training set and test set
SPLIT_TIMESTAMP = 1613602800  # 18 Feb 2021 (max: 1614211199 min: 1612396800)

# Number of rows to extract from the full dataset.
# TRAIN_ROWS (training set) and TEST_ROWS (test set)
TRAIN_ROWS = 10000
TEST_ROWS = 2000

# Number of {train, test} pairs to extract
FOLDS = 5

In [9]:
%%time

# Read data
lines_rdd = sc.textFile(INPUT_PATH)

# Split each line
# Fields in each data entry are separated by the 1 character (0x31 in UTF-8).
# https://recsys-twitter.com/code/snippets
fields_rdd = lines_rdd.map(lambda line: line.strip().split("\x01"))

# Assign schema
schema = features + list(labels_idx)  # Column names

# Create Spark DFs and cache
df = spark.createDataFrame(fields_rdd, schema)
df_train = df.filter(df.tweet_timestamp < SPLIT_TIMESTAMP).cache()
df_test = df.filter(df.tweet_timestamp > SPLIT_TIMESTAMP).cache()

TOTAL_ROWS_TRAIN = df_train.count()
TOTAL_ROWS_TEST = df_test.count()

print(f"Dataset ready, train rows: {TOTAL_ROWS_TRAIN} test rows: {TOTAL_ROWS_TEST}.")

Dataset ready, train rows: 497674007 test rows: 250014492.
CPU times: user 664 ms, sys: 466 ms, total: 1.13 s
Wall time: 2min 9s


In [10]:
%%time

for i in range(FOLDS):
    OUTPUT_DIR = f"sample{i+1}"
    SEED = 153
    
    # Delete output directory if it already exists
    !hdfs dfs -rm -r {OUTPUT_PATH}/{OUTPUT_DIR}
    !hdfs dfs -rm -r {OUTPUT_PATH}/{OUTPUT_DIR}_test
    
    # Sample train dataset
    df_train_sample = df_train.sample(withReplacement=False, fraction=TRAIN_ROWS/TOTAL_ROWS_TRAIN, seed=42+i+SEED)
    
    # Extract list of engaging users
    df_engaging_users = df_train_sample.select("engaging_user_id").distinct()
    
    # Get test dataset with only engaging_users inside df_engaging_users, and sample from it
    df_test_filtered = df_test.join(df_engaging_users, on="engaging_user_id", how="left_semi")
    TOTAL_ROWS_TEST_FILTERED = df_test_filtered.count()
    df_test_sample = df_test_filtered.sample(withReplacement=False, fraction=TEST_ROWS/TOTAL_ROWS_TEST_FILTERED, seed=7357+i+SEED)
    df_test_sample = df_test_sample.select(*schema)  # Reorder columns
    
    # + a small amount of random engaging users for checking purposes
    df_test_additional = df_test.sample(withReplacement=False, fraction=2000/TOTAL_ROWS_TEST, seed=466+i+SEED)
    df_test_additional = df_test_additional.select(*schema)  # Reorder columns
    df_test_sample = df_test_sample.union(df_test_additional)
    
    # Remove duplicate data points
    df_test_sample = df_test_sample.distinct()
    
    # Reorder columns
    df_test_sample = df_test_sample.select(*schema)
    
    # Save everything to disk
    df_train_sample.write.csv(os.path.join(OUTPUT_PATH, OUTPUT_DIR), sep="\x01", header=False)
    df_test_sample.write.csv(os.path.join(OUTPUT_PATH, OUTPUT_DIR + "_test"), sep="\x01", header=False)
    !hdfs dfs -getmerge {OUTPUT_PATH}/{OUTPUT_DIR} ~/recsys-2021/data/raw/{OUTPUT_DIR}
    !hdfs dfs -getmerge {OUTPUT_PATH}/{OUTPUT_DIR}_test ~/recsys-2021/data/raw/{OUTPUT_DIR}_test

21/06/07 10:18:27 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s277596/recsys_data_sample_generated/sample1' to trash at: hdfs://BigDataHA/user/s277596/.Trash/Current/user/s277596/recsys_data_sample_generated/sample1
21/06/07 10:18:30 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s277596/recsys_data_sample_generated/sample1_test' to trash at: hdfs://BigDataHA/user/s277596/.Trash/Current/user/s277596/recsys_data_sample_generated/sample1_test
21/06/07 10:19:12 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s277596/recsys_data_sample_generated/sample2' to trash at: hdfs://BigDataHA/user/s277596/.Trash/Current/user/s277596/recsys_data_sample_generated/sample2
21/06/07 10:19:15 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s277596/recsys_data_sample_generated/sample2_test' to trash at: hdfs://BigDataHA/user/s277596/.Trash/Current/user/s277596/recsys_data_sample_generated/sample2_test
21/06/07 10:19:53 INFO fs.TrashPolicyDefault: Moved: 'hdfs:/