# RecSys - Data exploration

In [1]:
import os


# Location of the raw training data. The default is the original cluster path;
# set the RECSYS_INPUT_PATH environment variable to point the notebook at
# another copy (e.g. a sampled subset) without editing this cell.
INPUT_PATH = os.environ.get(
    "RECSYS_INPUT_PATH", "hdfs://BigDataHA/user/s277309/recsys_data/"
)

# Names of the feature columns, in the order they are indexed within a record.
features = [
    # Tweet features
    "text_tokens",      # List[long]    Ordered list of Bert ids corresponding to Bert tokenization of Tweet text
    "hashtags",         # List[string]  Tab separated list of hashtags (identifiers) present in the tweet
    "tweet_id",         # String        Tweet identifier (unique)
    "present_media",    # List[String]  Tab separated list of media types. Media type can be in (Photo, Video, Gif)
    "present_links",    # List[string]  Tab separated list of links (identifiers) included in the Tweet
    "present_domains",  # List[string]  Tab separated list of domains included in the Tweet (twitter.com, dogs.com)
    "tweet_type",       # String        Tweet type, can be either Retweet, Quote, Reply, or Toplevel
    "language",         # String        Identifier corresponding to the inferred language of the Tweet
    "tweet_timestamp",  # Long          Unix timestamp, in sec of the creation time of the Tweet

    # Engaged-with User (i.e., Engagee) Features
    "engaged_with_user_id",                 # String    User identifier
    "engaged_with_user_follower_count",     # Long      Number of followers of the user
    "engaged_with_user_following_count",    # Long      Number of accounts the user is following
    "engaged_with_user_is_verified",        # Bool      Is the account verified?
    "engaged_with_user_account_creation",   # Long      Unix timestamp, in seconds, of the creation time of the account

    # Engaging User (i.e., Engager) Features
    "engaging_user_id",                     # String    User identifier
    "engaging_user_follower_count",         # Long      Number of followers of the user
    "engaging_user_following_count",        # Long      Number of accounts the user is following
    "engaging_user_is_verified",            # Bool      Is the account verified?
    "engaging_user_account_creation",       # Long      Unix timestamp, in seconds, of the creation time of the account

    # Engagement features
    "engagee_follows_engager"   # Bool  Engagee follows engager?
]

# Feature name -> position of that field within a split record.
features_idx = {name: position for position, name in enumerate(features)}

# Names of the label columns, in order. They are indexed immediately after the
# feature columns.
labels = [
    # Engagement features (cont.)
    "reply_timestamp",                  # Long      Unix timestamp (in seconds) of one of the replies, if there is at least one
    "retweet_timestamp",                # Long      Unix timestamp (in seconds) of the retweet by the engaging user, if there is at least one
    "retweet_with_comment_timestamp",   # Long      Unix timestamp (in seconds) of one of the retweet with comment by the engaging user, if there is at least one
    "like_timestamp"                    # Long      Unix timestamp (in seconds) of the like by the engaging user, if they liked the tweet
]

# Label name -> position of that field within a split record. The positions
# are derived from len(features) (20..23 with the lists above) instead of
# being hard-coded, so they cannot drift if a feature is added or removed.
labels_idx = {name: len(features) + offset for offset, name in enumerate(labels)}

**Additional notes regarding the dataset.**

* **Negative samples** - We [the authors of the challenge] also wanted to give examples of negative interactions (i.e., this user did not engage with this item), but disclosing this information will create a privacy leak. Negative examples are items the user might have seen but not engaged with. However, a set of such examples would reveal what content was seen by users — this is private information. To get around this, we created the pseudo-negative dataset as follows: for each user we considered all the Tweets that were created by their followers in the considered timeframe and removed the positive examples (i.e., the Tweets that were engaged with). We sampled from the set of remaining Tweets, which does not distinguish between negative examples (items the user saw and did not engage with) and items the user did not see, thereby effectively protecting this private information.

## Data preparation

In [2]:
# Read data: load the text under INPUT_PATH as an RDD of lines.
# `sc` is not defined in the cells above — presumably the SparkContext that
# the notebook kernel provides; confirm before running on a fresh kernel.
lines_rdd = sc.textFile(INPUT_PATH)

In [3]:
# # Sample data: take a ~100MB and ~1GB random sample from the whole dataset
# # (left commented out; uncomment to regenerate the samples. No seed is passed
# # to sample(), so each run draws a different sample.)
# SAVE_PATH = "recsys_sampled_data"

# hundred_mb_sample_fraction = 0.0003
# thousand_mb_sample_fraction = 0.003

# hundred_mb_sample = lines_rdd.sample(withReplacement=False, fraction=hundred_mb_sample_fraction)
# thousand_mb_sample = lines_rdd.sample(withReplacement=False, fraction=thousand_mb_sample_fraction)

# hundred_mb_sample.saveAsTextFile(os.path.join(SAVE_PATH, "sample_100mb"))
# thousand_mb_sample.saveAsTextFile(os.path.join(SAVE_PATH, "sample_1gb"))

In [4]:
# Count total data points (one per element of lines_rdd).
# total_count is the denominator of every percentage printed further down,
# so this cell must run before any of the percentage cells.
total_count = lines_rdd.count()

print(f"Total number of data points in the training set: {total_count}")

Total number of data points in the training set: 747694282


In [6]:
# Split each line into its list of fields.
# Fields in each data entry are separated by the "\x01" control character
# (byte 0x01, ASCII SOH) — not by the digit "1", which would be 0x31.
# https://recsys-twitter.com/code/snippets
# The split RDD is cached because every analysis cell below filters it again.

fields = lines_rdd.map(lambda line: line.split("\x01")).cache()

## Tweet analysis

**Share of data points whose tweet has at least one hashtag** (%)

In [7]:
# Keep the data points whose "hashtags" field is non-empty, then count them.
hashtags_position = features_idx["hashtags"]
hashtag_lines = fields.filter(lambda record: len(record[hashtags_position]) > 0)
hashtag_lines_count = hashtag_lines.count()

In [8]:
# Data points with a non-empty hashtags field, as a percentage of all data points.
print(f"Data points containing one or more hashtags: {hashtag_lines_count/total_count*100}%")

Data points containing one or more hashtags: 19.997261661551665%


## Interaction analysis

**Amount of negative samples**, i.e., data points where all labels are empty, non-interactions (%)

In [9]:
def non_interaction_filter(fields):
    """Tell whether a record carries no engagement at all.

    Args:
        fields: one split record, indexed by position. The four label
            positions are looked up in ``labels_idx``.

    Returns:
        True when the reply, retweet, retweet-with-comment and like fields
        all have length zero; False as soon as one of them is non-empty.
    """
    label_names = (
        "reply_timestamp",
        "retweet_timestamp",
        "retweet_with_comment_timestamp",
        "like_timestamp",
    )
    # all() stops at the first non-empty label, checking them in the order above.
    return all(len(fields[labels_idx[name]]) == 0 for name in label_names)

# Keep only the records for which non_interaction_filter is true (no reply,
# retweet, retweet with comment or like), then count them.
negative_samples = fields.filter(non_interaction_filter)
negative_samples_count = negative_samples.count()

In [11]:
# Records with all four labels empty, as a percentage of all data points.
print(f"Data points corresponding to no interaction (negative samples): {negative_samples_count/total_count*100}%")

Data points corresponding to no interaction (negative samples): 50.25544651684256%


**Share of data points with each type of interaction: reply, retweet, retweet with comment, like** (%)

In [12]:
def _has_interaction(label_name):
    """Build a predicate that is true for records whose `label_name` field is non-empty."""
    position = labels_idx[label_name]
    return lambda record: len(record[position]) > 0


# One filtered RDD per interaction type, each followed by its count.
reply_interactions = fields.filter(_has_interaction("reply_timestamp"))
reply_interactions_count = reply_interactions.count()

retweet_interactions = fields.filter(_has_interaction("retweet_timestamp"))
retweet_interactions_count = retweet_interactions.count()

retweet_comment_interactions = fields.filter(_has_interaction("retweet_with_comment_timestamp"))
retweet_comment_interactions_count = retweet_comment_interactions.count()

like_interactions = fields.filter(_has_interaction("like_timestamp"))
like_interactions_count = like_interactions.count()

In [13]:
# Note that these data points may contain any combination of these four interactions,
# so the four percentages are not mutually exclusive: a record with both a like
# and a retweet is counted in both lines.
print(f"Data points containing a reply interaction: {reply_interactions_count/total_count*100}%")
print(f"Data points containing a retweet interaction: {retweet_interactions_count/total_count*100}%")
print(f"Data points containing a retweet with comment interaction: {retweet_comment_interactions_count/total_count*100}%")
print(f"Data points containing a like interaction: {like_interactions_count/total_count*100}%")

Data points containing a reply interaction: 2.903448845687441%
Data points containing a retweet interaction: 8.75229630283571%
Data points containing a retweet with comment interaction: 0.7013417283295474%
Data points containing a like interaction: 39.7228055837934%
