# RecSys - Graphical analysis

In [1]:
import os

from graphframes import GraphFrame
from pyspark.sql.functions import lit


INPUT_PATH = "hdfs://BigDataHA/user/s277309/recsys_data/"
CHECKPOINT_PATH = "recsys_ckpt"  # This will be created on your HDFS filesystem

features = [
    # Tweet features
    "text_tokens",      # List[long]    Ordered list of Bert ids corresponding to Bert tokenization of Tweet text
    "hashtags",         # List[string]  Tab separated list of hastags (identifiers) present in the tweet
    "tweet_id",         # String        Tweet identifier (unique)
    "present_media",    # List[String]  Tab separated list of media types. Media type can be in (Photo, Video, Gif)
    "present_links",    # List[string]  Tab separated list of links (identifiers) included in the Tweet
    "present_domains",  # List[string]  Tab separated list of domains included in the Tweet (twitter.com, dogs.com)
    "tweet_type",       # String        Tweet type, can be either Retweet, Quote, Reply, or Toplevel
    "language",         # String        Identifier corresponding to the inferred language of the Tweet
    "tweet_timestamp",  # Long          Unix timestamp, in sec of the creation time of the Tweet
    
    # Engaged-with User (i.e., Engagee) Features
    "engaged_with_user_id",                 # String    User identifier
    "engaged_with_user_follower_count",     # Long      Number of followers of the user
    "engaged_with_user_following_count",    # Long      Number of accounts the user is following
    "engaged_with_user_is_verified",        # Bool      Is the account verified?
    "engaged_with_user_account_creation",   # Long      Unix timestamp, in seconds, of the creation time of the account
    
    # Engaging User (i.e., Engager) Features
    "engaging_user_id",                     # String    User identifier   
    "engaging_user_follower_count",         # Long      Number of followers of the user
    "engaging_user_following_count",        # Long      Number of accounts the user is following
    "engaging_user_is_verified",            # Bool      Is the account verified?
    "engaging_user_account_creation",       # Long      Unix timestamp, in seconds, of the creation time of the account
    
    # Engagement features
    "engagee_follows_engager"   # Bool  Does the account of the engaged-with tweet author follow the account that has made the engagement?
]

features_idx = dict(zip(features, range(len(features))))

labels_idx = {
    # Engagement features (cont.)
    "reply_timestamp": 20,                  # Long      Unix timestamp (in seconds) of one of the replies, if there is at least one
    "retweet_timestamp": 21,                # Long      Unix timestamp (in seconds) of the retweet by the engaging user, if there is at least one
    "retweet_with_comment_timestamp": 22,   # Long      Unix timestamp (in seconds) of one of the retweet with comment by the engaging user, if there is at least one
    "like_timestamp": 23                    # Long      Unix timestamp (in seconds) of the like by the engaging user, if they liked the tweet
}

In [None]:
# In Twitter Spaces they mentioned this year they added more "interconnected"
# data points to promote graph-based algorithms!

## Graph preparation

Our **graph representation** represents vertices as users $u_i \in U$, and edges as engagements. An edge $(u_i, u_j) \in E$ corresponds to the fact that user $u_i$ engaged with user $u_j$'s tweet in a specific way. If $u_i$ engaged with user $u_j$'s tweet in multiple ways (e.g., a reply and a retweet), there will be *two* edges characterized by two different types (multigraph).

- Attributes of **users** $u_i$
    - `user_id` (uniquely identifies the user)
    - `follower_count`
    - `following_count`
    - `is_verified`
    - `account_creation`

    
- Attributes of **edges** $(u_i, u_j)$
    - `interaction_type` (either 1, 2, 3, 4 corresponding respectively to "reply", "retweet", "retweet_with_comment", "like")
    - `interaction_timestamp`
    - `engagee_follows_engager` (true if $u_j$ follows $u_i$, i.e., if the engaged-with user follows the user that made the engagement, false otherwise)
    - All tweet features (see above)

In [2]:
# Read data
lines_rdd = sc.textFile(INPUT_PATH)

# Split each line
# Fields in each data entry are separated by the 1 character (0x31 in UTF-8).
# https://recsys-twitter.com/code/snippets
fields_rdd = lines_rdd.map(lambda line: line.strip().split("\x01"))

# Filter out negative samples, i.e., non-interaction samples. Only get *positive* samples.
def interaction_filter(line):
    if len(line[labels_idx["reply_timestamp"]]) == 0 and \
        len(line[labels_idx["retweet_timestamp"]]) == 0 and \
        len(line[labels_idx["retweet_with_comment_timestamp"]]) == 0 and \
        len(line[labels_idx["like_timestamp"]]) == 0:
        return False
    return True

positives_rdd = fields_rdd.filter(interaction_filter)

# Move over to DataFrame interface
schema = features + list(labels_idx)  # Column names
positives_df = spark.createDataFrame(positives_rdd, schema).cache()

In [3]:
positives_df.printSchema()

root
 |-- text_tokens: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- present_media: string (nullable = true)
 |-- present_links: string (nullable = true)
 |-- present_domains: string (nullable = true)
 |-- tweet_type: string (nullable = true)
 |-- language: string (nullable = true)
 |-- tweet_timestamp: string (nullable = true)
 |-- engaged_with_user_id: string (nullable = true)
 |-- engaged_with_user_follower_count: string (nullable = true)
 |-- engaged_with_user_following_count: string (nullable = true)
 |-- engaged_with_user_is_verified: string (nullable = true)
 |-- engaged_with_user_account_creation: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- engaging_user_follower_count: string (nullable = true)
 |-- engaging_user_following_count: string (nullable = true)
 |-- engaging_user_is_verified: string (nullable = true)
 |-- engaging_user_account_creation: string (nullable = true)
 |-- engag

In [4]:
# Create vertices DataFrame (users)
engaged_with_users_df = positives_df.selectExpr("engaged_with_user_id AS id",  # Required column
                                                "engaged_with_user_follower_count AS follower_count",
                                                "engaged_with_user_following_count AS following_count",
                                                "engaged_with_user_is_verified AS is_verified",
                                                "engaged_with_user_account_creation AS account_creation")

engaging_users_df = positives_df.selectExpr("engaging_user_id AS id",  # Required column
                                            "engaging_user_follower_count AS follower_count",
                                            "engaging_user_following_count AS following_count",
                                            "engaging_user_is_verified AS is_verified",
                                            "engaging_user_account_creation AS account_creation")

union_users_df = engaged_with_users_df.union(engaging_users_df)

users_df = union_users_df.distinct()  # Requires shuffle

In [5]:
# Create edges DataFrame (interactions)
# edge: engaging --> engaged_with
interactions_df = positives_df.selectExpr( "engaging_user_id AS src",  # Required column
                                           "engaged_with_user_id AS dst",  # Required column
                                           "engagee_follows_engager AS dst_follows_src",
                                           "text_tokens",
                                           "hashtags",
                                           "tweet_id",
                                           "present_media",
                                           "present_links",
                                           "present_domains",
                                           "tweet_type",
                                           "language",
                                           "tweet_timestamp",
                                           "reply_timestamp",
                                           "retweet_timestamp",
                                           "retweet_with_comment_timestamp",
                                           "like_timestamp")

# Each dataframe contains only an interaction type.
reply_interactions_df = interactions_df.where("LENGTH(reply_timestamp) > 0")
retweet_interactions_df = interactions_df.where("LENGTH(retweet_timestamp) > 0")
retweet_with_comment_interactions_df = interactions_df.where("LENGTH(retweet_with_comment_timestamp) > 0")
like_interactions_df = interactions_df.where("LENGTH(like_timestamp) > 0")

# Adapt all dataframes to a common schema (with `interaction_type`, `interaction_timestamp`)
column_df_dict = { 
    # Associates the column specifying the timestamp of the interaction with the dataframe of that type of interaction
    "reply_timestamp": reply_interactions_df,
    "retweet_timestamp": retweet_interactions_df,
    "retweet_with_comment_timestamp": retweet_with_comment_interactions_df,
    "like_timestamp": like_interactions_df
}

interaction_types = {
    # Associates the column of the timestamp of the interaction with the interaction type id
    "reply_timestamp": 1,
    "retweet_timestamp": 2,
    "retweet_with_comment_timestamp": 3,
    "like_timestamp": 4
}

common_df_list = []
for column, df in column_df_dict.items():
    common_schema_df = df.selectExpr("src", "dst",
                                     "dst_follows_src",
                                     "text_tokens",
                                     "hashtags",
                                     "tweet_id",
                                     "present_media",
                                     "present_links",
                                     "present_domains",
                                     "tweet_type",
                                     "language",
                                     "tweet_timestamp",
                                     f"{column} AS interaction_timestamp")
        
    common_schema_with_type_df = common_schema_df.withColumn("interaction_type", lit(interaction_types[column]))
    
    common_df_list.append(common_schema_with_type_df)

# Concatenate interaction dataframes into a single one (final edges DataFrame)
# Note that a single tweet may appear multiple times, if there is more than one type of interaction.
edges_df = common_df_list[0].union(common_df_list[1]).union(common_df_list[2]).union(common_df_list[3])

In [6]:
users_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- follower_count: string (nullable = true)
 |-- following_count: string (nullable = true)
 |-- is_verified: string (nullable = true)
 |-- account_creation: string (nullable = true)



In [7]:
edges_df.printSchema()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- dst_follows_src: string (nullable = true)
 |-- text_tokens: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- present_media: string (nullable = true)
 |-- present_links: string (nullable = true)
 |-- present_domains: string (nullable = true)
 |-- tweet_type: string (nullable = true)
 |-- language: string (nullable = true)
 |-- tweet_timestamp: string (nullable = true)
 |-- interaction_timestamp: string (nullable = true)
 |-- interaction_type: integer (nullable = false)



In [8]:
# Create the graph
g = GraphFrame(users_df, edges_df).cache()

**Basic graph statistics**

In [9]:
n_users = g.vertices.count()
n_users

41404456

In [10]:
g.dropIsolatedVertices().vertices.count()  # There are *no* isolated vertices (makes sense, as the database is built on interacting vertices)

41404456

In [11]:
n_edges = g.edges.count()
n_edges

389398378

## Graph analysis

**In- and out-degree statistics**

In [12]:
in_degrees_df = g.inDegrees
out_degrees_df = g.outDegrees

in_degrees_max = in_degrees_df.agg({"inDegree": "max"})
in_degrees_min = in_degrees_df.agg({"inDegree": "min"})
in_degrees_avg = in_degrees_df.agg({"inDegree": "avg"})
in_degrees_stddev = in_degrees_df.agg({"inDegree": "stddev"})

out_degrees_max = out_degrees_df.agg({"outDegree": "max"})
out_degrees_min = out_degrees_df.agg({"outDegree": "min"})
out_degrees_avg = out_degrees_df.agg({"outDegree": "avg"})
out_degrees_stddev = out_degrees_df.agg({"outDegree": "stddev"})

In [13]:
print(in_degrees_max.collect())
print(in_degrees_min.collect())
print(in_degrees_avg.collect())
print(in_degrees_stddev.collect())

[Row(max(inDegree)=390416)]
[Row(min(inDegree)=1)]
[Row(avg(inDegree)=21.594890696345512)]
[Row(stddev(inDegree)=425.1624978474091)]


In [14]:
print(out_degrees_max.collect())
print(out_degrees_min.collect())
print(out_degrees_avg.collect())
print(out_degrees_stddev.collect())

[Row(max(outDegree)=19833)]
[Row(min(outDegree)=1)]
[Row(avg(outDegree)=10.701041740035109)]
[Row(stddev(outDegree)=55.199456839697945)]


Should take into account supernodes!

**Connected components**

In [15]:
sc.setCheckpointDir(CHECKPOINT_PATH)

In [16]:
# Returns a dataframe that contains one row for each user,
# with an additional column "component" as the unique identifier
# of the connected component assigned to the user
# Note: exclude isolated nodes from the computation
connected_components_df = g.connectedComponents()

In [17]:
n_connected_components = connected_components_df.select("component").distinct().count()
print(f"Number of connected components (excluding isolated nodes): {n_connected_components}")

Number of connected components (excluding isolated nodes): 234123


In [18]:
print(f"Average number of nodes in each connected component: {n_users/n_connected_components}")

Average number of nodes in each connected component: 176.84916048401908
