In [41]:
import os
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from pyspark.sql.types import IntegerType
from pyspark.sql import Row


# TEST switches the input from the full dataset on HDFS to a small local sample.
TEST = True
INPUT_PATH = (
    "recsys_data_sample_generated/sample_0.0003"
    if TEST
    else "hdfs://BigDataHA/user/s277309/recsys_data/"
)

# Names of the input features, listed in column order.
# Each comment reads: <type> - <description>.
features = [
    # --- Tweet features ---
    "text_tokens",      # List[long]   - ordered Bert ids from the Bert tokenization of the Tweet text
    "hashtags",         # List[string] - tab-separated hashtags (identifiers) present in the Tweet
    "tweet_id",         # String       - Tweet identifier (unique)
    "present_media",    # List[String] - tab-separated media types; each one of (Photo, Video, Gif)
    "present_links",    # List[string] - tab-separated links (identifiers) included in the Tweet
    "present_domains",  # List[string] - tab-separated domains included in the Tweet (twitter.com, dogs.com)
    "tweet_type",       # String       - one of Retweet, Quote, Reply, or Toplevel
    "language",         # String       - identifier of the inferred language of the Tweet
    "tweet_timestamp",  # Long         - Unix timestamp (seconds) of the creation time of the Tweet

    # --- Engaged-with user (i.e., engagee) features ---
    "engaged_with_user_id",                 # String - user identifier
    "engaged_with_user_follower_count",     # Long   - number of followers of the user
    "engaged_with_user_following_count",    # Long   - number of accounts the user is following
    "engaged_with_user_is_verified",        # Bool   - is the account verified?
    "engaged_with_user_account_creation",   # Long   - Unix timestamp (seconds) of the account creation time

    # --- Engaging user (i.e., engager) features ---
    "engaging_user_id",                     # String - user identifier
    "engaging_user_follower_count",         # Long   - number of followers of the user
    "engaging_user_following_count",        # Long   - number of accounts the user is following
    "engaging_user_is_verified",            # Bool   - is the account verified?
    "engaging_user_account_creation",       # Long   - Unix timestamp (seconds) of the account creation time

    # --- Engagement features ---
    "engagee_follows_engager",              # Bool   - does the engagee follow the engager?
]

# Feature name -> position in `features`.
features_idx = {name: position for position, name in enumerate(features)}

# Engagement features (cont.): label name -> column position, continuing after the features.
labels_idx = {
    "reply_timestamp": 20,                  # Long - Unix timestamp (seconds) of one of the replies, if there is at least one
    "retweet_timestamp": 21,                # Long - Unix timestamp (seconds) of the retweet by the engaging user, if there is at least one
    "retweet_with_comment_timestamp": 22,   # Long - Unix timestamp (seconds) of one of the retweets with comment by the engaging user, if there is at least one
    "like_timestamp": 23,                   # Long - Unix timestamp (seconds) of the like by the engaging user, if they liked the tweet
}

# Label column names: each label name prefixed with "TARGET_".
labels = ["TARGET_" + name for name in labels_idx]

In [4]:
# Read the Parquet dataset located at INPUT_PATH.
# NOTE(review): `spark` is not created in any visible cell; presumably it is the
# SparkSession provided by the PySpark kernel — TODO confirm.
df = spark.read.parquet(INPUT_PATH)

In [None]:
# Keep a 100-row subset of `df` for quick experiments.
df_mini = df.limit(100)

In [20]:
# Take a single Row from the subset to try the helper functions on.
row = df_mini.rdd.first()

In [88]:
def unwindField(row, field_name):
    """Split a tab-separated field of a Row into one Row per element.

    Args:
        row: object exposing ``asDict()`` (a pyspark ``Row``).
        field_name (str): name of the tab-separated field to unwind.

    Returns:
        list: one ``Row`` per element of the field, each carrying all the other
        fields unchanged. When the field is missing, null or empty, a single
        ``Row`` is returned with the field set to the string ``'None'``.
    """
    other_fields = row.asDict()
    raw_value = other_fields.pop(field_name, None)

    # A falsy value (missing, None or '') is replaced by one placeholder element.
    elements = raw_value.split('\t') if raw_value else ['None']

    return [
        Row(**dict(other_fields, **{field_name: element}))
        for element in elements
    ]

# Try the helper on the sample row, unwinding its 'hashtags' field.
unwindField(row, 'hashtags')

[Row(TARGET_like_timestamp='', TARGET_reply_timestamp='', TARGET_retweet_timestamp='', TARGET_retweet_with_comment_timestamp='', binary_TARGET_like_timestamp=0, binary_TARGET_reply_timestamp=0, binary_TARGET_retweet_timestamp=0, binary_TARGET_retweet_with_comment_timestamp=0, engaged_with_user_account_creation='1363166784', engaged_with_user_follower_count='1201084', engaged_with_user_following_count='7', engaged_with_user_id='71B315C2C7C6494793038F07CED4C53D', engaged_with_user_is_verified='false', engagee_follows_engager='false', engaging_user_account_creation='1459508007', engaging_user_follower_count='48', engaging_user_following_count='922', engaging_user_id='6D304F25882123DB07F40C7638C9C190', engaging_user_is_verified='false', hashtags='None', language='488B32D24BD4BB44172EB981C1BCA6FA', present_domains='', present_links='', present_media='Photo\tPhoto', text_tokens='101\t56898\t137\t10144\t12818\t93103\t14058\t131\t15127\t21736\t21377\t11419\t46128\t21597\t10841\t146\t65884\t100

In [None]:
# NOTE(review): `test_df` is not defined in any visible cell; unless it is created
# elsewhere this cell raises NameError on a fresh kernel — confirm or remove.
test_df.show()

In [9]:
def binarize_label(el):
    """Convert a raw engagement label into a binary 0/1 indicator.

    Args:
        el: raw label value. An empty string or ``None`` means that no
            engagement was recorded; ``None`` is what a Python UDF receives
            for a SQL NULL.

    Returns:
        int: 0 when ``el`` is ``None`` or ``''``, 1 otherwise.
    """
    # Check None explicitly: `None != ''` is True, which would otherwise label
    # a null (missing) engagement as a positive one.
    if el is None or el == '':
        return 0
    return 1

# Register the binarizer as a UDF (also available in SQL under the name "binary").
binarize = spark.udf.register("binary", binarize_label, IntegerType())

# Add all the binary label columns with a single projection instead of one
# withColumn call per label (the approach the documentation advises against).
# Columns already named like a binary label are dropped first, so re-running
# this cell replaces them instead of creating duplicates.
binary_labels = ["binary_" + label for label in labels]
kept_columns = [name for name in df.columns if name not in binary_labels]
binary_columns = [
    binarize(df[label]).alias(binary_label)
    for label, binary_label in zip(labels, binary_labels)
]
df = df.select(kept_columns + binary_columns)

In [16]:
# Estimate the mean of the binary "like" label for each (language, tweet_id)
# pair of the subset, using Spark SQL.
group = ','.join(["language", "tweet_id"])
target = "binary_TARGET_like_timestamp"
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_mini.createOrReplaceTempView("data")
query = """SELECT {}, mean({}) as conditional_probability
            FROM data 
            GROUP BY {}
        """.format(group, target, group)

result = spark.sql(query)

In [18]:
# Print the estimates computed by the query above.
result.show()

+--------------------+--------------------+-----------------------+
|            language|            tweet_id|conditional_probability|
+--------------------+--------------------+-----------------------+
|488B32D24BD4BB441...|1B6AB14A564EB2CDC...|                    0.0|
|488B32D24BD4BB441...|74ED8142BC78BCDAC...|                    1.0|
|488B32D24BD4BB441...|ABD89A248E8D0AB91...|                    0.0|
|488B32D24BD4BB441...|B24F236A2C41645BF...|                    0.0|
|313ECD3A1E5BB0740...|ABFEBDE64CC6CB20F...|                    0.0|
|488B32D24BD4BB441...|9BE186DDDD362DB87...|                    1.0|
|488B32D24BD4BB441...|069CC080DD670BE8D...|                    0.0|
|B0FA488F2911701DD...|336E75E6B67781320...|                    1.0|
|E7F038DE3EAD397AE...|69883FF3F279C260F...|                    0.0|
|488B32D24BD4BB441...|2F640F2A8B0B753AC...|                    0.0|
|488B32D24BD4BB441...|73422C6330C14847E...|                    0.0|
|488B32D24BD4BB441...|1AB5A514BEA6A12E4...|     

In [89]:
def conditional_prob(df, conditioning_rv, target):
    """
    Estimate the conditional probability of a target being True, given a set of
    conditioning random variables.

    The estimate is the mean of `target` within each group of rows sharing the
    same values of the conditioning variables.

    Tab-separated list attributes (text_tokens, hashtags, present_media,
    present_links, present_domains) are unwound before grouping: a row is
    repeated once per element of the list, and a null or empty list is replaced
    by the single placeholder value 'None'. A row therefore contributes to the
    estimate of every element it contains, and unwinding several list
    attributes yields every combination of their elements.

    The computation uses only the DataFrame API: no temporary view is
    registered and the global `spark` session is not needed.

    Args:
        df (DataFrame): pyspark DataFrame with random variables sample realizations
        conditioning_rv (List[str]): names of the conditioning random variables;
            each one must be a column of `df`
        target (str): name of the numeric (0/1) column whose mean is estimated

    Return:
        DataFrame: one row per distinct combination of conditioning values, with
        the conditioning columns plus a `conditional_probability` column
    """
    # Local import: the notebook's import cell does not provide pyspark.sql.functions.
    from pyspark.sql import functions as F

    list_fields = ("text_tokens", "hashtags", "present_media", "present_links", "present_domains")

    # Be careful: unwinding multiplies the number of rows, so it can be very
    # expensive, especially in terms of memory.
    for field in conditioning_rv:
        if field in list_fields:
            value = F.col(field)
            # Null or empty list -> single 'None' element (instead of str('')),
            # otherwise split on tabs.
            elements = F.when(
                value.isNull() | (value == ''), F.array(F.lit('None'))
            ).otherwise(F.split(value, '\t'))
            df = df.withColumn(field, F.explode(elements))

    # TODO: still need statistical significance. It can be a threshold with
    # respect to the max number of occurrences.
    # TODO: plot summary statistics.
    return df.groupBy(*conditioning_rv).agg(
        F.mean(target).alias("conditional_probability")
    )

In [None]:
# Design notes:
# - Null values (e.g. a tweet with no hashtags): simply filter them out, otherwise they distort the summary statistics.
# - List[str] attributes (e.g. hashtags): unwind them with flatMap, so that each element becomes its own row.

In [62]:
# Inspect the names of the binary label columns.
binary_labels

['binary_TARGET_reply_timestamp',
 'binary_TARGET_retweet_timestamp',
 'binary_TARGET_retweet_with_comment_timestamp',
 'binary_TARGET_like_timestamp']

In [90]:
# As far as statistical significance is concerned, it is better to use a single conditioning random variable.
# If more than one is needed, apply this function twice.
result = conditional_prob(df_mini, ["hashtags"], "binary_TARGET_like_timestamp")

In [93]:
# Print up to 100 rows of the per-hashtag estimates without truncating the values.
result.show(100, truncate = False)

+--------------------------------+-----------------------+
|hashtags                        |conditional_probability|
+--------------------------------+-----------------------+
|B3A404B935DF0B7213829E2B485D4AFD|0.0                    |
|None                            |0.13636363636363635    |
|E75B1887697C80E89F5F0F450ED8D939|0.0                    |
|EDFA1EB79D45FB3E4C8155BCC6C8E5C9|0.0                    |
|C56C09BD117EEB3C1077C7BA0FAEDF8F|0.0                    |
|D424C6056B265C6B812754865337677E|0.0                    |
|CA06BF6CCB628D70A1C7140184F2C222|0.0                    |
|8FA0FFC783F5C16B04A2EB9DC74DB693|0.0                    |
|311B6E08AAA492CFB5944B39583EE20F|0.0                    |
|CDBFC509469934C1BDCFC25483D9EC72|0.0                    |
|C849E30C938D218230221A9298A712EB|0.0                    |
|75F737D8BEF4DCF5D3EFD6C03E5C4FAD|0.0                    |
|6F805B6827F1526BD7D9A9276EFA0EA7|0.0                    |
|01FF486044DFFEFFD6EF80A33E7F12AF|0.0                   