In [1]:
import os
import matplotlib.pyplot as plt
import pandas as ks
import pickle as pkl
from collections import defaultdict
from datetime import datetime
from pyspark.sql.types import IntegerType
from pyspark.sql import Row
from pyspark.sql.functions import col, mean


INPUT_PATH = "hdfs://BigDataHA/user/s277309/recsys_data/"
TEST = True

features = [
    # Tweet features
    "text_tokens",      # List[long]    Ordered list of Bert ids corresponding to Bert tokenization of Tweet text
    "hashtags",         # List[string]  Tab separated list of hastags (identifiers) present in the tweet
    "tweet_id",         # String        Tweet identifier (unique)
    "present_media",    # List[String]  Tab separated list of media types. Media type can be in (Photo, Video, Gif)
    "present_links",    # List[string]  Tab separated list of links (identifiers) included in the Tweet
    "present_domains",  # List[string]  Tab separated list of domains included in the Tweet (twitter.com, dogs.com)
    "tweet_type",       # String        Tweet type, can be either Retweet, Quote, Reply, or Toplevel
    "language",         # String        Identifier corresponding to the inferred language of the Tweet
    "tweet_timestamp",  # Long          Unix timestamp, in sec of the creation time of the Tweet
    
    # Engaged-with User (i.e., Engagee) Features
    "engaged_with_user_id",                 # String    User identifier
    "engaged_with_user_follower_count",     # Long      Number of followers of the user
    "engaged_with_user_following_count",    # Long      Number of accounts the user is following
    "engaged_with_user_is_verified",        # Bool      Is the account verified?
    "engaged_with_user_account_creation",   # Long      Unix timestamp, in seconds, of the creation time of the account
    
    # Engaging User (i.e., Engager) Features
    "engaging_user_id",                     # String    User identifier   
    "engaging_user_follower_count",         # Long      Number of followers of the user
    "engaging_user_following_count",        # Long      Number of accounts the user is following
    "engaging_user_is_verified",            # Bool      Is the account verified?
    "engaging_user_account_creation",       # Long      Unix timestamp, in seconds, of the creation time of the account
    
    # Engagement features
    "engagee_follows_engager"   # Bool  Engagee follows engager?
]

features_idx = dict(zip(features, range(len(features))))

labels_idx = {
    # Engagement features (cont.)
    "reply_timestamp": 20,                  # Long      Unix timestamp (in seconds) of one of the replies, if there is at least one
    "retweet_timestamp": 21,                # Long      Unix timestamp (in seconds) of the retweet by the engaging user, if there is at least one
    "retweet_with_comment_timestamp": 22,   # Long      Unix timestamp (in seconds) of one of the retweet with comment by the engaging user, if there is at least one
    "like_timestamp": 23                    # Long      Unix timestamp (in seconds) of the like by the engaging user, if they liked the tweet
}

labels = [l for l in labels_idx.keys()]


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/bigdatalab_cpu_202101/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
if TEST:
    INPUT_PATH = "recsys_data_sample_generated/sample_0.0003"
    df = spark.read.parquet(INPUT_PATH)
else:
    schema = features + list(map(lambda l: f"TARGET_{l}", labels_idx))  # Column names
    df = sc.textFile(INPUT_PATH).map(lambda line: line.strip().split("\x01")).toDF(schema)

In [4]:
df.printSchema()

root
 |-- text_tokens: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- present_media: string (nullable = true)
 |-- present_links: string (nullable = true)
 |-- present_domains: string (nullable = true)
 |-- tweet_type: string (nullable = true)
 |-- language: string (nullable = true)
 |-- tweet_timestamp: string (nullable = true)
 |-- engaged_with_user_id: string (nullable = true)
 |-- engaged_with_user_follower_count: string (nullable = true)
 |-- engaged_with_user_following_count: string (nullable = true)
 |-- engaged_with_user_is_verified: string (nullable = true)
 |-- engaged_with_user_account_creation: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- engaging_user_follower_count: string (nullable = true)
 |-- engaging_user_following_count: string (nullable = true)
 |-- engaging_user_is_verified: string (nullable = true)
 |-- engaging_user_account_creation: string (nullable = true)
 |-- engag

In [5]:
pandas_df  = df.toPandas().set_index(['tweet_id', 'engaging_user_id'])

In [30]:
import time
import os
import databricks.koalas as ks
import pickle as pkl
from collections import defaultdict
from constants import ROOT_DIR

PATH_AUXILIARIES = os.path.join(ROOT_DIR, "../data/auxiliary")

def hashtag_popularity(raw_data, features = None, auxiliary_dict = None):
    """
    Args:
        raw_data (ks.DataFrame): dataset to process for feature extraction
    Returns: 
        new_feature (ks.DataFrame): DataFrame where for each input sample, we have the corresponding hashtag counter.
    """
    
    WINDOW_SIZE = 1800 # 2 hours time window
    output_path = os.path.join(PATH_AUXILIARIES, 'hashtag_window_counter.pkl')
    
    # Use loc to pass a view instead of a copy
    hash_time_df = raw_data.loc[:, ('hashtags', 'tweet_timestamp')]
    hash_time_df.sort_values(by='tweet_timestamp', inplace = True)
    
    # Add initialization for existing dictionary at inference time
    # Same for training if we drop first chunk of data
    window_counter = defaultdict(lambda : 0) # initialize to 0 if key is not present
    
    # number of active hashtags, needed for normalization
    # do we want to normalize? I would not, in order to actually detect hot hashtag,
    # and this is contained in absolute count, not relative
    active_counter = 0 
    
    # pointer to reduce counter when timestamp < now - WINDOW_SIZE
    j = 0
    
    # new column container
    new_col = []
    for index, row in hash_time_df.iterrows():
        hashtags = row['hashtags'].split('\t') # list of hashtags whose counter must be incremented
        now = int(row['tweet_timestamp']) # last tweet timestamp
        
        # Remove hashtags out of the 2 hours time window form now
        while int(hash_time_df.iloc[j, -1]) < (now - WINDOW_SIZE):
            row_to_delete = hash_time_df.iloc[j]
            hashtags_to_decrement = row_to_delete['hashtags']
            for h in hashtags_to_decrement:
                window_counter[h] -= 1
                if window_counter[h] <= 0:
                    del window_counter[h]
                active_counter -= 1
                
            j += 1
        
        # I have more than one hashtag for each record, but i need only one counter
        # Need to make a summary of the hashtags ==> max of window_counter?                
        most_popular = 0
        for h in hashtags:
            if h:  # h not empty string ''
                if window_counter[h] > most_popular:
                    most_popular = window_counter[h]
                
                # Increment corresponding hashtag counter
                window_counter[h] += 1
                active_counter += 1

        # Index is tuple for multiIndex. Ulima cosa da fare questo!
        new_col.append({
            'tweet_id': index[0],
            'engaging_user_id': index[1],
            'counter': most_popular
        })
            
    new_feature = ks.DataFrame(new_col).set_index(['tweet_id', 'engaging_user_id'])
    
    # store current window_counter, since this will be the initial counter at inference time
    with open(output_path, 'wb') as f:
        pkl.dump(dict(window_counter), f, protocol=pkl.HIGHEST_PROTOCOL)

    return new_feature

In [13]:
new_feature = hashtag_popularity(pandas_df)

In [16]:
new_feature.describe()

Unnamed: 0,counter
count,199954.0
mean,3.585135
std,33.499096
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,830.0


In [31]:
import pandas as pd
test_dict={'engaging_user_id': ['1','1','1','1','1','1','2','2','2','2','3','3','3'],
           'tweet_id': ['1','2','3','4','5','6','7','8','9','10','11','12','13'],
           'hashtags': ['1\t2\t5', '4', '2\t5', '1', '', '1\t2\t5', '1\t3\t5',
                       '1\t2\t5', '3\t2', '', '', '3\t4', '4\t2'],
           'tweet_timestamp' : [48120, 48120, 50300, 50400, 50400, 50460, 50460, 50600, 51300, 51300, 51720, 51720, 52100]
          }
test_dataframe = pd.DataFrame(test_dict).set_index(['tweet_id', 'engaging_user_id'])

In [32]:
new_feature, window_counter = hashtag_popularity(test_dataframe)

In [33]:
new_feature.to_pandas().assign(tweet_timestamp = [48120, 48120, 50300, 50400, 50400, 50460, 50460, 50600, 51300, 51300, 51720, 51720, 52100],
                  hashtags = ['1\t2\t5', '4', '2\t5', '1', '', '1\t2\t5', '1\t3\t5',
                       '1\t2\t5', '3\t2', '', '', '3\t4', '4\t2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,counter,tweet_timestamp,hashtags
tweet_id,engaging_user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,48120,1\t2\t5
2,1,0,48120,4
3,1,0,50300,2\t5
4,1,0,50400,1
5,1,0,50400,
6,1,1,50460,1\t2\t5
7,2,2,50460,1\t3\t5
8,2,3,50600,1\t2\t5
9,2,3,51300,3\t2
10,2,0,51300,


In [98]:
new_feature.assign(tweet_timestamp = [48120, 48120, 50300, 50400, 50400, 50460, 50460, 50600, 51300, 51300, 51720, 51720, 52100],
                  hashtags = ['1\t2\t5', '4', '2\t5', '1', '', '1\t2\t5', '1\t3\t5',
                       '1\t2\t5', '3\t2', '', '', '3\t4', '4\t2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,counter,tweet_timestamp,hashtags
tweet_id,engaging_user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,48120,1\t2\t5
2,1,0,48120,4
3,1,0,50300,2\t5
4,1,0,50400,1
5,1,0,50400,
6,1,1,50460,1\t2\t5
7,2,2,50460,1\t3\t5
8,2,3,50600,1\t2\t5
9,2,3,51300,3\t2
10,2,0,51300,


In [82]:
new_feature

Unnamed: 0_level_0,Unnamed: 1_level_0,counter,tweet_timestamp
tweet_id,engaging_user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,
2,1,0,
3,1,0,
4,1,0,
5,1,0,
6,1,1,
7,2,2,
8,2,3,
9,2,3,
10,2,0,
