In [None]:
import os
import matplotlib.pyplot as plt
import pandas as ks
import pickle as pkl
import numpy as np
from functools import reduce
from collections import defaultdict
from datetime import datetime
from pyspark.sql.types import IntegerType
from pyspark.sql import Row
from pyspark.sql.functions import col, mean


# Run configuration: location of the full dataset and the switch used by the loading cell
INPUT_PATH = "hdfs://BigDataHA/user/s277309/recsys_data/"
TEST = True

# Names of the input columns, in the order they appear in a record
features = [
    # --- Tweet ---
    "text_tokens",      # List[long]    Ordered list of Bert ids corresponding to Bert tokenization of Tweet text
    "hashtags",         # List[string]  Tab separated list of hashtags (identifiers) present in the tweet
    "tweet_id",         # String        Tweet identifier (unique)
    "present_media",    # List[String]  Tab separated list of media types. Media type can be in (Photo, Video, Gif)
    "present_links",    # List[string]  Tab separated list of links (identifiers) included in the Tweet
    "present_domains",  # List[string]  Tab separated list of domains included in the Tweet (twitter.com, dogs.com)
    "tweet_type",       # String        Tweet type, can be either Retweet, Quote, Reply, or Toplevel
    "language",         # String        Identifier corresponding to the inferred language of the Tweet
    "tweet_timestamp",  # Long          Unix timestamp, in seconds, of the creation time of the Tweet

    # --- Engaged-with user (i.e., engagee) ---
    "engaged_with_user_id",                 # String    User identifier
    "engaged_with_user_follower_count",     # Long      Number of followers of the user
    "engaged_with_user_following_count",    # Long      Number of accounts the user is following
    "engaged_with_user_is_verified",        # Bool      Is the account verified?
    "engaged_with_user_account_creation",   # Long      Unix timestamp, in seconds, of the creation time of the account

    # --- Engaging user (i.e., engager) ---
    "engaging_user_id",                     # String    User identifier
    "engaging_user_follower_count",         # Long      Number of followers of the user
    "engaging_user_following_count",        # Long      Number of accounts the user is following
    "engaging_user_is_verified",            # Bool      Is the account verified?
    "engaging_user_account_creation",       # Long      Unix timestamp, in seconds, of the creation time of the account

    # --- Engagement ---
    "engagee_follows_engager",  # Bool  Engagee follows engager?
]

# Column name -> position of the column inside a record
features_idx = {name: position for position, name in enumerate(features)}

# Engagement targets; they are stored right after the feature columns
labels = [
    "reply_timestamp",                  # Long  Unix timestamp (in seconds) of one of the replies, if there is at least one
    "retweet_timestamp",                # Long  Unix timestamp (in seconds) of the retweet by the engaging user, if there is at least one
    "retweet_with_comment_timestamp",   # Long  Unix timestamp (in seconds) of one of the retweets with comment by the engaging user, if there is at least one
    "like_timestamp",                   # Long  Unix timestamp (in seconds) of the like by the engaging user, if they liked the tweet
]

# Target name -> position of the column inside a record (20..23, continuing the feature numbering)
labels_idx = {name: len(features) + offset for offset, name in enumerate(labels)}

In [None]:
if not TEST:
    # Full dataset: plain-text records whose fields are separated by the \x01 control character.
    # Column names are the feature names followed by the label names prefixed with "TARGET_".
    schema = features + [f"TARGET_{label}" for label in labels_idx]
    df = sc.textFile(INPUT_PATH).map(lambda record: record.strip().split("\x01")).toDF(schema)
else:
    # Test mode: read the small parquet sample instead of the full dataset
    INPUT_PATH = "recsys_data_sample_generated/sample_0.0003.parquet"
    df = spark.read.parquet(INPUT_PATH)

In [None]:
# Print the column names and types of the loaded Spark DataFrame
df.printSchema()

In [None]:
# Collect the Spark DataFrame into pandas and key every row by (tweet, engaging user)
pandas_df = (
    df.toPandas()
      .set_index(['tweet_id', 'engaging_user_id'])
)

In [27]:
def user_activity(raw_data, features = None, windows = None):
    """
    Count, for every row, the recent interactions of its engaging user.

    Rows are processed in ascending 'tweet_timestamp' order. For each time window a
    running counter per user tracks in how many rows still inside the window the user
    appeared, either as engaging or as engaged-with user. The value emitted for a row is
    the engaging user's counter *before* the row itself is counted.

    Args:
        raw_data (ks.DataFrame): dataset to process for feature extraction. Must be indexed
            by (tweet_id, engaging_user_id) and contain the columns 'engaged_with_user_id'
            and 'tweet_timestamp' (numeric, same unit as the windows). It is not modified.
        features: unused; kept so that existing callers keep working.
        windows (iterable of numbers, optional): window lengths in seconds; they must be
            positive and distinct. Defaults to 5 and 60 minutes.
    Returns:
        new_features (Dict[ks.Series]): one ks.Series per time window, keyed by the window
        length. Each Series is named 'interactions_<window>', is indexed by
        (tweet_id, engaging_user_id) in timestamp order, and holds the counter of user
        activities (appearances as engaging or engagee) inside that window.
    """
    # Time windows in seconds
    if windows is None:
        WINDOWS = np.array([5, 60])*60
    else:
        WINDOWS = np.asarray(list(windows))

    # Sort a copy so the caller's frame is not reordered as a side effect. The sort is
    # stable: rows sharing a timestamp keep their input order, which makes the counts
    # deterministic.
    sorted_data = raw_data.sort_values(by='tweet_timestamp', kind='mergesort')

    # Work on plain numpy arrays: per-row DataFrame indexing is far too slow for this loop
    index_col = sorted_data.index.to_numpy()
    engaged_users = sorted_data['engaged_with_user_id'].to_numpy()
    timestamps = sorted_data['tweet_timestamp'].to_numpy()

    # Per window: position of the oldest row that is still inside the window
    oldest = {time_win: 0 for time_win in WINDOWS}
    # Counter of appearances for each user, for each time window. Dict[Dict]
    window_counter = defaultdict(lambda: {time_win: 0 for time_win in WINDOWS})
    # Per window: the feature value of every row, in processing order
    counts = {time_win: [] for time_win in WINDOWS}
    tweet_ids, engaging_users = [], []

    def drop_if_idle(user):
        # Forget a user once all of its counters are back to 0
        if sum(window_counter[user].values()) == 0:
            del window_counter[user]

    for idx, engaged, now in zip(index_col, engaged_users, timestamps):
        engaging = idx[1]
        tweet_ids.append(idx[0])
        engaging_users.append(engaging)

        for time_win in WINDOWS:
            # Expire the rows that fell out of the window. With a positive window the
            # current row never satisfies the condition, so the pointer cannot overrun it.
            while timestamps[oldest[time_win]] < (now - time_win):
                expired = oldest[time_win]
                user_a = index_col[expired][1]
                user_b = engaged_users[expired]

                for user in (user_a, user_b):
                    if window_counter[user][time_win] > 0:
                        window_counter[user][time_win] -= 1

                drop_if_idle(user_a)
                drop_if_idle(user_b)

                oldest[time_win] += 1

            # Emit the feature for the current row, then count the row for both users
            counts[time_win].append(window_counter[engaging][time_win])
            window_counter[engaging][time_win] += 1
            window_counter[engaged][time_win] += 1

    # Build the Series directly: the result is always a Series, including for inputs
    # with a single row or no rows at all.
    result_index = ks.MultiIndex.from_arrays(
        [tweet_ids, engaging_users], names=['tweet_id', 'engaging_user_id']
    )
    new_features = {
        time_win: ks.Series(
            counts[time_win],
            index=result_index,
            name=f'interactions_{time_win}',
            dtype='int64',
        )
        for time_win in WINDOWS
    }

    #TODO: store window_counter of active users for inference
    return new_features

In [31]:
import pandas as pd

# Hand-made fixture: 13 interactions among users '1', '2' and '3', listed in timestamp order
test_dict = dict(
    engaging_user_id=['1', '1', '2', '2', '1', '1', '2', '1', '1', '2', '3', '3', '3'],
    engaged_with_user_id=['2', '2', '1', '3', '2', '3', '1', '3', '3', '1', '1', '1', '1'],
    tweet_id=list('abcdefghijklm'),
    tweet_timestamp=[48120, 48120, 50300, 50400, 50600, 50760, 50860, 50900, 52300, 53300, 53720, 54000, 54100],
)

# Same layout as the real data: rows keyed by (tweet, engaging user)
test_dataframe = pd.DataFrame(data=test_dict)
test_dataframe = test_dataframe.set_index(keys=['tweet_id', 'engaging_user_id'])

In [32]:
# user_activity returns a single dict (window length in seconds -> Series). Unpacking it into
# two names would silently bind the dict's two keys instead of the features.
new_features = user_activity(test_dataframe)

In [33]:
# Display new_features; the recorded output below shows one Series per window (keys 300 and 3600)
new_features

{300: tweet_id  engaging_user_id
 a         1                   0
 b         1                   1
 c         2                   0
 d         2                   1
 e         1                   1
 f         1                   1
 g         2                   1
 h         1                   3
 i         1                   0
 j         2                   0
 k         3                   0
 l         3                   1
 m         3                   1
 Name: interactions_300, dtype: int64,
 3600: tweet_id  engaging_user_id
 a         1                   0
 b         1                   1
 c         2                   2
 d         2                   3
 e         1                   3
 f         1                   4
 g         2                   5
 h         1                   6
 i         1                   5
 j         2                   4
 k         3                   4
 l         3                   5
 m         3                   5
 Name: interactions_3600, dtype: int

In [20]:
# NOTE(review): `new_feature` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel (see the recorded error below). Both lists have 13
# entries, the same length as the `test_dataframe` fixture, so presumably that frame (or
# one derived from it) was the intended receiver. Confirm before fixing the name.
new_feature.assign(tweet_timestamp = [48120, 48120, 50300, 50400, 50600, 50760, 50860, 50900, 51300, 51300, 51720, 51720, 52100],
                  hashtags = ['1\t2\t5', '4', '2\t5', '1', '', '1\t2\t5', '1\t3\t5',
                       '1\t2\t5', '3\t2', '', '', '3\t4', '4\t2'])

NameError: name 'new_feature' is not defined

In [None]:
# NOTE(review): `new_feature` is never assigned in the visible notebook; this cell fails with
# NameError on a fresh kernel. Possibly a leftover of an earlier variable name; confirm.
new_feature