# Data Preprocessing
1. Topic Extraction
2. User Buckets
3. Data Split

***
## Dataset description
||Feature Name|Feature Type|Feature Description|
|:---|:---|:---|:---|
|Tweet Features|Text tokens<br>Hashtags<br>Tweet id<br>Present media<br>Present links<br>Present domains<br>Tweet type<br>Language<br>Timestamp|List[long]<br>List[string]<br>String<br>List[String]<br>List[string]<br>List[string]<br>String<br>String<br>Long|Ordered list of Bert ids corresponding to Bert tokenization of Tweet text<br>Tab separated list of hastags (identifiers) present in the tweet<br>Tweet identifier<br>Tab separated list of media types. Media type can be in (Photo, Video, Gif)<br>Tab separeted list of links (identifiers) included in the Tweet<br>Tab separated list of domains included in the Tweet (twitter.com, dogs.com)<br>Tweet type, can be either Retweet, Quote, Reply, or Toplevel<br>Identifier corresponding to the inferred language of the Tweet<br>Unix timestamp, in sec of the creation time of the Tweet|
|Engaged With User Features|User id<br>Follower count<br>Following count<br>Is verified?<br>Account creation time|String<br>Long<br>Long<br>Bool<br>Long|User identifier<br>Number of followers of the user<br>Number of accounts<br>the user is following<br>Is the account verified?<br>Unix timestamp, in seconds, of the creation time of the account
|Engaging User Features|User id<br>Follower count<br>Following count<br>Is verified?<br>Account creation time|String<br>Long<br>Long<br>Bool<br>Long|User identifier<br>Number of followers of the user<br>Number of accounts<br>the user is following<br>Is the account verified?<br>Unix timestamp, in seconds, of the creation time of the account
|Engagement Features|Engagee follows engager?<br>Reply engagement timestamp<br>Retweet engagement timestamp<br>Retweet<br>with comment engagement timestamp<br>Like engagement timestamp|Bool<br>Long<br>Long<br>Long<br>Long|Does the account of the engaged tweet author follow the account that has made the engagement?<br>If there is at least one, unix timestamp, in s, of one of the replies<br>If there is one, unix timestamp, in s, of the retweet of the tweet by the engaging user<br>If there is at least one, unix timestamp, in s, of one of the retweet with comment of the tweet by the engaging user<br>If there is one, Unix timestamp, in s, of the like

참고) https://recsys-twitter.com/


***
## 데이터 로드

In [2]:
import pandas as pd
COLS = ["text_tokens", "hashtags", "tweet_id", "present_media", 
                      "present_links", "present_domains", "tweet_type", "language", "tweet_timestamp", "engaged_with_user_id",
                     "engaged_with_user_follower_count", "engaged_with_user_following_count", 
                     "engaged_with_user_is_verified", "engaged_with_user_account_creation", "engaging_user_id",
                     "engaging_user_follower_count", "engaging_user_following_count", "engaging_user_is_verified",
                     "engaging_user_account_creation", "engagee_follows_engager", "reply_timestamp",
                     "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"]
df = pd.read_csv('../dataset/twitter/train100K.csv', names=COLS, skipinitialspace=True, skiprows=1)

In [3]:
df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t10117\t140\t119\t142\t119\t152\t119\t1010...,,373C0F43762B7CEC1D75728BE8A33891,,A2CE3A1941BA410A1C31496C355EFCD7,E14AF8A8D257BB47587843FE7D08382B,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1582126349,2A8B6AD2B9D55F535C2441AB673133D2,...,00000865A1538142CDA5936B07FE4311,65,166,False,1452599043,False,,,,
1,101\t10105\t10817\t10124\t59232\t18121\t15629\...,,773A92D9E4824D06105C02BD044BB20A,,,,Quote,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581971193,950A95B81407F33C412E520BE55A1450,...,000009A057792FF118B9E3F2578B8407,1814,1314,False,1322868747,True,1581979000.0,,,
2,101\t48561\t10116\t67737\t18554\t36371\t10989\...,,218A6C27871801759F7380D7C41694A6,,5C683B5A29B308CADD0D7EFA7C9C32D3,6717B03E03DEE1D7ACAE37649ACA7BD6,TopLevel,9BF3403E0EB7EA8A256DA9019C0B0716,1582047119,ABB2F7F22C34057BC7B30D627B0C137A,...,00000DEF82BE9EB5CFD07FB7DB94317B,4,73,False,1573996260,False,,,,
3,101\t100055\t69940\t10414\t159\t11305\t11166\t...,,AB817EBA68064A0C8CBF4A6C059D92DC,Photo,E925556EE312213AD98C4D9F131D7A8D,D722330FEBEAAE68B4F4339CE8BD7C70,TopLevel,691890251F2B9FF922BE6D3699ABEFD2,1581554925,03F96C3B7CE2179B6347AA395880C963,...,0000109A57AFA64758EE4AAE2A01BFC7,15,124,False,1385502405,True,,,,
4,101\t62154\t32221\t71843\t10143\t10237\t15507\...,,349120C1E2801857530393F16D4653A5,,,,TopLevel,9BF3403E0EB7EA8A256DA9019C0B0716,1581568955,E035DCB47CB3DF98C5CD7CFEEC3BC704,...,000012366528B5FEE179A9606DBC9826,1226,655,False,1268639592,True,1581570000.0,,,


***

## Preprocessing

In [11]:
def mediaCounter(row, media='Photo'):
    counter=0
    if type(row)==list:
        for elem in row:
            if elem==media:
                counter+=1
    else:
        pass
    return counter

def listCounter(row):
    counter=0
    if type(row)==list:
        counter+=len(row)
    else:
        pass
    return counter

def labelEncoder(row, mapping_encode):
    """
    Label Encoding or Array<String> types
    
    
    Parameters:
    -----------
    row : list(string)
        List of string or labels
    mapping_encode : dict(label, integer)
        Encoding of some top K labels
    Return:
    -------
    out : list(integers)
        List of Label Encoders.
        if not in mapping Encoded to len(map)
        if not a list Encoded to len(map)+1
    """
    out=[]
    if type(row)==list:
        for elem in row:
            if elem in mapping_encode:
                out.append(mapping_encode.get(elem))
            else:
                out.append(len(mapping_encode))
    else:
        out.append(len(mapping_encode)+1)
    return out

def labelEncoderSingle(row, mapping_encode):
    out=[]
    if row:
        if row in mapping_encode:
            out.append(mapping_encode.get(row))
        else:
            out.append(len(mapping_encode))
    else:
        out.append(len(mapping_encode)+1)
    return out

def hashtagSumCounter(row, mapping_hashtag_count):
    counter=0
    if type(row)==list:
        for elem in row:
            if elem in mapping_hashtag_count:
                counter+=mapping_hashtag_count.get(elem, 0)
    else:
        pass
    return counter

def get_distribution_array_col(df, col):
    distribution_df = df.select(col).filter(F.col(col).isNotNull())\
                              .withColumn(col, 
                                          F.explode(F.col(col)))\
                              .groupBy(col).count()\
                              .orderBy(F.col("count").desc())
    return distribution_df

def save_pkl_to_s3(obj, key_filename, bucket_name):
    serialized_obj = pickle.dumps(obj)
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket_name, Key=key_filename, 
                  Body=serialized_obj)
    
def columns2cast(df):
    columns = []
    for col in df.schema:
        if col.dataType.typeName()=="array":
            columns.append(col)
    return columns
    
def cast_array2string(df, columns):
    for col in columns:
        df = df.withColumn(col.name, F.col(col.name).cast(StringType()))
    return df

def cast_string2array(df, columns):
    for col in columns:
        df= df.withColumn(col, 
                          F.split(F.regexp_replace(F.col(col), r"(^\[)|(\]$)|(')", ""),
                                  ", "))
    return df
    
def mappings(df, col, top_k):
    col_dist = get_distribution_array_col(df, col)
    df_col_dist = col_dist.limit(top_k)
    df_col = df_col_dist.toPandas().rename(columns={'_1': col, 
                                                    '_2': 'count'})\
                                    .reset_index().set_index(col)
    mapping_encode = df_col['index'].to_dict()
    mapping_count = df_col['count'].to_dict()
    return mapping_encode, mapping_count

def mapping_label_encoder(df, col, top_k):
    col_dist = df.select(col).filter(F.col(col).isNotNull())\
                      .groupBy(col).count()\
                      .orderBy(F.col("count").desc())
    df_col_dist = col_dist.limit(top_k)
    df_col = df_col_dist.toPandas().rename(columns={'_1': col, 
                                                    '_2': 'count'})\
                                    .reset_index().set_index(col)
    mapping_encoder = df_col['index'].to_dict()
    return mapping_encoder

def validator(df):
    columns_w_nan = {}
    for col in df.schema:
        null_count = df.filter(F.col(col.name).isNull()).count()
        if null_count>0:
            columns_w_nan[col.name]=null_count
    return columns_w_nan

# Mappings
tweet_type_mapping = {'TopLevel':0, 'Quote':1, 'Retweet':2, 'Reply':3}

# # UDF SQL
# PhotoCounter_udf = F.udf(lambda row: mediaCounter(row, 'Photo'), 
#                          IntegerType())
# VideoCounter_udf = F.udf(lambda row: mediaCounter(row, 'Video'), 
#                          IntegerType())
# GifCounter_udf = F.udf(lambda row: mediaCounter(row, 'GIF'), 
#                          IntegerType())
# listCounter_udf = F.udf(listCounter, 
#                          IntegerType())
# tweet_encoded_udf = F.udf(lambda x: tweet_type_mapping[x], 
#                              IntegerType())

In [14]:
df.schema

AttributeError: 'DataFrame' object has no attribute 'schema'

In [13]:
df.schema
def validator(df):
    columns_w_nan = {}
    for col in df.schema:
        null_count = df.filter(F.col(col.name).isNull()).count()
        if null_count>0:
            columns_w_nan[col.name]=null_count
    return columns_w_nan
columns_w_nan = validator(df)


AttributeError: 'DataFrame' object has no attribute 'schema'

In [15]:
import time
import os
import boto3
import gc
import sys
import numpy as np
import pandas as pd
import pickle
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (FloatType, DateType, StructType, StructField, StringType, LongType, 
    IntegerType, ArrayType, BooleanType, DoubleType)
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler, QuantileDiscretizer
gc.enable()

spark = SparkSession.builder.config("spark.sql.shuffle.partitions", 1000).appName("twitter").getOrCreate()
print(spark.sparkContext.getConf().get('spark.driver.memory'))
print(spark.sparkContext.getConf().get("spark.sql.shuffle.partitions"))

None
1000


In [16]:
def parse_data(path='training.tsv', has_labels=True, schema='auto'):
    """
    Parses the training data for the Twitter RecSys Challenge.
    """
    spark = SparkSession.builder.appName("twitter").getOrCreate()
    if schema == 'auto':
        schema = build_schema(has_labels)
    df = spark.read.csv(path, schema=schema, sep='\x01', encoding='utf-8',
                        ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True)
    df = df.withColumn('text_tokens', F.split('text_tokens', '\t'))
    df = df.withColumn('hashtags', F.split('hashtags', '\t'))
    df = df.withColumn('present_media', F.split('present_media', '\t'))
    df = df.withColumn('present_links', F.split('present_links', '\t'))
    df = df.withColumn('present_domains', F.split('present_domains', '\t'))
    return df

def build_schema(has_labels=True):
    if has_labels:
        schema = StructType([StructField('text_tokens', StringType()),
                             StructField('hashtags', StringType()),
                             StructField('tweet_id', StringType()),
                             StructField('present_media', StringType()),
                             StructField('present_links', StringType()),
                             StructField('present_domains', StringType()),
                             StructField('tweet_type', StringType()),
                             StructField('language', StringType()),
                             StructField('tweet_timestamp', LongType()),
                             StructField('engaged_with_user_id', StringType()),
                             StructField('engaged_with_user_follower_count', IntegerType()),
                             StructField('engaged_with_user_following_count', IntegerType()),
                             StructField('engaged_with_user_is_verified', BooleanType()),
                             StructField('engaged_with_user_account_creation', LongType()),
                             StructField('engaging_user_id', StringType()),
                             StructField('engaging_user_follower_count', IntegerType()),
                             StructField('engaging_user_following_count', IntegerType()),
                             StructField('engaging_user_is_verified', BooleanType()),
                             StructField('engaging_user_account_creation', LongType()),
                             StructField('engagee_follows_engager', BooleanType()),
                             StructField('reply_timestamp', LongType()),
                             StructField('retweet_timestamp', LongType()),
                             StructField('retweet_with_comment_timestamp', LongType()),
                             StructField('like_timestamp', LongType())
                            ])
    else:
         schema = StructType([StructField('text_tokens', StringType()),
                             StructField('hashtags', StringType()),
                             StructField('tweet_id', StringType()),
                             StructField('present_media', StringType()),
                             StructField('present_links', StringType()),
                             StructField('present_domains', StringType()),
                             StructField('tweet_type', StringType()),
                             StructField('language', StringType()),
                             StructField('tweet_timestamp', LongType()),
                             StructField('engaged_with_user_id', StringType()),
                             StructField('engaged_with_user_follower_count', IntegerType()),
                             StructField('engaged_with_user_following_count', IntegerType()),
                             StructField('engaged_with_user_is_verified', BooleanType()),
                             StructField('engaged_with_user_account_creation', LongType()),
                             StructField('engaging_user_id', StringType()),
                             StructField('engaging_user_follower_count', IntegerType()),
                             StructField('engaging_user_following_count', IntegerType()),
                             StructField('engaging_user_is_verified', BooleanType()),
                             StructField('engaging_user_account_creation', LongType()),
                             StructField('engagee_follows_engager', BooleanType())
                            ])
    return schema

In [19]:
def mediaCounter(row, media='Photo'):
    counter=0
    if type(row)==list:
        for elem in row:
            if elem==media:
                counter+=1
    else:
        pass
    return counter

def listCounter(row):
    counter=0
    if type(row)==list:
        counter+=len(row)
    else:
        pass
    return counter

def labelEncoder(row, mapping_encode):
    """
    Label Encoding or Array<String> types
    
    
    Parameters:
    -----------
    row : list(string)
        List of string or labels
    mapping_encode : dict(label, integer)
        Encoding of some top K labels
    Return:
    -------
    out : list(integers)
        List of Label Encoders.
        if not in mapping Encoded to len(map)
        if not a list Encoded to len(map)+1
    """
    out=[]
    if type(row)==list:
        for elem in row:
            if elem in mapping_encode:
                out.append(mapping_encode.get(elem))
            else:
                out.append(len(mapping_encode))
    else:
        out.append(len(mapping_encode)+1)
    return out

def labelEncoderSingle(row, mapping_encode):
    out=[]
    if row:
        if row in mapping_encode:
            out.append(mapping_encode.get(row))
        else:
            out.append(len(mapping_encode))
    else:
        out.append(len(mapping_encode)+1)
    return out

def hashtagSumCounter(row, mapping_hashtag_count):
    counter=0
    if type(row)==list:
        for elem in row:
            if elem in mapping_hashtag_count:
                counter+=mapping_hashtag_count.get(elem, 0)
    else:
        pass
    return counter

def get_distribution_array_col(df, col):
    distribution_df = df.select(col).filter(F.col(col).isNotNull())\
                              .withColumn(col, 
                                          F.explode(F.col(col)))\
                              .groupBy(col).count()\
                              .orderBy(F.col("count").desc())
    return distribution_df

def save_pkl_to_s3(obj, key_filename, bucket_name):
    serialized_obj = pickle.dumps(obj)
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket_name, Key=key_filename, 
                  Body=serialized_obj)
    
def columns2cast(df):
    columns = []
    for col in df.schema:
        if col.dataType.typeName()=="array":
            columns.append(col)
    return columns
    
def cast_array2string(df, columns):
    for col in columns:
        df = df.withColumn(col.name, F.col(col.name).cast(StringType()))
    return df

def cast_string2array(df, columns):
    for col in columns:
        df= df.withColumn(col, 
                          F.split(F.regexp_replace(F.col(col), r"(^\[)|(\]$)|(')", ""),
                                  ", "))
    return df
    
def mappings(df, col, top_k):
    col_dist = get_distribution_array_col(df, col)
    df_col_dist = col_dist.limit(top_k)
    df_col = df_col_dist.toPandas().rename(columns={'_1': col, 
                                                    '_2': 'count'})\
                                    .reset_index().set_index(col)
    mapping_encode = df_col['index'].to_dict()
    mapping_count = df_col['count'].to_dict()
    return mapping_encode, mapping_count

def mapping_label_encoder(df, col, top_k):
    col_dist = df.select(col).filter(F.col(col).isNotNull())\
                      .groupBy(col).count()\
                      .orderBy(F.col("count").desc())
    df_col_dist = col_dist.limit(top_k)
    df_col = df_col_dist.toPandas().rename(columns={'_1': col, 
                                                    '_2': 'count'})\
                                    .reset_index().set_index(col)
    mapping_encoder = df_col['index'].to_dict()
    return mapping_encoder

def validator(df):
    columns_w_nan = {}
    for col in df.schema:
        null_count = df.filter(F.col(col.name).isNull()).count()
        if null_count>0:
            columns_w_nan[col.name]=null_count
    return columns_w_nan

# Mappings
tweet_type_mapping = {'TopLevel':0, 'Quote':1, 'Retweet':2, 'Reply':3}

# UDF SQL
PhotoCounter_udf = F.udf(lambda row: mediaCounter(row, 'Photo'), 
                         IntegerType())
VideoCounter_udf = F.udf(lambda row: mediaCounter(row, 'Video'), 
                         IntegerType())
GifCounter_udf = F.udf(lambda row: mediaCounter(row, 'GIF'), 
                         IntegerType())
listCounter_udf = F.udf(listCounter, 
                         IntegerType())
tweet_encoded_udf = F.udf(lambda x: tweet_type_mapping[x], 
                             IntegerType())

In [20]:
dictionary_size={"final-complete": {"val_size": 500000, 
                                    "train_size": "all"}}

training = False
submission = False
test = True

bucket='bucket-name'
s3_resource = boto3.resource('s3')
top_k_languages = 30
top_k_domains = 3000
top_k_hashtags = 13000

# Embeddings
num_partitions=1000

# Buckets
partition_per_cluster = 100

suffix_sample = "final-complete" #"full", "small", "medium", "sub_medium"
data_path = "final-data"
object_paths = "final-artifacts"

val_size = dictionary_size[suffix_sample]["val_size"]
train_size = dictionary_size[suffix_sample]["train_size"]

bucket_s3 = s3_resource.Bucket(bucket)

In [52]:
#S3
twitter_bucket_s3 = "../dataset/twitter/"
trainining_path = os.path.join(twitter_bucket_s3, "train100K.csv")
submission_path = os.path.join(twitter_bucket_s3, "train100K.csv")
test_path = os.path.join(twitter_bucket_s3, "train100K.csv")

# Splitted paths
train_path = os.path.join(twitter_bucket_s3, data_path, "train-"+suffix_sample)
val_path = os.path.join(twitter_bucket_s3, data_path, "val-"+suffix_sample)

# Processed
processed_train_path = os.path.join(twitter_bucket_s3, data_path, "processed", "train-"+suffix_sample)
processed_val_path = os.path.join(twitter_bucket_s3, data_path, "processed", "val-"+suffix_sample)
processed_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed", "submission-"+suffix_sample)
processed_test_path = os.path.join(twitter_bucket_s3, data_path, "processed", "test-"+suffix_sample)
processed_emb_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                        "train-"+suffix_sample)
processed_emb_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                      "val-"+suffix_sample)
processed_emb_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "submission-"+suffix_sample)
processed_emb_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "test-"+suffix_sample)
processed_top_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                        "train-"+suffix_sample)
processed_top_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                      "val-"+suffix_sample)
processed_top_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                             "submission-"+suffix_sample)
processed_top_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                             "test-"+suffix_sample)
# Resources
engaging_users_training_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-training")
engaging_users_training_path = os.path.join(twitter_bucket_s3, "engaging-users-training")

engaging_users_submission_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-submission")
engaging_users_submission_path = os.path.join(twitter_bucket_s3, "engaging-users-submission")

engaging_users_test_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-test")
intentions_path = os.path.join(twitter_bucket_s3, data_path, "intentions-"+suffix_sample)
map_user_bucket_path = os.path.join(twitter_bucket_s3, data_path, "map_user_bucket")

topic_encodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "user_topics")
users_intime_path = os.path.join(twitter_bucket_s3, data_path, "users_intime-"+suffix_sample)

# keys objects
key_hashtag_mapping = os.path.join(object_paths, f'hashtag_mapping_{suffix_sample}.pkl')
key_domain_mapping = os.path.join(object_paths, f'domain_mapping_{suffix_sample}.pkl')
key_language_mapping = os.path.join(object_paths, f'language_mapping_{suffix_sample}.pkl')
key_hashtag_count = os.path.join(object_paths, f'hashtag_count_{suffix_sample}.pkl')
key_domain_count = os.path.join(object_paths, f'domain_count_{suffix_sample}.pkl')
key_scaling_features = os.path.join(object_paths, f'scaling_dictionary_{suffix_sample}.pkl')
key_diff_min = os.path.join(object_paths, f'diff_min_{suffix_sample}.pkl')
key_impute_perc = os.path.join(object_paths, f'dict_mean_perc_{suffix_sample}.pkl')
key_topiccount = os.path.join(object_paths, f'topiccount_{suffix_sample}.pkl')

# s3+keys
columns = ["engaged_with_user_follower_count", "engaged_with_user_following_count",
           "engaged_with_user_account_creation", "engaging_user_follower_count",
           "engaging_user_following_count", "engaging_user_account_creation"]
qds_paths = {}
for col in columns:
    qds_paths[col] = os.path.join(twitter_bucket_s3, object_paths, f"qs_{suffix_sample}_" + col)
    
# Bucket pipeline
users_buckets = os.path.join(twitter_bucket_s3, data_path, "users_buckets") #
users_buckets_part_2 = os.path.join(twitter_bucket_s3, data_path, "users_buckets_part_2") #

pipeline_kmeans_path = os.path.join(twitter_bucket_s3, object_paths, "pipeline_id_encoding")
cluster_map_path = os.path.join(twitter_bucket_s3, data_path, "cluster_map")

# Embeddings
bert_embeddings_train = os.path.join(twitter_bucket_s3, "data", "textEncodings", "tweets_extended")
submission_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "submission-tweets-extended")
test_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "test-tweets-extended")

# Topics pipeline
reduced_topics_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "reducedTopics")

In [61]:
# Training
df = parse_data(trainining_path, has_labels=True).repartition(500)
engaging_user_id_train = df.select("engaging_user_id").distinct()
engaging_user_id_train.write.csv(engaging_users_training_path)

engaging_users_train = spark.read.csv(engaging_users_training_path, 
                                      schema=StructType([StructField('engaging_user_id', StringType())]))

# Submission
df = parse_data(submission_path, has_labels=False).repartition(200)
engaging_users_submission = df.select("engaging_user_id").distinct()
engaging_users_submission.write.csv(engaging_users_submission_path)

engaging_users_submission = spark.read.csv(engaging_users_submission_path, 
                                    schema=StructType([StructField('engaging_user_id', StringType())]))

# Test
df = parse_data(test_path, has_labels=False).repartition(200)
engaging_users_test = df.select("engaging_user_id").distinct()
engaging_users_test.write.csv(engaging_users_test_path)

engaging_users_test = spark.read.csv(engaging_users_test_path, 
                                    schema=StructType([StructField('engaging_user_id', StringType())]))

In [None]:
bucket_s3.objects.filter(Prefix=f"{data_path}/engaging-users-training", Delimiter='./')

In [62]:
engaging_users_training_path

'../dataset/twitter/engaging-users-training'

In [63]:
engaging_users_train

DataFrame[engaging_user_id: string]

In [64]:
engaging_user_id_train

DataFrame[engaging_user_id: string]

In [65]:
df = parse_data(trainining_path, has_labels=True).repartition(1000)

In [47]:
df

DataFrame[text_tokens: array<string>, hashtags: array<string>, tweet_id: string, present_media: array<string>, present_links: array<string>, present_domains: array<string>, tweet_type: string, language: string, tweet_timestamp: bigint, engaged_with_user_id: string, engaged_with_user_follower_count: int, engaged_with_user_following_count: int, engaged_with_user_is_verified: boolean, engaged_with_user_account_creation: bigint, engaging_user_id: string, engaging_user_follower_count: int, engaging_user_following_count: int, engaging_user_is_verified: boolean, engaging_user_account_creation: bigint, engagee_follows_engager: boolean, reply_timestamp: bigint, retweet_timestamp: bigint, retweet_with_comment_timestamp: bigint, like_timestamp: bigint]

In [84]:
def split_final_df(df, time_holdout_fraction=0.1, space_column='engaging_user_id', 
                   time_column='tweet_timestamp', avg_rows_per_user=5.4, 
                   val_size = 500000, perc_val_not_shared = 0.27, seed=0, 
                   engaging_users_submission=None, engaging_users_test=None):
    """
    Split using all the train data.
    Exclude all engaging_users from submission and test to create the validation
    Select the validatioon to be up to 500k samples, where 23% of enagging users are not known 
    (out of space completely). The validation is going to be picked out of time, but the rest 
    of out of time is gooing ot be used in train
    """
    info_dict = {}
    min_date, max_date = df.select(F.min(time_column), F.max(time_column)).first()
    print(df.select(F.min(time_column)))
    print(type(F.max(time_column).first()))
    time_range = max_date - min_date
    time_holdout_timestamp = min_date + int(time_range*(1-time_holdout_fraction))

    #Get engaging_users on submission and test
    engaging_users_test = engaging_users_submission.union(engaging_users_test)
    engaging_users_test = engaging_users_test.select(F.col("engaging_user_id").alias("user_id_1")).distinct()

    join_users_df = engaging_users_train.join(engaging_users_test, 
                                              engaging_users_test.user_id_1==engaging_users_train.engaging_user_id, 
                                              how="left")
    join_users_df = join_users_df.withColumn("indicator_test", 
                                             F.when(F.col("user_id_1").isNotNull(), 1).otherwise(0))
    join_users_df = join_users_df.drop("user_id_1") # Training user w indicator also in test
    join_users_not_test = join_users_df.filter(F.col("indicator_test")==0)
    
    # Sample that are not in test
    df_not_test = df.join(join_users_not_test,
                          on=space_column, 
                          how="inner")
    df_not_test = df_not_test.drop("indicator_test")
    info_dict["df_not_test_count"] = df_not_test.count()

    df_not_test_intime = df_not_test.filter(F.col(time_column) <= time_holdout_timestamp)
    df_not_test_outtime = df_not_test.filter(F.col(time_column) > time_holdout_timestamp) # Set for validation
    info_dict["df_not_test_intime_count"] = df_not_test_intime.count() #
    info_dict["df_not_test_outtime_count"] = df_not_test_outtime.count() #
    engaging_user_id_intime_not_test = df_not_test_intime.select(F.col(space_column)).distinct()
    engaging_user_id_outtime_not_test = df_not_test_outtime.select(F.col(space_column)).distinct()
    info_dict["engaging_user_id_intime_not_test_count"] = engaging_user_id_intime_not_test.count() #
    info_dict["engaging_user_id_outtime_not_test_count"] = engaging_user_id_outtime_not_test.count() #
    inner_engaging_not_test = engaging_user_id_intime_not_test.join(engaging_user_id_outtime_not_test, 
                                                                    on=space_column, 
                                                                    how="inner")
    info_dict["inner_engaging_not_test_count"] = inner_engaging_not_test.count() #
    rows_per_user_outtime_not_test = info_dict["df_not_test_outtime_count"]/\
                                      info_dict["engaging_user_id_outtime_not_test_count"]
    
    engaging_user_id_only_outtime_not_test =engaging_user_id_outtime_not_test.join(inner_engaging_not_test,
                                                                                   on=space_column,
                                                                                   how="left_anti")
    info_dict["engaging_user_id_only_outtime_not_test_count"] = engaging_user_id_only_outtime_not_test.count()#
    frac_user_only_outtime = val_size/rows_per_user_outtime_not_test*(perc_val_not_shared)\
                        /info_dict["engaging_user_id_only_outtime_not_test_count"] # Validation shared 
    valid_users_not_test_outtime = engaging_user_id_only_outtime_not_test.sample(withReplacement=False,
                                                                           fraction=frac_user_only_outtime,
                                                                           seed=seed)
    valid_users_not_test_outtime = valid_users_not_test_outtime.select(F.col("engaging_user_id").alias("user_id"))

    # Select the sample from that correspnd to 27% of rows
    df_valid_unknown = df_not_test_outtime.join(valid_users_not_test_outtime, 
                                    df_not_test_outtime.engaging_user_id==valid_users_not_test_outtime.user_id,
                                    how="inner").drop("user_id")
    
    #Select the rest
    frac_user_inner = val_size/rows_per_user_outtime_not_test*(1-perc_val_not_shared)\
                        /info_dict["inner_engaging_not_test_count"] # Validation shared 
    inner_engaging_not_test = inner_engaging_not_test.select(F.col("engaging_user_id").alias("user_id"))
    valid_users_not_test_shared = inner_engaging_not_test.sample(withReplacement=False,
                                                                 fraction=frac_user_inner,
                                                                 seed=seed)
    df_valid_known = df_not_test_outtime.join(valid_users_not_test_shared,
                                    df_not_test_outtime.engaging_user_id==inner_engaging_not_test.user_id,
                                    how="inner").drop("user_id")
    df_valid = df_valid_known.union(df_valid_unknown)

    valid_samples = df_valid.select(F.col("tweet_id").alias("tweet_id_1"), 
                                    F.col("engaging_user_id").alias("engaging_user_id_1"))
    df_train = df.join(valid_samples, 
                       (df.tweet_id==valid_samples.tweet_id_1)&\
                       (df.engaging_user_id == valid_samples.engaging_user_id_1), 
                       how="left_anti")
    return df_train, df_valid, join_users_not_test, info_dict

In [85]:
df_train, df_valid, join_users_not_test, info_dict = split_final_df(df, time_holdout_fraction=0.1, 
           space_column='engaging_user_id', 
           time_column='tweet_timestamp',
           avg_rows_per_user=5.4, val_size=val_size, 
           perc_val_not_shared=0.27, seed=0,
           engaging_users_submission=engaging_users_submission, 
           engaging_users_test=engaging_users_test)
print("info_dict: ", info_dict)
train = df_train
val = df_valid
columns = columns2cast(train)
train = cast_array2string(train, columns)
val = cast_array2string(val, columns)
join_users_not_test.write.csv(users_intime_path)
train.write.csv(train_path)
val.repartition(1000).write.csv(val_path)
print("Casted columns: ",columns)

DataFrame[min(tweet_timestamp): bigint]


TypeError: 'Column' object is not callable

In [86]:
df = df.withColumn('tweetEncoded', tweet_encoded_udf(df.tweet_type))

In [88]:
df = df.withColumn('linkCount', listCounter_udf(df.present_links))

In [99]:
schema = StructType([StructField('text_tokens', StringType()),
                     StructField('hashtags', StringType()),
                     StructField('tweet_id', StringType()),
                     StructField('present_media', StringType()),
                     StructField('present_links', StringType()),
                     StructField('present_domains', StringType()),
                     StructField('tweet_type', StringType()),
                     StructField('language', StringType()),
                     StructField('tweet_timestamp', LongType()),
                     StructField('engaged_with_user_id', StringType()),
                     StructField('engaged_with_user_follower_count', IntegerType()),
                     StructField('engaged_with_user_following_count', IntegerType()),
                     StructField('engaged_with_user_is_verified', BooleanType()),
                     StructField('engaged_with_user_account_creation', LongType()),
                     StructField('engaging_user_id', StringType()),
                     StructField('engaging_user_follower_count', IntegerType()),
                     StructField('engaging_user_following_count', IntegerType()),
                     StructField('engaging_user_is_verified', BooleanType()),
                     StructField('engaging_user_account_creation', LongType()),
                     StructField('engagee_follows_engager', BooleanType()),
                     StructField('reply_timestamp', LongType()),
                     StructField('retweet_timestamp', LongType()),
                     StructField('retweet_with_comment_timestamp', LongType()),
                     StructField('like_timestamp', LongType())])
train = spark.read.csv(trainining_path, schema=schema)
schema = StructType([StructField('engaging_user_id', StringType()),
                     StructField('text_tokens', StringType()),
                     StructField('hashtags', StringType()),
                     StructField('tweet_id', StringType()),
                     StructField('present_media', StringType()),
                     StructField('present_links', StringType()),
                     StructField('present_domains', StringType()),
                     StructField('tweet_type', StringType()),
                     StructField('language', StringType()),
                     StructField('tweet_timestamp', LongType()),
                     StructField('engaged_with_user_id', StringType()),
                     StructField('engaged_with_user_follower_count', IntegerType()),
                     StructField('engaged_with_user_following_count', IntegerType()),
                     StructField('engaged_with_user_is_verified', BooleanType()),
                     StructField('engaged_with_user_account_creation', LongType()),
                     StructField('engaging_user_follower_count', IntegerType()),
                     StructField('engaging_user_following_count', IntegerType()),
                     StructField('engaging_user_is_verified', BooleanType()),
                     StructField('engaging_user_account_creation', LongType()),
                     StructField('engagee_follows_engager', BooleanType()),
                     StructField('reply_timestamp', LongType()),
                     StructField('retweet_timestamp', LongType()),
                     StructField('retweet_with_comment_timestamp', LongType()),
                     StructField('like_timestamp', LongType())])
val = spark.read.csv(trainining_path, schema=schema).repartition(1000)
join_users_not_test = spark.read.csv(trainining_path, 
                              schema=StructType([StructField('engaging_user_id', StringType())]))
columns = ["text_tokens", "hashtags", "present_media", "present_links", "present_domains"]
train = cast_string2array(train, columns)
val = cast_string2array(val, columns)

In [100]:
train_path

'../dataset/twitter/final-data/train-final-complete'

In [101]:
df = df.withColumn('indicator_reply',F.when(F.col('reply_timestamp').isNotNull(), 1).otherwise(0))
df = df.withColumn('indicator_retweet',F.when(F.col('retweet_timestamp').isNotNull(), 1).otherwise(0))
df = df.withColumn('indicator_retweet_with_comment',
                   F.when(F.col('retweet_with_comment_timestamp').isNotNull(),1).otherwise(0))
df = df.withColumn('indicator_like', F.when(F.col('like_timestamp').isNotNull(),1).otherwise(0))
df = df.withColumn('indicator_interaction', 
                   F.when(F.col('indicator_reply')+\
                          F.col('indicator_retweet')+\
                          F.col('indicator_retweet_with_comment')+\
                          F.col('indicator_like')>0, 1)\
                   .otherwise(0))

In [102]:
intention_df = df.select("engaging_user_id", "indicator_reply", "indicator_retweet", 
                                    "indicator_retweet_with_comment", "indicator_like", "indicator_interaction")\
                        .groupBy("engaging_user_id").agg(F.sum(F.col("indicator_interaction")).alias("n_interactions"), 
                                                         F.sum(F.col("indicator_retweet_with_comment"))\
                                                         .alias("n_commented"),
                                                         F.sum(F.col("indicator_like")).alias("n_liked"),
                                                         F.sum(F.col("indicator_reply")).alias("n_replied"),
                                                         F.sum(F.col("indicator_retweet")).alias("n_retweeted"),
                                                         F.count(F.col("indicator_interaction"))\
                                                         .alias("total_appearance"))
columns = ['n_interactions', 'n_commented', 'n_liked', 'n_replied', 'n_retweeted']
for col_i in columns:
    intention_df = intention_df.withColumn("perc_" + col_i, F.col(col_i)/(F.col("total_appearance")))
intention_df = intention_df.drop(*columns)
join_users_not_test = join_users_not_test.select(F.col("engaging_user_id").alias("drop_users"))
join_users_not_test = join_users_not_test.sample(withReplacement=False,
                                                 fraction=0.15,
                                                 seed=42)
intention_df = intention_df.join(join_users_not_test, 
                                 intention_df.engaging_user_id==join_users_not_test.drop_users, 
                                 how="left_anti").drop("drop_users")
intention_df.repartition(1000).write.csv(intentions_path)

In [104]:
mapping_hashtag_encode, mapping_hashtag_count = mappings(df, "hashtags", top_k_hashtags)
# Saving pkl
save_pkl_to_s3(mapping_hashtag_encode, key_hashtag_mapping, bucket)
save_pkl_to_s3(mapping_hashtag_count, key_hashtag_count, bucket)

NoCredentialsError: Unable to locate credentials

In [105]:
mapping_hashtag_encode

{}

In [107]:
mappings(df, "hashtags", top_k_hashtags)

({}, {})

In [108]:
top_k_hashtags

13000