In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import glob
import os
import pandas as pd
import numpy as np
import gc
import datetime
import hashlib
#import emoji
import re
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from transformers import BertTokenizer

In [7]:
# Collect every path matching part* (raw shards plus their .lzo / .lzo.index companions), ordered by name.
files = glob.glob('/raid/recsys/part*')
files.sort()
len(files), files[:5]

(695,
 ['/raid/recsys/part-00000',
  '/raid/recsys/part-00000.lzo',
  '/raid/recsys/part-00000.lzo.index',
  '/raid/recsys/part-00001',
  '/raid/recsys/part-00001.lzo'])

In [8]:
# Keep only paths shorter than 50 characters that do not contain 'lzo'
# (this drops the .lzo and .lzo.index companions of each shard).
trainfiles = [path for path in files if len(path) < 50 and 'lzo' not in path]
len(trainfiles), trainfiles[0]

(232, '/raid/recsys/part-00000')

In [9]:
# Show the first five selected shard paths as a sanity check of the filter above.
trainfiles[:5]

['/raid/recsys/part-00000',
 '/raid/recsys/part-00001',
 '/raid/recsys/part-00002',
 '/raid/recsys/part-00003',
 '/raid/recsys/part-00004']

In [10]:
# Column names, in file order, for the header-less raw shards; extract_feature
# assigns this list to df.columns. The last four entries are the engagement targets.
all_features = [
    'text_tokens',    ###############
    'hashtags',       #Tweet Features
    'tweet_id',       #
    'media',          #
    'links',          #
    'domains',        #
    'tweet_type',     #
    'language',       #
    'timestamp',      ###############
    'a_user_id',              ###########################
    'a_follower_count',       #Engaged With User Features
    'a_following_count',      #
    'a_is_verified',          #
    'a_account_creation',     ###########################
    'b_user_id',              #######################
    'b_follower_count',       #Engaging User Features
    'b_following_count',      #
    'b_is_verified',          #
    'b_account_creation',     #######################
    'b_follows_a',    #################### Engagement Features
    'reply',          #Target Reply
    'retweet',        #Target Retweet
    'retweet_comment',#Target Retweet with comment
    'like',           #Target Like
                      ####################
]

In [11]:
# Integer code for the 'media' column. extract_feature looks a value up by its
# first 9 characters (x[:9]), which is why multi-item keys appear truncated
# (e.g. 'Photo\tPho'). A prefix that is not listed here raises KeyError there.
MAP_MEDIA = {
 '': 0,
 'Photo': 1,
 'Photo\tPho': 2,
 'Video': 3,
 'GIF': 4,
 'Video\tVid': 5,
 'Photo\tVid': 6,
 'Video\tPho': 7,
 'GIF\tPhoto': 8,
 'Photo\tGIF': 9,
 'GIF\tGIF': 10,
 'GIF\tVideo': 11,
 'Video\tGIF': 12,
 'GIF\tGIF\tG': 13
}

# Integer code for the 'tweet_type' column (stored as int8 by extract_feature).
MAP_TYPE = {'TopLevel': 0, 'Retweet': 1, 'Quote': 2}

# Integer code for the 'language' column: each key is a 32-character hex
# identifier as it appears in the raw data, each value a small integer that
# extract_feature stores as int8. An identifier missing from this table raises
# KeyError in extract_feature.
MAP_LANG = {
 '488B32D24BD4BB44172EB981C1BCA6FA': 0,
 'E7F038DE3EAD397AEC9193686C911677': 1,
 'B0FA488F2911701DD8EC5B1EA5E322D8': 2,
 'B8B04128918BBF54E2E178BFF1ABA833': 3,
 '313ECD3A1E5BB07406E4249475C2D6D6': 4,
 '1F73BB863A39DB62B4A55B7E558DB1E8': 5,
 '9FCF19233EAD65EA6E32C2E6DC03A444': 6,
 '9A78FC330083E72BE0DD1EA92656F3B5': 7,
 '8729EBF694C3DAF61208A209C2A542C8': 8,
 'E6936751CBF4F921F7DE1AEF33A16ED0': 9,
 '7F4FAB1EB12CD95EDCD9DB2A6634EFCE': 10,
 'B4DC2F82961F1263E90DF7A942CCE0B2': 11,
 '310ECD7D1E42216E3C1B31EFDDFC72A7': 12,
 '5A0759FB938B1D9B1E08B7A3A14F1042': 13,
 '2F548E5BE0D7F678E72DDE31DFBEF8E7': 14,
 '5B6973BEB05212E396F3F2DC6A31B71C': 15,
 '2573A3CF633EBE6932A1E1010D5CD213': 16,
 'DA13A5C3763C212D9D68FC69102DE5E5': 17,
 '00304D7356D6C64481190D708D8F739C': 18,
 '7D11A7AA105DAB4D6799AF863369DB9C': 19,
 '23686A079CA538645BF6118A1EF51C8B': 20,
 'A5CFB818D79497B482B7225887DBD3AD': 21,
 '838A92D9F7EB57FB4A8B0C953A80C7EB': 22,
 '99CA116BF6AA65D70F3C78BEBADC51F0': 23,
 'D922D8FEA3EFAD3200455120B75BCEB8': 24,
 '159541FA269CA8A9CDB93658CAEC4CA2': 25,
 'E84BE2C963852FB065EE827F41A0A304': 26,
 '6B90065EA806B8523C0A6E56D7A961B2': 27,
 '4B55C45CD308068E4D0913DEF1043AD6': 28,
 'BAC6A3C2E18C26A77C99B41ECE1C738D': 29,
 '4CA37504EF8BA4352B03DCBA50E98A45': 30,
 '3228B1FB4BC92E81EF2FE35BDA86C540': 31,
 'D7C16BC3C9A5A633D6A3043A567C95A6': 32,
 '477ED2ED930405BF1DBF13F9BF973434': 33,
 '41776FB50B812A6775C2F8DEC92A9779': 34,
 'C1E99BF67DDA2227007DE8038FE32470': 35,
 'F70598172AC4514B1E6818EA361AD580': 36,
 '6744F8519308FD72D8C47BD45186303C': 37,
 '10C6C994C2AD434F9D49D4BE9CFBC613': 38,
 '89CE0912454AFE0A1B959569C37A5B8F': 39,
 '105008E45831ADE8AF1DB888319F422A': 40,
 'DE8A3755FCEDC549A408D7B1EB1A2C9F': 41,
 'BF04E736C599E9DE22F39F1DC157E1F1': 42,
 'CF304ED3CFC1ADD26720B97B39900FFD': 43,
 '59BE899EB83AAA19878738040F6828F0': 44,
 '3DF931B225B690508A63FD24133FA0E2': 45,
 '3AB05D6A4045A6C37D3E4566CFDFFE26': 46,
 '678E280656F6A0C0C23D5DFD46B85C14': 47,
 '440116720BC3A7957E216A77EE5C18CF': 48,
 'A3E4360031A7E05E9279F4D504EE18DD': 49,
 'C41F6D723AB5D14716D856DF9C000DED': 50,
 '7E18F69967284BB0601E88A114B8F7A9': 51,
 'F9D8F1DB5A398E1225A2C42E34A51DF6': 52,
 '914074E75CB398B5A2D81E1A51818CAA': 53,
 '5B210378BE9FFA3C90818C43B29B466B': 54,
 'F33767F7D7080003F403FDAB34FEB755': 55,
 'DC5C9FB3F0B3B740BAEE4F6049C2C7F1': 56,
 '3EA57373381A56822CBBC736169D0145': 57,
 '37342508F52BF4B62CCE3BA25460F9EB': 58,
 '7168CE9B777B76E4069A538DC5F28B6F': 59,
 '0BB2C843174730BA7D958C98B763A797': 60,
 'CDE47D81F953D800F760F1DE8AA754BA': 61,
 '9D831A0F3603A54732CCBDBF291D17B7': 62,
 '5F152815982885A996841493F2757D91': 63,
 '82C9890E4A7FC1F8730A3443C761143E': 64,
 '8C64085F46CD49FA5C80E72A35845185': 65}

In [12]:
def hashit(x):
    """Map a string to an integer in ``[0, 2**32)`` derived from its MD5 digest.

    Any string of two characters or fewer is hashed as the literal ``'0'``,
    so all such inputs share one hash value.
    """
    token = x if len(x) > 2 else '0'
    digest = hashlib.md5(token.encode('utf-8')).digest()
    # The last four digest bytes, read big-endian, are exactly
    # int(hexdigest, 16) % 2**32.
    return int.from_bytes(digest[-4:], 'big')


def extract_hash(text, split_text='@', no=0):
    """Hash the word that follows the ``no``-th occurrence of ``split_text``.

    The text is lower-cased and split on ``split_text``; the first
    space-separated word of segment ``no + 1`` is taken (with one trailing
    punctuation mark removed by ``clean_text``). If that word is followed by a
    standalone ``'_'`` word, the underscore and the word after it are appended,
    repeatedly, so names broken up around underscores are re-joined.

    Parameters
    ----------
    text : str
        Text to search.
    split_text : str, default '@'
        Marker that precedes the word of interest. Previously this argument
        was ignored and '@' was always used; it is now honored.
    no : int, default 0
        Zero-based index of the marker occurrence to use.

    Returns
    -------
    int
        ``hashit`` of the extracted word; ``hashit('')`` when the text has
        fewer than ``no + 1`` occurrences of the marker.
    """
    text = text.lower()
    uhash = ''
    segments = text.split(split_text)
    if len(segments) > (no + 1):
        words = segments[no + 1].split(' ')
        uhash += clean_text(words[0])
        # Only continue while both the '_' word and a word after it exist;
        # a trailing lone '_' used to raise IndexError on words[2].
        while len(words) > 2 and words[1] == '_':
            uhash += clean_text(words[1]) + clean_text(words[2])
            words = words[2:]

    return hashit(uhash)

def clean_text(text):
    """Remove one trailing punctuation mark (``! ? : ; . ,``) from ``text``.

    Only a single character is removed, and strings of length 0 or 1 are
    returned unchanged.
    """
    if len(text) > 1 and text.endswith(('!', '?', ':', ';', '.', ',')):
        return text[:-1]
    return text

def ret_word( x, rw=0 ):
    """Return the hash of the ``rw``-th space-separated word of ``x``.

    Parameters
    ----------
    x : str
        Text to split on single spaces.
    rw : int, default 0
        Word index; negative values count from the end, as in list indexing.

    Returns
    -------
    int
        ``hashit`` of the selected word, or 0 when ``rw`` is outside the
        range of available words (in either direction).
    """
    words = x.split(' ')

    # A single bounds check covers both signs. The original had an unreachable
    # `rw < 0` branch and raised IndexError for negative indexes beyond the
    # word count.
    if -len(words) <= rw < len(words):
        return hashit(words[rw])
    return 0
    
def extract_rt(x_org):
    """Return the part of ``x_org`` before the first 'http'.

    Every '[sep]' is removed first, and trailing whitespace is stripped from
    the result.
    """
    without_sep = x_org.replace('[sep]', '')
    head = without_sep.split('http', 1)[0]
    return head.rstrip()

def check_last_char_quest(x_org):
    """Classify how the text ends with respect to '?' and '!'.

    Before the check, '[sep]' is removed, everything from the first 'http' on
    is dropped, and for each of '#' and '@' (in that order) the text is
    rebuilt as: the part before the first marker, plus the words after the
    first word of the segment that follows it. Trailing whitespace is then
    stripped.

    Returns
    -------
    int
        0 -- empty input, fewer than two characters left, or any other ending
        1 -- ends with '?' not preceded by '!'
        2 -- ends with '!?'
        3 -- ends with '?!'
        4 -- ends with '!' not preceded by '?'
    """
    if len(x_org) < 1:
        return 0

    body = x_org.replace('[sep]', '').split('http')[0]
    for marker in ('#', '@'):
        if marker in body:
            pieces = body.split(marker)
            body = pieces[0] + ' '.join(pieces[1].split(' ')[1:])
    body = body.rstrip()

    if len(body) < 2:
        return 0

    last, before_last = body[-1], body[-2]
    if last == '?':
        return 2 if before_last == '!' else 1
    if last == '!':
        return 3 if before_last == '?' else 4
    return 0

In [13]:
# Create the output directory for the processed shards. Unlike a bare `mkdir`,
# this does nothing (and reports no error) when the directory already exists,
# so the cell can be re-run.
os.makedirs('/raid/recsys/train_proc3', exist_ok=True)

def extract_feature(fn):
    """Convert one raw shard into a numeric feature table and save it as parquet.

    Reads ``fn`` (header-less, fields separated by the 0x01 byte), decodes the
    BERT token ids of each tweet back into text, derives text / media / time /
    account features, casts the columns to small integer dtypes and writes the
    result to ``/raid/recsys/train_proc3/<basename of fn>.parquet``.

    Parameters
    ----------
    fn : str
        Path of the raw shard. A shard without the trailing target columns is
        accepted: only as many names from ``all_features`` as there are
        columns are assigned, and the target clean-up is skipped when the
        'like' column is absent.

    Returns
    -------
    None
        The function is called for its side effect (the parquet file).
    """
    df = pd.read_csv(fn, sep='\x01', header=None)
    # Name only the columns that are present. Assigning the full list failed
    # with a length mismatch on shards that lack the target columns, which made
    # the "test set" branch below unreachable.
    df.columns = all_features[:df.shape[1]]

    #Only run in trainset and not in test
    if 'like' in df.columns: # does this file contain the targets?
        # Missing value -> 0, any positive value -> 1, stored as int8.
        for target in ['reply', 'retweet', 'retweet_comment', 'like']:
            df[target] = df[target].fillna(0)
            df.loc[df[target]>0, target] = 1
            df[target] = df[target].astype(np.int8)

    ###########################
    #Tweet token processing####
    ###########################
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    val = df['text_tokens'].values.copy()
    for n,v in enumerate(val):
        val[n] = tokenizer.decode(v.split('\t'))
    df['text'] = val

    ##########################################################################################
    # Counts taken on the decoded text before it is lower-cased / normalized.
    df['tw_len_media'] = df['media'].apply(lambda x: str(x).count('\t')+1 if not(pd.isnull(x)) else 0).astype(np.int8)
    df['tw_len_photo'] = df['media'].apply(lambda x: str(x).count('Photo') if not(pd.isnull(x)) else 0).astype(np.int8)
    df['tw_len_video'] = df['media'].apply(lambda x: str(x).count('Video') if not(pd.isnull(x)) else 0).astype(np.int8)
    df['tw_len_gif'] = df['media'].apply(lambda x: str(x).count('GIF') if not(pd.isnull(x)) else 0).astype(np.int8)
    df['tw_len_quest'] = df['text'].apply(lambda x: str(x).count('?')).astype(np.int8)
    df['tw_len_token'] = df['text_tokens'].apply(lambda x: str(x).count('\t')).astype(np.int16)
    df['tw_count_capital_words'] = df['text'].apply(lambda x: len(re.findall(r'\b[A-Z]{2,}\b', x)) ).astype(np.int16)
    df['tw_count_excl_quest_marks'] = df['text'].apply(lambda x: len(re.findall(r'!|\?', x)) ).astype(np.int16)
    df['tw_count_special1'] = df['text'].str.count('¶').astype(np.int16)
    df['tw_count_hash'] = df['text'].str.count('#').astype(np.int16)
    # NOTE(review): check_last_char_quest removes a lower-case '[sep]' and is
    # applied here before the text is lower-cased; confirm the marker is really
    # stripped at this point, otherwise this feature looks at the marker rather
    # than at the tweet's last character.
    df['tw_last_quest'] = df['text'].apply(lambda x: check_last_char_quest(x) ).astype(np.int8)

    # Normalize the text: lower-case, collapse tokenized t.co links to 'http',
    # drop the last 5 characters, and re-attach '_', '@' and '#' to their words.
    df['text'] = df['text'].apply(lambda x: x.lower() )
    df['text'] = df['text'].apply( lambda x: x.replace('http : / / t. co / ', 'http') )
    df['text'] = df['text'].apply( lambda x: x.replace('https : / / t. co / ', 'http') )
    df['text'] = df['text'].apply(lambda x: x[0:-5] )
    df['text'] = df['text'].apply( lambda x: x.replace(' _ ', '_') )
    df['text'] = df['text'].apply( lambda x: x.replace('@ ', '@') )
    df['text'] = df['text'].apply( lambda x: x.replace('# ', '#') )

    df['tw_len_retweet'] = df['text'].apply(lambda x: str(x).count('retweet')).astype(np.int8)
    df['tw_isrt'] = (df['tweet_type']=='Retweet').astype(np.int8)
    # Literal replacement with regex=False: the previous escaped-regex pattern
    # only matched while pandas defaulted to regex=True, and its non-raw
    # backslashes are invalid string escapes.
    df['text'] = df['text'].str.replace('[cls] rt @', '@', regex=False)
    df['text'] = df['text'].str.replace('[cls] ', ' ', regex=False)
    df['tw_len_rt'] = df['text'].apply(lambda x: str(extract_rt(x)).count(' rt ')).astype(np.int8)

    # Split retweet text and original text
    df['rt_text'] = df.apply(lambda x: '' if x['tw_isrt']==0 else x['text'].split(':')[0], axis=1)
    df['text'] = df.apply(lambda x: x['text'] if x['tw_isrt']==0 else ':'.join(x['text'].split(':')[1:]) , axis=1)

    df['tw_count_at'] = df['text'].str.count('@').astype(np.int16)
    df['text'] = df['text'].apply( lambda x: x.replace('¶ ', ' ') )

    df['rt_text'] = df['rt_text'].apply( lambda x: x.replace('¶ ', ' ') )
    df['text'] = df['text'].apply(lambda x: x.strip())

    df['rt_text'] = df['rt_text'].apply(lambda x: x.strip())
    # Collapse any run of whitespace to a single space.
    df['text'] = df['text'].apply( lambda x: " ".join(x.split()) )

    df['rt_text'] = df['rt_text'].apply( lambda x: " ".join(x.split()) )

    # Note: the *_count_words columns count spaces, i.e. one less than the
    # number of words for non-empty text.
    df['tw_count_words'] = df['text'].str.count(' ').astype(np.int16)
    df['tw_count_char']  = df['text'].apply(lambda x: len(x)).astype(np.int16)
    df['tw_rt_count_words'] = df['rt_text'].str.count(' ').astype(np.int16)
    df['tw_rt_count_char']  = df['rt_text'].apply(lambda x: len(x)).astype(np.int16)
    df['tw_original_user0'] = df['text'].apply(lambda x: extract_hash(x, no=0)   )
    df['tw_original_user1'] = df['text'].apply(lambda x: extract_hash(x, no=1)   )
    df['tw_original_user2'] = df['text'].apply(lambda x: extract_hash(x, no=2)   )
    df['tw_rt_user0'] = df['rt_text'].apply(lambda x: extract_hash(x, no=0)   )
    df['tw_original_http0'] = df['text'].apply(lambda x: extract_hash(x, split_text='http', no=0)   )

    # hashit yields values in [0, 2**32); the int32 cast wraps the upper half
    # to negative numbers.
    df['tw_word0'] = df['text'].apply(lambda x: ret_word(x,0)).astype(np.int32)
    df['tw_word1'] = df['text'].apply(lambda x: ret_word(x,1)).astype(np.int32)
    df['tw_word2'] = df['text'].apply(lambda x: ret_word(x,2)).astype(np.int32)
    df['tw_word3'] = df['text'].apply(lambda x: ret_word(x,3)).astype(np.int32)
    df['tw_word4'] = df['text'].apply(lambda x: ret_word(x,4)).astype(np.int32)
    df['tw_tweet'] = df['text'].apply(lambda x: hashit(x) ).astype(np.int32)
    ##########################################################################################


    ##########################################################################################
    # 'group' = number of follower-count thresholds reached by user a (0..4).
    df['group'] = 0
    df['group'] = df['group'] + 1*(df['a_follower_count']>=222)
    df['group'] = df['group'] + 1*(df['a_follower_count']>=578)
    df['group'] = df['group'] + 1*(df['a_follower_count']>=1225)
    df['group'] = df['group'] + 1*(df['a_follower_count']>=3689)
    df['group'] = df['group'].astype(np.int8)

    # Calendar features from the epoch-seconds timestamp; dt_minute is the
    # minute of the day (hour * 60 + minute).
    df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    df['dt_day']  = df['date'].dt.day.astype(np.int8)
    df['dt_dow']  = df['date'].dt.weekday.astype(np.int8)
    df['dt_minute'] = df['date'].dt.hour.astype(np.int16) * 60 + df['date'].dt.minute.astype(np.int16)
    del df['date']

    # `x==x` is False only for NaN, i.e. it tests "value present".
    df['len_hashtags'] = df['hashtags'].apply( lambda x:  len(x.split('\t')) if x==x else 0 ).astype(np.int16)
    df['len_links'] = df['links'].apply( lambda x: len(x.split('\t')) if x==x else 0 ).astype(np.int16)
    df['len_domains'] = df['domains'].apply( lambda x: len(x.split('\t')) if x==x else 0 ).astype(np.int16)

    # Keep only the first tab-separated id of each list, reduced to its low 32 bits.
    df['hashtags'] = df['hashtags'].apply( lambda x:  int(x.split('\t')[0],16)%2**32 if x==x else 0 ).astype(np.int32)
    df['links'] = df['links'].apply( lambda x: int(x.split('\t')[0],16)%2**32 if x==x else 0 ).astype(np.int32)
    df['domains'] = df['domains'].apply( lambda x: int(x.split('\t')[0],16)%2**32 if x==x else 0 ).astype(np.int32)

    df['media'] = df['media'].apply(lambda x: MAP_MEDIA[x[:9]] if x==x else 0).astype(np.int8)
    df['tweet_type'] = df['tweet_type'].apply(lambda x: MAP_TYPE[x] if x==x else 0).astype(np.int8)
    df['language'] = df['language'].apply(lambda x: MAP_LANG[x] if x==x else 0).astype(np.int8)

    df['timestamp'] = df['timestamp'].astype(np.uint32)

    # NOTE(review): the scaling range 1138308613..1139000000 spans only
    # 691387 seconds (about 8 days), so later creation times scale far outside
    # the int8 range before the cast below; confirm the upper bound is the
    # intended one.
    df.loc[ df.a_account_creation<0 ,'a_account_creation'] = 1138308613
    df['a_account_creation'] = 240*(df['a_account_creation'] - 1138308613)/(1139000000 - 1138308613) - 127
    df['a_account_creation'] = df['a_account_creation'].astype(np.int8)

    df.loc[ df.b_account_creation<0 ,'b_account_creation'] = 1138308613
    df['b_account_creation'] = 240*(df['b_account_creation'] - 1138308613)/(1139000000 - 1138308613) - 127
    df['b_account_creation'] = df['b_account_creation'].astype(np.int8)

    df['a_follower_count'] = df['a_follower_count'].astype(np.int32)
    df['a_following_count'] = df['a_following_count'].astype(np.int32)
    df['b_follower_count'] = df['b_follower_count'].astype(np.int32)
    df['b_following_count'] = df['b_following_count'].astype(np.int32)

    df['a_is_verified'] = df['a_is_verified'].astype(np.int8)
    df['b_is_verified'] = df['b_is_verified'].astype(np.int8)
    df['b_follows_a'] = df['b_follows_a'].astype(np.int8)

    # Ids are hex strings; keep the last 16 hex digits (64 bits) as an integer.
    df['tweet_id'] = df['tweet_id'].apply(lambda x: int(x[-16:],16) ).astype(np.int64)
    df['a_user_id'] = df['a_user_id'].apply(lambda x: int(x[-16:],16) ).astype(np.int64)
    df['b_user_id'] = df['b_user_id'].apply(lambda x: int(x[-16:],16) ).astype(np.int64)
    ##########################################################################################


#     del df['text_tokens']#Comment if you want to write strings to disk
    del df['rt_text']    #Comment if you want to write strings to disk
    del df['tw_isrt']

    df.to_parquet( '/raid/recsys/train_proc3/' + fn.split('/')[-1] + '.parquet'  )

    return


In [14]:
# Process the shards in 18 worker processes; each call writes its own parquet
# file and returns None, so `res` ends up as a list of None.
res = Parallel(n_jobs=18, backend='multiprocessing')(
    delayed(extract_feature)(shard) for shard in tqdm(trainfiles)
)
gc.collect()

  0%|          | 0/232 [00:00<?, ?it/s]



91

In [15]:
# Visible marker that the parallel preprocessing cell above has finished.
print('Done!!!')

Done!!!


In [11]:
# NOTE(review): this lists /raid/kaggle/2021/recsys/input/train_proc3, whereas
# extract_feature writes to /raid/recsys/train_proc3 -- confirm which directory
# is the intended one before relying on this listing.
!ls -l /raid/kaggle/2021/recsys/input/train_proc3/*

-rw-rw-r-- 1 giba giba 444658237 mai 17 17:48 /raid/kaggle/2021/recsys/input/train_proc3/part-00223.parquet
-rw-rw-r-- 1 giba giba 316775265 mai 17 17:45 /raid/kaggle/2021/recsys/input/train_proc3/part-00238.parquet


In [12]:
# Rebinds `files` (earlier: the raw shard paths) to the processed parquet files.
# glob.glob returns paths in arbitrary order, so files[0] is not necessarily
# the lowest-numbered shard.
# NOTE(review): this directory differs from the one extract_feature writes to
# (/raid/recsys/train_proc3) -- confirm.
files = glob.glob('/raid/kaggle/2021/recsys/input/train_proc3/*')
len(files), files[0]

(2, '/raid/kaggle/2021/recsys/input/train_proc3/part-00223.parquet')

In [13]:
# Load one processed shard for inspection and show its (rows, columns) shape.
df = pd.read_parquet(files[0])
df.shape

(2423164, 62)

In [14]:
# Preview the first rows of the processed shard.
df.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,...,tw_word3,tw_word4,tw_tweet,group,dt_day,dt_dow,dt_minute,len_hashtags,len_links,len_domains
0,0,-7478896324428040558,1,0,0,2,1,1613289038,4617197474967167510,115,...,-108567334,-198969456,-50886855,0,14,6,470,0,0,0
1,0,-1848879083273022537,1,0,0,0,1,1613384304,-2213252647855724748,20414,...,-108567334,-108567334,-352821537,4,15,0,618,0,0,0
2,-1296565368,3946890577877323301,0,0,0,2,0,1614123198,5175105526563620283,10525,...,334126495,1623923521,1339766166,4,23,1,1413,1,0,0
3,0,-8098814074633273649,3,408568495,-1381235230,0,1,1613469158,5097682272736924186,20168,...,-108567334,-108567334,-153187484,4,16,1,592,0,1,1
4,0,8916107366268847947,0,1408315546,166479900,0,0,1613162448,4085168375921872338,3484,...,-341144402,-108567334,-1431441349,3,12,4,1240,0,1,1


In [15]:
# Check that the columns carry the compact integer dtypes set in extract_feature.
df.dtypes

hashtags        int32
tweet_id        int64
media            int8
links           int32
domains         int32
                ...  
dt_dow           int8
dt_minute       int16
len_hashtags    int16
len_links       int16
len_domains     int16
Length: 62, dtype: object

In [16]:
# Row count per day of month (dt_day is derived from the tweet timestamp).
df['dt_day'].value_counts()

4     122444
16    122215
22    120854
9     118692
17    117802
5     116986
23    116889
18    116447
15    116408
11    115878
10    115861
21    115662
14    115064
12    114654
19    114318
8     113881
7     112367
24    110578
6     110041
20    109556
13    106567
Name: dt_day, dtype: int64

In [17]:
# List every column stored in the processed shard.
df.columns

Index(['hashtags', 'tweet_id', 'media', 'links', 'domains', 'tweet_type',
       'language', 'timestamp', 'a_user_id', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'a_account_creation', 'b_user_id',
       'b_follower_count', 'b_following_count', 'b_is_verified',
       'b_account_creation', 'b_follows_a', 'reply', 'retweet',
       'retweet_comment', 'like', 'text', 'tw_len_media', 'tw_len_photo',
       'tw_len_video', 'tw_len_gif', 'tw_len_quest', 'tw_len_retweet',
       'tw_len_token', 'tw_len_rt', 'tw_count_capital_words',
       'tw_count_excl_quest_marks', 'tw_count_special1', 'tw_count_at',
       'tw_count_hash', 'tw_last_quest', 'tw_isrt', 'tw_count_words',
       'tw_count_char', 'tw_rt_count_words', 'tw_rt_count_char',
       'tw_original_user0', 'tw_original_user1', 'tw_original_user2',
       'tw_rt_user0', 'tw_rt_user1', 'tw_original_http0', 'tw_word0',
       'tw_word1', 'tw_word2', 'tw_word3', 'tw_word4', 'tw_tweet', 'group',
       'dt_day', 'dt_d

In [32]:
# Like rate and row count by retweet flag.
# NOTE(review): extract_feature deletes 'tw_isrt' before saving, so this cell
# only works on a parquet file that still carries that column -- confirm which
# version of the output is being inspected.
df.groupby('tw_isrt')['like'].agg(['mean','count'])

Unnamed: 0_level_0,mean,count
tw_isrt,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.469561,1631174
1,0.247206,791990


In [34]:
# Like rate and row count by tweet type code and retweet flag.
# NOTE(review): relies on the 'tw_isrt' column, which extract_feature removes
# before writing the parquet file.
df.groupby(['tweet_type','tw_isrt'])['like'].agg(['mean','count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
tweet_type,tw_isrt,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0.484667,1434083
0,1,0.383394,819
1,1,0.247062,791092
2,0,0.359641,197091
2,1,0.278481,79


In [None]:
# train 
# dt_day: 4,5,6,7,8,9,10,11,12,13,14,15,16,17

# valid 
# dt_day: 18,19,20,21,22,23,24