# Load unique tweet tokens from file 
# Remove mentions and hashtags from tweets

### Save in another file the number of mentions for that tweet and the mentions list (same for hashtags)

In [190]:
import time
from TokenizerWrapper import TokenizerWrapper
from TokenizerWrapper import special_tokens
import numpy as np

### Constants

In [191]:
# Row cap intended for quick test runs. Its only visible use is the
# commented-out `row < N_ROWS` condition on the main processing loop.
N_ROWS = 1000

### Paths

In [3]:
# Column names written as the header row of the cleaned-tweets CSV.
TWEET_ID = "tweet_features_tweet_id"
TWEET_TOKENS = "tweet_features_text_tokens"

# Input CSV, read line by line as "<tweet id>,<tab-separated tokens>".
# The trailing comment keeps an alternative input path.
TWEET_TOKENS_FILE = "tweet_tokens/text_tokens_all_no_escaped_chars.csv"  #"tweet_tokens/tweet_text_longer_than_280_no_escaped_chars.csv"

# Output CSVs: cleaned tweets, extracted mentions, extracted hashtags.
RESULT_PATH = "tweet_tokens/text_tokens_clean_2.csv"
MENTIONS_PATH = "tweet_tokens/mentions/mentions.csv"
HASHTAGS_PATH = "tweet_tokens/hashtags/hashtags.csv"

### Functions to extract mentions and hashtags from the tweet

In [4]:
def get_RT_mentions(tokens, mentions):
    """Strip the leading retweet prefix and record the retweeted mention.

    Intended for retweets, whose expected layout is
    [CLS, 'RT', '@', <handle tokens...>, ':', <tweet text...>].

    Parameters:
        tokens: list of tokens of the tweet (presumably token-id strings,
            since the CLS token is re-inserted as the string '101').
        mentions: list that the extracted mention (a list of tokens starting
            at index 2) is appended to; it is modified in place.

    Returns:
        (remaining_tokens, mentions): the tokens that follow the ':' with
        '101' (CLS) put back in front, and the same `mentions` list.

    If no ':' token is found, the scan stops at the last index: the mention
    is then tokens[2:-1] and the remaining text is just ['101'].
    """
    last = len(tokens) - 1

    # Start after CLS (index 0) and the 'RT' token (index 1).
    end = 2
    # Check the bound before indexing so that a list with fewer than three
    # tokens cannot raise IndexError.
    while end < last and tokens[end] != special_tokens[':']:
        end += 1

    mentions.append(tokens[2:end])

    # Drop everything up to and including position `end`, then restore CLS.
    remaining = ['101'] + tokens[end + 1:]

    return remaining, mentions

In [5]:
def get_mentions(tokens, mentions):
    """Collect the mentions found in `tokens` without removing them.

    A mention starts at an '@' token and always includes the token that
    follows it. After that it keeps growing while the current token decodes
    to a string containing '##', decodes to '_', or comes right after a token
    that decodes to '_'. The first token that matches none of these closes
    the mention.

    The closing token is checked again as a possible '@', so back-to-back
    mentions ("@a @b") are both collected. This matches what
    get_remove_mentions does.

    Parameters:
        tokens: list of tokens of the tweet; returned unchanged.
        mentions: list that each mention (a slice of `tokens` starting at its
            '@') is appended to; it is modified in place.

    Returns:
        (tokens, mentions)

    A mention that is still open when the list ends is not appended.
    """
    in_mention = False
    start = 0

    for i, t in enumerate(tokens):

        if in_mention:
            if i == start + 1:
                # The token right after '@' always belongs to the mention.
                continue

            decoded_t = tok.convert_tokens_to_strings([t])[0]
            if (
                '##' in decoded_t
                or decoded_t == '_'
                or tok.convert_tokens_to_strings([tokens[i - 1]])[0] == '_'
            ):
                continue

            mentions.append(tokens[start:i])
            in_mention = False
            # Fall through: the token that closed the mention may itself be
            # the '@' of the next one.

        if t == special_tokens['@']:
            start = i
            in_mention = True

    return tokens, mentions


In [6]:
def get_remove_mentions(tokens, mentions):
    """Extract the mentions from `tokens` and return the tokens without them.

    A mention starts at an '@' token and always includes the next token. It
    then absorbs every token that decodes to a string containing '##',
    decodes to '_', or directly follows a token decoding to '_'. The first
    token matching none of these closes the mention and is itself examined
    as a regular token (it may start a new mention).

    Parameters:
        tokens: list of tokens of the tweet.
        mentions: list that each extracted mention (a slice of `tokens`
            starting at its '@') is appended to; it is modified in place.

    Returns:
        (kept_tokens, mentions): the tokens that are not part of any mention,
        obtained by boolean-masking a NumPy array built from `tokens`, and
        the same `mentions` list.

    A mention still open when the list ends is masked out of the result but
    is not appended to `mentions`.
    """
    keep = []        # one boolean per token: True = stays in the tweet
    start = 0        # index of the '@' of the mention being scanned
    inside = False   # True while scanning a mention

    for pos, token in enumerate(tokens):

        if inside:
            if pos == start + 1:
                # The token right after '@' is always part of the mention.
                absorbed = True
            else:
                piece = tok.convert_tokens_to_strings([token])[0]
                absorbed = (
                    '##' in piece
                    or piece == '_'
                    or tok.convert_tokens_to_strings([tokens[pos - 1]])[0] == '_'
                )

            if absorbed:
                keep.append(False)
                continue

            # Current token closes the mention; it is classified below.
            mentions.append(tokens[start:pos])
            inside = False

        if token == special_tokens['@']:
            start = pos
            inside = True
            keep.append(False)
        else:
            keep.append(True)

    kept_tokens = np.array(tokens)[keep].tolist()

    return kept_tokens, mentions


In [196]:
def get_remove_mentions_hashtags(tokens, mentions, hashtags):
    """Extract mentions and hashtags from `tokens` and return the rest.

    An entity starts at an '@' (mention) or '#' (hashtag) token and always
    includes the next token. It then absorbs every token that decodes to a
    string containing '##', decodes to '_', or directly follows a token
    decoding to '_'. The first token matching none of these closes the entity
    and is itself examined as a regular token (it may start a new entity).

    Parameters:
        tokens: list of tokens of the tweet.
        mentions: list receiving each extracted mention (a slice of `tokens`
            starting at its '@'); modified in place.
        hashtags: list receiving each extracted hashtag (a slice of `tokens`
            starting at its '#'); modified in place.

    Returns:
        (kept_tokens, mentions, hashtags): the tokens that are not part of
        any entity, obtained by boolean-masking a NumPy array built from
        `tokens`, plus the two lists passed in.

    An entity still open when the list ends is masked out of the result but
    is not appended to either list.
    """
    keep = []           # one boolean per token: True = stays in the tweet
    start = 0           # index of the '@' / '#' of the entity being scanned
    inside = False      # True while scanning an entity
    is_mention = False  # kind of the entity being scanned

    for pos, token in enumerate(tokens):

        if inside:
            if pos == start + 1:
                # The token right after '@' / '#' is always part of the entity.
                absorbed = True
            else:
                piece = tok.convert_tokens_to_strings([token])[0]
                absorbed = (
                    '##' in piece
                    or piece == '_'
                    or tok.convert_tokens_to_strings([tokens[pos - 1]])[0] == '_'
                )

            if absorbed:
                keep.append(False)
                continue

            # Current token closes the entity; it is classified below.
            target = mentions if is_mention else hashtags
            target.append(tokens[start:pos])
            inside = False

        if token == special_tokens['@'] or token == special_tokens['#']:
            is_mention = token == special_tokens['@']
            start = pos
            inside = True
            keep.append(False)
        else:
            keep.append(True)

    kept_tokens = np.array(tokens)[keep].tolist()

    return kept_tokens, mentions, hashtags


In [197]:
def split_line(l):
    """Split one input row into the tweet id and its list of tokens.

    The row is split on ','; the first field is the id and the second field
    is split on tabs. Further comma-separated fields are ignored. Nothing is
    stripped, so a trailing newline stays attached to the last token.
    """
    fields = l.split(',')
    return fields[0], fields[1].split('\t')

In [198]:
def convert_tokens_to_strings(m_list):
    """Decode each entry of `m_list` with tok.decode and drop all spaces.

    Returns a list with one string per entry, in the same order.
    """
    return [tok.decode(entity).replace(' ', '') for entity in m_list]

In [199]:

# Mention string -> integer id, filled incrementally by map_mentions.
mentions_dict = {}
# Next id to hand out to a mention that has not been seen yet.
current_mapping = 0


def map_mentions(m_list):
    """Return the integer id of every mention in `m_list`, in order.

    A mention seen for the first time gets the next free id, which is stored
    in the module-level `mentions_dict`. Lookup is case-sensitive.
    """
    global mentions_dict, current_mapping

    ids = []
    for mention in m_list:
        try:
            mention_id = mentions_dict[mention]
        except KeyError:
            mention_id = current_mapping
            mentions_dict[mention] = mention_id
            current_mapping += 1
        ids.append(mention_id)

    return ids

In [200]:
# Lower-cased hashtag string -> integer id, filled incrementally by map_hashtags.
hashtags_dict = {}
# Next id to hand out to a hashtag that has not been seen yet.
current_mapping_hashtag = 0


def map_hashtags(m_list):
    """Return the integer id of every hashtag in `m_list`, in order.

    Hashtags are lower-cased before lookup, so the mapping is
    case-insensitive. A hashtag seen for the first time gets the next free
    id, which is stored in the module-level `hashtags_dict`.
    """
    global hashtags_dict, current_mapping_hashtag

    ids = []
    for hashtag in m_list:
        key = hashtag.lower()
        try:
            hashtag_id = hashtags_dict[key]
        except KeyError:
            hashtag_id = current_mapping_hashtag
            hashtags_dict[key] = hashtag_id
            current_mapping_hashtag += 1
        ids.append(hashtag_id)

    return ids

### Functions to write results

In [201]:
def save_tweet(index, text_tokens):
    """Write one row "<index>,<tab-separated tokens>" to `result_file`.

    Parameters:
        index: tweet id as a string.
        text_tokens: list of token strings of the cleaned tweet.

    The row terminator normally comes from the newline still attached to the
    last token of the input line. If that token was removed upstream, or the
    input line had no newline, one is added here so that consecutive rows are
    never fused together.
    """
    string = index + ',' + '\t'.join(text_tokens)
    if not string.endswith('\n'):
        string += '\n'
    result_file.write(string)
    

def save_mentions_or_hashtags(text_tokens, text, mapped, count, is_mentions=True):
    """Write one row describing a tweet's mentions (or hashtags).

    Row layout: "<count>,<tokens>,<text>,<mapped ids>\\n" where
      - tokens: entities separated by ';', tokens inside an entity by a tab;
      - text: the decoded entity strings concatenated with no separator;
      - mapped ids: the integer ids separated by tabs.

    Parameters:
        text_tokens: list of entities, each a list of token strings.
        text: list of decoded entity strings.
        mapped: list of integer ids, one per entity.
        count: number of entities.
        is_mentions: True writes to `mentions_file`, False to `hashtags_file`.

    `text_tokens` is left untouched. Overwriting its elements with the joined
    strings would corrupt the output if the same list were saved again.
    """
    joined_tokens = ['\t'.join(entity) for entity in text_tokens]

    # each mention is separated by a ";"
    # each token in a mention is separated by a "\t"
    string = str(count) + ',' + ';'.join(joined_tokens) + ',' + ''.join(text) + ',' + '\t'.join(map(str, mapped)) + '\n'

    if is_mentions:
        mentions_file.write(string)
    else:
        hashtags_file.write(string)

In [202]:
def f_to_int(x):
    """Convert a single value to int."""
    return int(x)


def f_int(x):
    """Convert every element of the iterable `x` to int; return a list."""
    return [f_to_int(item) for item in x]

### Create the TokenizerWrapper (the dictionaries that map mentions and hashtags to integers are created above, next to the mapping functions)

In [203]:
# Shared tokenizer wrapper. The extraction functions call
# tok.convert_tokens_to_strings, and convert_tokens_to_strings() calls tok.decode.
tok = TokenizerWrapper()

### Open output files and write headers (column names)

In [229]:
# Output handles used by save_tweet and save_mentions_or_hashtags; they are
# closed in a later cell. The encoding is set explicitly because the decoded
# mention/hashtag text can contain non-ASCII characters and the platform
# default encoding is not guaranteed to be UTF-8.
result_file = open(RESULT_PATH, "w+", encoding="utf-8")
mentions_file = open(MENTIONS_PATH, "w+", encoding="utf-8")
hashtags_file = open(HASHTAGS_PATH, "w+", encoding="utf-8")

In [230]:
# Header rows. Only the cleaned-tweets file carries the tweet id; the
# mentions and hashtags files have no id column.
result_file.write(TWEET_ID + ',' + TWEET_TOKENS + "\n")
mentions_file.write("mentions_count,mentions_tokens,mentions_text,mentions_mapped\n")
hashtags_file.write("hashtags_count,hashtags_tokens,hashtags_text,hashtags_mapped\n")

61

### Open files to be read

In [231]:
# Input handle, consumed line by line by the processing loop and closed in a later cell.
tokens_file = open(TWEET_TOKENS_FILE, "r")

### Execute

In [232]:
%%time

# ~2h 30m EXECUTION

# Read and discard the header row of the input file.
line = tokens_file.readline()

start = time.time()

finished = False
row = 0

# One iteration per input line; `finished` is set when readline() returns ''.
# Re-enable the `row < N_ROWS` condition to process only the first rows.
while not finished:  # and row < N_ROWS:
    
    # Fresh accumulators for the entities found in the current tweet.
    mentions_tokens = []
    hashtags_tokens = []
    
    # Progress report every million rows.
    if row % 1000000 == 0:
        elapsed_time = time.time() - start
        print('Row: ', row, ' - Elapsed time: ', elapsed_time)
            
    line = str(tokens_file.readline())
    
    #print(line)
    
    # An empty string means end of file (a blank line would still contain '\n').
    if line != '':
        
        tweet_id, tokens_list = split_line(line)
        
        # Debug helpers (uncomment to inspect specific rows).
        #if tweet_id == '130' or tweet_id == '154' or tweet_id == '161':
            
        #print('\ntweet_id: ', tweet_id)
        #print(tokens_list)
        #decoded_tweet = tok.decode(tokens_list)
        #print('\n', decoded_tweet, '\n')

        # retweets contain the word RT (right after CLS, in position 1) followed
        # by mentions and then a ':', before starting with the actual tweet text
        if tokens_list[1] == special_tokens['RT'] and tokens_list[2] == special_tokens['@']:
            tokens_list, mentions_tokens = get_RT_mentions(tokens_list, mentions_tokens)

        # remove remaining mentions and hashtags
        tokens_list, mentions_tokens, hashtags_tokens = get_remove_mentions_hashtags(tokens_list, mentions_tokens, hashtags_tokens)

        # Count, decode and map to integer ids the mentions of this tweet.
        mentions_count = len(mentions_tokens)
        mentions_strings = convert_tokens_to_strings(mentions_tokens)
        mapped_mentions = map_mentions(mentions_strings)

        # Same for the hashtags.
        hashtags_count = len(hashtags_tokens)
        hashtags_strings = convert_tokens_to_strings(hashtags_tokens)
        mapped_hashtags = map_hashtags(hashtags_strings)

        # Debug helpers (uncomment to inspect the extraction results).
        #print('tweet tokens: ', tokens_list)
        #print('mentions tokens: ', mentions_tokens)
        #print('mentions text: ', mentions_strings)
        #print('mapped_mentions: ', mapped_mentions)
        #print('mentions count: ', mentions_count)
        #print('decoded tweet: ', tok.decode(f_int(tokens_list)))

        #print('hashtag text: ', hashtags_strings)
        #print('mapped_hashtags: ', mapped_hashtags)

        # One row per tweet goes to each of the three output files, in the
        # same order. The mentions and hashtags rows carry no tweet id.
        save_tweet(tweet_id, tokens_list)
        save_mentions_or_hashtags(mentions_tokens, mentions_strings, mapped_mentions, mentions_count, is_mentions=True)
        save_mentions_or_hashtags(hashtags_tokens, hashtags_strings, mapped_hashtags, hashtags_count, is_mentions=False)

    else:
        finished = True

    # Incremented on every iteration, including the one that hits end of file.
    row += 1

Row:  0  - Elapsed time:  2.384185791015625e-06
Row:  1000000  - Elapsed time:  120.48065638542175
Row:  2000000  - Elapsed time:  239.3873484134674
Row:  3000000  - Elapsed time:  358.2570924758911
Row:  4000000  - Elapsed time:  477.6771969795227
Row:  5000000  - Elapsed time:  594.0386674404144
Row:  6000000  - Elapsed time:  714.2335116863251
Row:  7000000  - Elapsed time:  836.6887924671173
Row:  8000000  - Elapsed time:  958.6229357719421
Row:  9000000  - Elapsed time:  1078.2425425052643
Row:  10000000  - Elapsed time:  1199.7779710292816
Row:  11000000  - Elapsed time:  1319.963332414627
Row:  12000000  - Elapsed time:  1440.3555297851562
Row:  13000000  - Elapsed time:  1559.6439867019653
Row:  14000000  - Elapsed time:  1680.6134593486786
Row:  15000000  - Elapsed time:  1801.6696860790253
Row:  16000000  - Elapsed time:  1921.6638708114624
Row:  17000000  - Elapsed time:  2042.315838098526
Row:  18000000  - Elapsed time:  2162.236151456833
Row:  19000000  - Elapsed time:  22

In [233]:
# Close the input file, then the three output files.
tokens_file.close()

result_file.close()
mentions_file.close()
hashtags_file.close()

### Save mapping dictionaries

In [241]:
import json

In [246]:
# Number of distinct mention strings that received an id.
len(mentions_dict)

6890195

In [242]:
# Serialize the mention -> id mapping to a JSON string.
json_mentions_mapping = json.dumps(mentions_dict)

In [243]:
# Write the serialized mention mapping to disk.
with open('tweet_tokens/mentions/mentions_mapping.json', 'w+') as f:
    f.write(json_mentions_mapping)

In [247]:
# Number of distinct (lower-cased) hashtag strings that received an id.
len(hashtags_dict)

1872374

In [244]:
# Serialize the hashtag -> id mapping to a JSON string.
json_hashtags_mapping = json.dumps(hashtags_dict)

In [245]:
# Write the serialized hashtag mapping to disk.
with open('tweet_tokens/hashtags/hashtags_mapping.json', 'w+') as f:
    f.write(json_hashtags_mapping)

### Check the dataset

In [8]:
%%time

import pandas as pd

# Sanity check: load the first 1000 rows of the cleaned-tweets CSV, taking
# the first row as the header and the first column as the index.
df = pd.read_csv(RESULT_PATH,
                    #names=[TWEET_ID],
                    nrows=1000,
                    header=0,
                    index_col=0)

CPU times: user 9.92 ms, sys: 5.93 ms, total: 15.9 ms
Wall time: 13.6 ms


In [9]:
# Display the loaded cleaned-tweets sample.
df

Unnamed: 0_level_0,tweet_features_text_tokens
tweet_features_tweet_id,Unnamed: 1_level_1
0,101\t6417\t3410\t3398\t3184\t1909\t56910\t1683...
1,101\t14120\t131\t120\t120\t188\t119\t11170\t12...
2,101\t62342\t10858\t54439\t19571\t22480\t7831\t...
3,101\t58955\t10898\t103305\t1901\t16181\t7168\t...
4,101\t2435\t5656\t2594\t8279\t8623\t1925\t64126...
...,...
995,101\t50133\t13028\t18926\t10142\t10911\t10142\...
996,101\t42451\t10114\t10741\t64312\t10551\t37655\...
997,101\t220\t11839\t41541\t10105\t13702\t10108\t1...
998,101\t147\t100\t62691\t35885\t27830\t14131\t101...


In [12]:
%%time

# Sanity check: load the first 1000 rows of the mentions CSV, taking the
# first row as the header.
df = pd.read_csv(MENTIONS_PATH,
                    #names=[TWEET_ID],
                    nrows=1000,
                    header=0)

CPU times: user 10.1 ms, sys: 1.72 ms, total: 11.8 ms
Wall time: 9.29 ms


In [13]:
# Display the loaded mentions sample.
df

Unnamed: 0,mentions_count,mentions_tokens,mentions_text,mentions_mapped
0,0,,,
1,0,,,
2,0,,,
3,0,,,
4,0,,,
...,...,...,...,...
995,0,,,
996,1,137\t126\t77484,@5pm,534
997,0,,,
998,1,137\t10879\t31510\t10920\t10138\t168,@kireidesu_,535


In [15]:
%%time

# Sanity check: load the first 1000 rows of the hashtags CSV, taking the
# first row as the header.
df = pd.read_csv(HASHTAGS_PATH,
                    #names=[TWEET_ID],
                    nrows=1000,
                    header=0)

CPU times: user 0 ns, sys: 7.8 ms, total: 7.8 ms
Wall time: 85.9 ms


In [16]:
# Display the loaded hashtags sample.
df

Unnamed: 0,hashtags_count,hashtags_tokens,hashtags_text,hashtags_mapped
0,0,,,
1,0,,,
2,0,,,
3,2,108\t29005\t10230;108\t7457,#Peing#質,0\t1
4,0,,,
...,...,...,...,...
995,0,,,
996,0,,,
997,0,,,
998,0,,,
