In [1]:
import pandas as pd
import numpy as np
import time

### Constants

In [5]:
# Token column: the name used when reading/writing the CSVs below, and a pandas dtype for it.
COLUMN_NAME = "tweet_features_text_tokens"
COLUMN_TYPE = pd.StringDtype()

# Input (raw token column) and output (padded tokens) locations.
PATH = "../data/training/raw_columns/tweet_features/text_tokens.csv.gz"
PATH_TWEETS_PADDED = "tweet_tokens/text_tokens_padded_1000.csv.gz"

# Chunked-read settings passed to pd.read_csv: rows per chunk and total row cap.
CHUNKSIZE = 100
N_ROWS = 1000

# Value appended to token lists that are shorter than the maximum tweet length.
PAD = 0

In [6]:
# The maximum tweet length is stored as an integer on the first line of the file.
with open("max_tweet_length.txt", "r") as length_file:
    first_line = length_file.readline()
    max_tweet_length = int(first_line)

In [7]:
# Show the value read from max_tweet_length.txt.
max_tweet_length

511

### Read tweet_text_tokens and remove duplicates

In [118]:
TWEET_ID = "tweet_features_tweet_id"
TWEET_TOKENS = "tweet_features_text_tokens"

columns = [
    TWEET_ID,
    TWEET_TOKENS
]

files = [
    "../mapped_columns/direct_mapping/tweet_features_tweet_id.pck.gz",
    "../raw_columns/raw_tweet_features_text_tokens.pck.gz"
]

dataframe = pd.DataFrame()

for i in range(2):
    %time dataframe[columns[i]] = pd.read_pickle(files[i], compression='gzip')[columns[i]]

CPU times: user 32.7 s, sys: 6.54 s, total: 39.3 s
Wall time: 39.2 s


KeyboardInterrupt: 

In [100]:
%time dataframe["tweet_features_text_tokens"] = dataframe["tweet_features_text_tokens"].str.split("\t").apply(lambda x: [int(i) for i in x])

CPU times: user 22 ms, sys: 4.04 ms, total: 26.1 ms
Wall time: 24.6 ms


In [101]:
# Inspect the frame after the token column has been parsed.
dataframe

Unnamed: 0,tweet_features_tweet_id,tweet_features_text_tokens
0,0,"[101, 6417, 3410, 3398, 3184, 1909, 56910, 168..."
1,1,"[101, 14120, 131, 120, 120, 188, 119, 11170, 1..."
2,2,"[101, 62342, 10858, 54439, 19571, 22480, 7831,..."
3,3,"[101, 58955, 10898, 103305, 1901, 16181, 7168,..."
4,4,"[101, 2435, 5656, 2594, 8279, 8623, 1925, 6412..."
...,...,...
995,994,"[101, 56898, 137, 10125, 61643, 99036, 168, 10..."
996,995,"[101, 50133, 13028, 18926, 10142, 10911, 10142..."
997,996,"[101, 42451, 10114, 10741, 64312, 10551, 37655..."
998,997,"[101, 220, 11839, 41541, 10105, 13702, 10108, ..."


In [102]:
# Number of distinct tweet ids (timed).
%time len(dataframe[TWEET_ID].unique())

CPU times: user 579 µs, sys: 8 µs, total: 587 µs
Wall time: 490 µs


999

In [103]:
%time dataframe.drop_duplicates(TWEET_ID, inplace=True)

CPU times: user 0 ns, sys: 6.49 ms, total: 6.49 ms
Wall time: 4.83 ms


In [104]:
# Row count after dropping duplicates (timed); compare with the distinct-id count.
%time len(dataframe[TWEET_ID])

CPU times: user 166 µs, sys: 3 µs, total: 169 µs
Wall time: 180 µs


999

In [None]:
%time dataframe.reset_index(inplace=True)

### Pad all the tweets to `max_tweet_length` tokens and save them

In [107]:
def save_chunk(chunk, path):
    """Append the ``COLUMN_NAME`` column of ``chunk`` to a gzip-compressed CSV.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame that contains the ``COLUMN_NAME`` column.
    path : str
        Destination file; opened in append mode, so repeated calls add rows
        to the same file.

    No header row is written.
    """
    chunk.to_csv(
        path,
        mode='a',
        compression='gzip',
        columns=[COLUMN_NAME],
        header=None,
    )

In [111]:
%%time

for chunk in pd.read_csv(PATH,
                            chunksize=CHUNKSIZE,
                            names=[TWEET_ID],
                            compression='gzip',
                            nrows=N_ROWS,
                            header=0,
                            index_col=0):
    
    for row in chunk[TWEET_TOKENS]:

        for i in range(len(row), max_tweet_length):
            row.append(PAD)

        if (len(row) != max_tweet_length):
            print('Error')
            break

CPU times: user 1.92 ms, sys: 12 µs, total: 1.94 ms
Wall time: 1.83 ms


In [113]:
# Write the frame to a gzip-compressed CSV without a header row; mode='w'
# overwrites any file already present at PATH_TWEETS_PADDED.
dataframe.to_csv(PATH_TWEETS_PADDED,
                 header=None,
                 mode='w',
                 compression='gzip')

In [114]:
# We'll borrow the `pad_sequences` utility function to do this.
#from keras.preprocessing.sequence import pad_sequences

#print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

#print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
#input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
#                          value=0, truncating="post", padding="post")

#print('\nDone.')

ModuleNotFoundError: No module named 'keras'

### Load padded tokens lists

In [27]:
# Helpers that turn the string form of a token list (e.g. "[101, 6417, 0]")
# back into integers.
def f_to_int(x):
    """Convert a single token (string or number) to ``int``."""
    return int(x)


def f_int(x):
    """Parse the string representation of a list of ints into a NumPy array.

    Square brackets and spaces are removed, the remainder is split on commas
    and every piece is converted with ``f_to_int``.

    Parameters
    ----------
    x : str
        Text such as ``"[101, 6417, 0]"``.

    Returns
    -------
    numpy.ndarray
        One-dimensional integer array; empty when ``x`` contains no tokens
        (e.g. ``"[]"``).

    Raises
    ------
    ValueError
        If a piece cannot be converted to ``int``.
    """
    # NOTE: bert wants lists -- callers that need a list can use .tolist().
    stripped = x.replace('[', '').replace(']', '').replace(' ', '')
    if not stripped:
        # "".split(',') yields [''], and int('') raises; return an empty
        # integer array for an empty list instead.
        return np.array([], dtype=int)
    return np.array([f_to_int(token) for token in stripped.split(',')])

In [28]:
def read_tweets_list(path):
    """Load token lists from a gzip-compressed, header-less CSV.

    The file is read in chunks of ``CHUNKSIZE`` rows, up to ``N_ROWS`` rows in
    total. Every value of the ``COLUMN_NAME`` column is parsed with ``f_int``.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    list
        One parsed entry per row, in file order.
    """
    chunk_reader = pd.read_csv(
        path,
        names=[COLUMN_NAME],
        header=None,
        index_col=0,
        nrows=N_ROWS,
        chunksize=CHUNKSIZE,
        compression='gzip',
    )

    parsed_tweets = []
    for chunk in chunk_reader:
        parsed_tweets.extend(f_int(raw_tokens) for raw_tokens in chunk[COLUMN_NAME])

    return parsed_tweets

In [29]:
# Read the padded tweets back from PATH_TWEETS_PADDED as a list of integer arrays.
tweets_list = read_tweets_list(PATH_TWEETS_PADDED)

In [30]:
# Number of tweets loaded.
len(tweets_list)

1000

### Check that they all have the same length

In [31]:
# Report every tweet whose token count differs from max_tweet_length.
for tweet in tweets_list:
    if len(tweet) == max_tweet_length:
        continue
    print('Error')

### Print all the tweets

In [32]:
# NOTE(review): this displays the repr of the whole list; consider a slice
# (e.g. tweets_list[:5]) to keep the saved output small.
tweets_list

[array([  101,  6417,  3410,  3398,  3184,  1909, 56910, 16838, 82904,
         1901,   100,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      