In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-04-25 09:46:40.723928: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-25 09:46:42.084194: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-25 09:46:42.087282: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Previous work - load the training CSV and extract the raw texts.
import numpy as np
import pandas as pd

# Location of the training split, relative to the notebook's working directory.
TRAIN_CSV = './usercode/train.csv'

# Full training frame; its 'category' column is used further down for labels.
df_train = pd.read_csv(TRAIN_CSV)

# The 'text' column as a NumPy array - this is what the tokenizer is fitted on.
train_data = df_train.loc[:, 'text'].to_numpy()

In [3]:
# 1. Tokenize the NumPy array of texts.
# We tokenize the textual comments using the Tokenizer provided by Keras.
# Only the training-set comments are used to build the vocabulary of tokens;
# that vocabulary then converts every comment into a sequence of token ids
# (the sequences are padded to a common length in a later cell).
NUM_WORDS = 5000     # passed to Tokenizer(num_words=...)
OOV_TOKEN = '<oov>'  # placeholder token for words outside the vocabulary

tokenizer = Tokenizer(oov_token=OOV_TOKEN, num_words=NUM_WORDS)
tokenizer.fit_on_texts(train_data)

# 1.1 Once the vocabulary is learned from the training data, convert each
# text into its sequence of token ids.
train_seq = tokenizer.texts_to_sequences(train_data)

# Show the raw texts next to the first two encoded sequences.
first_two_sequences = train_seq[:2]
print(train_data, '\n\n', first_two_sequences)


['I am still waiting on my card?'
 "What can I do if my card still hasn't arrived after 2 weeks?"
 'I have been waiting over a week. Is the card still coming?' ...
 'What countries are getting support?' 'Are cards available in the EU?'
 'Which countries are represented?'] 

 [[2, 49, 63, 209, 29, 3, 7], [14, 11, 2, 9, 55, 3, 7, 63, 122, 276, 163, 457, 306]]


In [4]:
# 2. Understanding padding and masking
#
# Masking tells sequence-processing layers that certain timesteps of an input
# are missing and should therefore be skipped when processing the data.
#
# Padding comes from the need to encode sequence data into contiguous batches:
# to make all sequences in a batch fit a given standard length, some sequences
# have to be padded or truncated.
#
# Example - three sentences of different lengths:
#
#   [
#     ["Hello", "world", "!"],
#     ["How", "are", "you", "doing", "today"],
#     ["The", "weather", "will", "be", "nice", "tomorrow"],
#   ]
#
# and the same kind of data written as integer token ids:
#
#   [
#     [71, 1331, 4231],
#     [73, 8, 3215, 55, 927],
#     [83, 91, 1, 645, 1253, 927],
#   ]
#
# The data is a nested list whose samples have length 3, 5 and 6 respectively.
# Since the input to a deep learning model must be a single tensor
# (of shape e.g. (batch_size, 6, vocab_size) in this case), samples shorter
# than the longest one need to be padded with some placeholder value.

# Three already-encoded samples of unequal length (3, 5 and 6 ids).
raw_inputs = [
    [711, 632, 71],
    [73, 8, 3215, 55, 927],
    [83, 91, 1, 645, 1253, 927],
]

# pad_sequences pads with 0s by default; the fill value is configurable via
# the "value" parameter. Padding can go at the beginning ("pre") or at the
# end ("post"). "post" padding is recommended when working with RNN layers
# (in order to be able to use the CuDNN implementation of the layers).
padded_inputs = tf.keras.utils.pad_sequences(sequences=raw_inputs, padding="post")
print(padded_inputs)

[[ 711  632   71    0    0    0]
 [  73    8 3215   55  927    0]
 [  83   91    1  645 1253  927]]


In [20]:
# Bring every tokenized training text to the same fixed length: shorter
# sequences are padded at the end, longer ones are cut at the end.
MAX_LEN = 50
x_train = pad_sequences(
    train_seq,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

# Compare the first two sequences before and after padding.
print(train_seq[:2], '\n\n', x_train[:2])

[[2, 49, 63, 209, 29, 3, 7], [14, 11, 2, 9, 55, 3, 7, 63, 122, 276, 163, 457, 306]] 

 [[  2  49  63 209  29   3   7   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 14  11   2   9  55   3   7  63 122 276 163 457 306   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]


In [17]:
# 3. Labels to integer codes
# pd.Categorical assigns one integer code to each distinct category string.
category_column = df_train['category']
train_labels = pd.Categorical(category_column).codes

# 3.1 What does reshape do?
# -> The -1 is a placeholder that means "adjust this dimension to make the
#    data fit": reshape(-1, 1) asks NumPy for an array with exactly 1 column
#    and as many rows as needed.
y_train = train_labels.reshape(-1, 1)

# Flat label vector shape, then the original category names and the integer
# codes, both shown as single-column arrays.
print(train_labels.shape)
print(category_column.values.reshape(-1, 1))
print(y_train)


(10003,)
[['card_arrival']
 ['card_arrival']
 ['card_arrival']
 ...
 ['country_support']
 ['country_support']
 ['country_support']]
[[12]
 [12]
 [12]
 ...
 [25]
 [25]
 [25]]
