Data preprocessing

In [42]:
#Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import pickle

from src.utils.process_text import clean_tweet

In [17]:
#Load the data

#Define columns
columns = ["sentiment", "id", "date", "query", "user", "text"]

#train data 
data = pd.read_csv(
    "../../../../data/raw/tweets-data/train.csv",
    header=None,
    names=columns,
    engine="python",
    encoding="latin1"
)

In [18]:
data['text'][0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [19]:
#Drop fields
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [20]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [6]:
#Clean the text
text_clean = [clean_tweet(tweet) for tweet in data.text]

  tweet = BeautifulSoup(tweet, "lxml").get_text()


In [7]:
text_clean

[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!",
 ' I dived many times for the ball. Managed to save The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 " no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. ",
 ' not the whole crew ',
 'Need a hug ',
 " hey long time no see! Yes.. Rains a bit only a bit LOL I'm fine thanks how's you ?",
 " K nope they didn't have it ",
 ' que me muera ? ',
 "spring break in plain city... it's snowing ",
 'I just re pierced my ears ',
 " I couldn't bear to watch it. And I thought the UA loss was embarrassing . . . . .",
 ' It it counts idk why I did either. you never talk to me anymore ',
 " i would've been the first but i didn't have a gun. not really though zac snyder's just a doucheclown.",
 ' I wish I got to watch it with you!! I miss you

In [22]:
#Targets
labels = data['sentiment']
labels[labels == 4] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[labels == 4] = 1


In [23]:
labels.value_counts()

sentiment
0    800000
1    800000
Name: count, dtype: int64

In [26]:
#Tokenizer
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    text_clean, target_vocab_size=2**16
)

inputs = [tokenizer.encode(sentence) for sentence in text_clean]

2023-07-28 12:34:00.802495: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
inputs

[[65316,
  1570,
  113,
  65323,
  10,
  6,
  3553,
  1,
  135,
  5262,
  50,
  1484,
  38165,
  16,
  13337,
  606,
  2,
  49,
  33,
  1,
  65352],
 [11,
  1090,
  23,
  122,
  77,
  65323,
  15,
  754,
  195,
  1841,
  124,
  2975,
  33,
  27,
  8,
  327,
  818,
  78,
  6,
  3642,
  1830,
  80,
  3006,
  1,
  6353,
  65317],
 [65316,
  3,
  41563,
  117,
  339,
  524,
  13,
  4,
  3798,
  1,
  11861,
  2,
  1194,
  104,
  610,
  42,
  41,
  16,
  10504,
  65399],
 [7, 494, 1036, 597, 4898, 8, 37, 81, 18, 1767],
 [65316,
  51,
  33,
  65323,
  10,
  32,
  22118,
  29,
  426,
  1,
  65389,
  65323,
  19,
  2819,
  1,
  158,
  56,
  9,
  280,
  25,
  223,
  3,
  77,
  65323,
  15,
  70,
  12,
  40,
  144,
  220,
  1],
 [65316, 32, 4, 494, 3719],
 [980, 6, 1342],
 [65316,
  313,
  202,
  71,
  51,
  1259,
  5,
  1693,
  47,
  60451,
  65316,
  6,
  288,
  121,
  6,
  288,
  371,
  65357,
  65323,
  19,
  801,
  157,
  1404,
  65323,
  10,
  55,
  861],
 [65316, 1140, 1717, 96, 150, 65323

In [29]:
#Padding
MAX_LEN = max([len(sentence) for sentence in inputs])
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [41]:
inputs

array([[65316,  1570,   113, ...,     0,     0,     0],
       [   11,  1090,    23, ...,     0,     0,     0],
       [65316,     3, 41563, ...,     0,     0,     0],
       ...,
       [  927,    12,   229, ...,     0,     0,     0],
       [  366,   337,  1309, ...,     0,     0,     0],
       [  181, 51236,     0, ...,     0,     0,     0]], dtype=int32)

In [35]:
#Split the data
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))

test_inputs = inputs[test_idx]
test_labels = labels[test_idx]
train_inputs = np.delete(inputs, test_idx, axis=0)
train_labels = np.delete(labels, test_idx)

In [39]:
#Save the data
np.savez('../../../../data/processed/tweets_data/tweets_train_data.npz', inputs=train_inputs, labels=train_labels)
np.savez('../../../../data/processed/tweets_data/tweets_test_data.npz', inputs=test_inputs, labels=test_labels)

In [43]:
#Save tokenizer
with open('../../../../exports/sentiment_analysis/tokenizers/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)