Data preprocessing

In [25]:
#Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
import pickle

from src.utils.process_text import clean_tweet

In [9]:
#Load the data

#train data 
data = pd.read_csv(
    "../../../../../data/raw/tweets-data/spanish/train.csv",
    engine="python",
    encoding="latin1"
)

In [10]:
data.head()

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score
0,2018-08-08T13:09:15.489000,1027179935184703489,"Alisson puede estar más tranquilo, no cargará ...",es,POSITIVE,"{""Neutral"":0.082259356975555419921875,""Negativ..."
1,2018-08-08T18:27:37.320000,1027260056092344320,@iPincheViky @ChelseaFC Es que el director eje...,es,NEUTRAL,"{""Neutral"":0.827011644840240478515625,""Negativ..."
2,2018-08-12T14:59:31.520000,1028657238116843520,Upto £100 #freebets &gt; https://t.co/cbjeMXI9...,es,NEUTRAL,"{""Neutral"":0.930982112884521484375,""Negative"":..."
3,2018-08-04T13:23:30.257000,1025733971320160257,"Bobby Duncan, primo de Steven Gerrard, deja la...",es,NEUTRAL,"{""Neutral"":0.906872212886810302734375,""Negativ..."
4,2018-07-28T11:21:06.480000,1023166450981396480,@TorreiraForeva @lepvtron @Arsenal @IntChampio...,es,NEUTRAL,"{""Neutral"":0.942405760288238525390625,""Negativ..."


In [11]:
data['tweet_text'][0]

'Alisson puede estar más tranquilo, no cargará con el peso de ser el arquero mas caro de la historia. Si es que eso le pesaba en algo. Gracias @ChelseaFC'

In [12]:
#Drop fields
data.drop(["tweet_date_created", "tweet_id", "language", "sentiment_score"],
          axis=1,
          inplace=True)

In [13]:
data.head()

Unnamed: 0,tweet_text,sentiment
0,"Alisson puede estar más tranquilo, no cargará ...",POSITIVE
1,@iPincheViky @ChelseaFC Es que el director eje...,NEUTRAL
2,Upto £100 #freebets &gt; https://t.co/cbjeMXI9...,NEUTRAL
3,"Bobby Duncan, primo de Steven Gerrard, deja la...",NEUTRAL
4,@TorreiraForeva @lepvtron @Arsenal @IntChampio...,NEUTRAL


In [27]:
#Split the data
train_set, test_set = train_test_split(
        data, test_size=0.2, random_state=42, shuffle=True, stratify=None)

In [37]:
#Clean the text
train_text_clean = [clean_tweet(tweet) for tweet in train_set.tweet_text]
test_text_clean = [clean_tweet(tweet) for tweet in test_set.tweet_text]
test_clean = [clean_tweet(tweet) for tweet in data.tweet_text]

  tweet = BeautifulSoup(tweet, "lxml").get_text()


In [31]:
train_text_clean

['Gan el Manchester United con gol de Angel Gomes en el descuento de penalti. YouthLeague. MUFC.',
 ' RangelM Cierto ese centro por derecha y remat con direcci n a puerta. Enhorabuena por Jimenez pero me doli ver empatar a mi y con uno menos era cuesti n de tiempo',
 'El ManCity a un punto de igualar el record del Chelsea Tendr an que ganar los que le quedan para llegar a los puntos nunca los superar an. Ceremonia de entrega del ?Trofeo de Campe n de la PremierLeague ??A CONTINUACI N !!! ',
 'Estamos en la final y vamos por el campeonato nunca caminar s solo',
 ' vuelve a ser cedido por el al . El holand s jugar en su pa s. En breves la noticia en . HazloVAVEL PremierLeage Southampton',
 ' PremierLeague En desarrollo Tottenham Fulham Leicester Wolverhampton WestHam Bournemouth Everton Southampton ',
 ' De acuerdo gracias por su respuesta xD',
 ' Cup Felicitationes por el partido de hoy.',
 'OFICIAL ?. Diego Rico es nuevo jugador del Bournemouth. El lateral llega procedente del Legan s.

In [32]:
train_set['sentiment'].value_counts()

sentiment
NEUTRAL     89060
POSITIVE     8816
NEGATIVE     7599
MIXED         690
Name: count, dtype: int64

In [33]:
test_set['sentiment'].value_counts()

sentiment
NEUTRAL     22274
POSITIVE     2188
NEGATIVE     1890
MIXED         190
Name: count, dtype: int64

In [55]:
#Targets
train_targets = train_set['sentiment'].map({'NEUTRAL': 2, 'POSITIVE': 1, 'NEGATIVE': 0, 'MIXED': 3})
test_targets = test_set['sentiment'].map({'NEUTRAL': 2, 'POSITIVE': 1, 'NEGATIVE': 0, 'MIXED': 3})

In [51]:
train_targets.value_counts()

sentiment
2    89060
1     8816
0     7599
3      690
Name: count, dtype: int64

In [56]:
test_targets.value_counts()

sentiment
2    22274
1     2188
0     1890
3      190
Name: count, dtype: int64

In [38]:
#Tokenizer
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    text_clean, target_vocab_size=2**16
)

In [40]:
train_inputs = [tokenizer.encode(sentence) for sentence in train_text_clean]
test_inputs = [tokenizer.encode(sentence) for sentence in test_text_clean]

In [41]:
train_inputs

[[1983,
  2,
  85,
  133,
  11,
  77,
  1,
  13946,
  609,
  5,
  2,
  2435,
  1,
  6929,
  4,
  32550,
  4,
  317,
  63952],
 [63938,
  57215,
  30723,
  63938,
  8162,
  168,
  575,
  12,
  1783,
  8,
  5927,
  11,
  4022,
  10,
  3,
  7595,
  4,
  2615,
  12,
  202,
  35,
  48,
  12859,
  83,
  2414,
  3,
  79,
  8,
  11,
  180,
  223,
  201,
  2627,
  10,
  1,
  537],
 [21,
  126,
  3,
  19,
  767,
  1,
  4115,
  2,
  4008,
  9,
  34,
  4995,
  165,
  7,
  184,
  14,
  7,
  33,
  1525,
  20,
  334,
  3,
  14,
  379,
  407,
  14,
  3367,
  2290,
  4,
  14747,
  52047,
  1510,
  1,
  3581,
  120,
  72,
  15423,
  1,
  980,
  10,
  1,
  6,
  78,
  142,
  52,
  62343,
  5689,
  63984,
  1004],
 [1465, 5, 6, 91, 8, 342, 12, 2, 2068, 407, 4078, 17, 3207],
 [63938,
  564,
  3,
  40,
  254,
  12,
  2,
  190,
  22,
  21,
  3082,
  17,
  99,
  5,
  25,
  269,
  64021,
  4,
  64,
  27137,
  6,
  977,
  309,
  22,
  2337,
  41699,
  63938,
  2169],
 [63938, 43, 64, 6367, 62, 496, 378, 849, 228

In [45]:
#Padding
MAX_LEN = max([len(sentence) for sentence in train_inputs])
train_inputs = tf.keras.preprocessing.sequence.pad_sequences(train_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)
test_inputs = tf.keras.preprocessing.sequence.pad_sequences(test_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [46]:
train_inputs

array([[ 1983,     2,    85, ...,     0,     0,     0],
       [63938, 57215, 30723, ...,     0,     0,     0],
       [   21,   126,     3, ...,     0,     0,     0],
       ...,
       [ 2057,    18,    70, ...,     0,     0,     0],
       [63938,    67,   688, ...,     0,     0,     0],
       [63938,   186,  4227, ...,     0,     0,     0]], dtype=int32)

In [47]:
test_inputs

array([[63938,   208, 54037, ...,     0,     0,     0],
       [ 1602,  1019,   570, ...,     0,     0,     0],
       [ 3364,    43,   256, ...,     0,     0,     0],
       ...,
       [63938,  3948,     3, ...,     0,     0,     0],
       [  518,    52,    36, ...,     0,     0,     0],
       [  723,    16,   468, ...,     0,     0,     0]], dtype=int32)

In [57]:
#Save the data
np.savez('../../../../../data/processed/tweets_data/spanish/tweets_train_data.npz', inputs=train_inputs, targets=train_targets)
np.savez('../../../../../data/processed/tweets_data/spanish/tweets_test_data.npz', inputs=test_inputs, targets=test_targets)

In [49]:
#Save tokenizer
with open('../../../../../exports/sentiment_analysis/tokenizers/spanish/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)