# Sentiment Analysis (Transfer Learning)

## Preparación de ambiente

### Carga de módulos

In [1]:
# Misc
from warnings import filterwarnings

# Data Wrangling
import numpy as np
import pandas as pd

# Data exploration
from PIL import Image
import cufflinks as cf

# Preprocessing
import re
import nltk
import spacy
import unicodedata
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.wordnet import WordNetLemmatizer
from tensorflow.keras.utils import to_categorical

# Modeling
import keras
import tensorflow as tf
import tensorflow_hub as hub
import keras.layers as layers
from keras.models import Model
from keras import backend as K
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Environment setup
cf.go_offline()
filterwarnings("ignore")

## Conexión con Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Lectura de datos

In [3]:
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/FES Acatlán/Cursos/ANIEI/NLP/Datos/tweets_clean.csv')

In [4]:
df

Unnamed: 0,id,text,target
0,1467810369,awww that s bummer shoulda get david carr thir...,NEG
1,1467810672,upset can not update facebook by texting might...,NEG
2,1467810917,dive many time for ball manage save 50 rest go...,NEG
3,1467811184,whole body feel itchy like fire,NEG
4,1467811193,behave I m mad can not see,NEG
...,...,...,...
1599995,2193601966,wake school good feel ever,POS
1599996,2193601969,thewdbcom cool hear old walt interview,POS
1599997,2193601991,ready for mojo makeover ask for detail,POS
1599998,2193602064,happy 38th birthday boo alll time tupac amaru ...,POS


#### Preparación de conjuntos

In [5]:
X = df["text"]
y = df["target"]

In [6]:
X

0          awww that s bummer shoulda get david carr thir...
1          upset can not update facebook by texting might...
2          dive many time for ball manage save 50 rest go...
3                            whole body feel itchy like fire
4                                 behave I m mad can not see
                                 ...                        
1599995                           wake school good feel ever
1599996               thewdbcom cool hear old walt interview
1599997               ready for mojo makeover ask for detail
1599998    happy 38th birthday boo alll time tupac amaru ...
1599999                                 happy charitytuesday
Name: text, Length: 1600000, dtype: object

In [7]:
y

0          NEG
1          NEG
2          NEG
3          NEG
4          NEG
          ... 
1599995    POS
1599996    POS
1599997    POS
1599998    POS
1599999    POS
Name: target, Length: 1600000, dtype: object

In [8]:
le = LabelEncoder()

In [9]:
y = le.fit_transform(y)

In [10]:
y

array([0, 0, 0, ..., 1, 1, 1])

In [11]:
y = to_categorical(y)

In [12]:
y

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Entrenamiento

### Carga de modelo preentrenado

In [14]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

embed_size = embed.variables[-1].shape[-1]
print("Embedding size: ", embed_size)

Embedding size:  512


In [15]:
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = ("Universal Sentence Encoder embeddings also support short paragraphs. "
             "There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

In [16]:
for element in messages:
    print(element, embed([element])[0][:10])

Elephant tf.Tensor(
[ 0.00834444  0.00048085  0.06595246 -0.01971266  0.03570081  0.03658931
  0.03706126 -0.0112046  -0.00317394  0.04214219], shape=(10,), dtype=float32)
I am a sentence for which I would like to get its embedding. tf.Tensor(
[ 0.0508086  -0.01652431  0.0157378  -0.04286412 -0.04372453 -0.02536193
 -0.07321192  0.04079886  0.01748865  0.01217931], shape=(10,), dtype=float32)
Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be. tf.Tensor(
[-0.02833269 -0.05586217 -0.01294149 -0.00322107 -0.07933776 -0.01764221
  0.04226501  0.00828533  0.08381325  0.07212216], shape=(10,), dtype=float32)


### Construcción del modelo

#### Capa de entrada

In [17]:
input_text = layers.Input(shape=(1,), dtype="string")

#### Capa de embedding

In [18]:
def UniversalEmbedding(x):
    results = embed(tf.squeeze(tf.cast(x, tf.string)))
    return K.concatenate([results])

In [None]:
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)

#### Capas ocultas

In [20]:
x = layers.Dense(256, activation='relu')(embedding)
x = layers.Dropout(0.25)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.125)(x)

#### Capa de salida

In [21]:
x = layers.Dense(len(le.classes_), activation='sigmoid')(x)

#### Configuración del modelo

In [22]:
model_sa = Model(inputs=[input_text], outputs=x)

In [23]:
opt = keras.optimizers.Adam(learning_rate=0.001)

In [24]:
model_sa.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model_sa.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 lambda (Lambda)             (None, 512)               0         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                16448     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 2)                 130   

#### Callbacks

In [25]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=15, min_delta=0.0001)

In [26]:
checkpoint = ModelCheckpoint('models/sentiment/model_{val_accuracy:.3f}.h5',
                             save_best_only=True,
                             save_weights_only=False,
                             monitor='val_accuracy')

### Entrenamiento

In [27]:
X_train = np.array([np.array(val) for val in X_train])

In [28]:
X_test = np.array([np.array(val) for val in X_test])

In [29]:
history = model_sa.fit(X_train, y_train, epochs=10, batch_size=2048, callbacks=[early_stopping, checkpoint], validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
loss, acc = model_sa.evaluate(X_test, y_test)



In [30]:
results = pd.DataFrame(data = zip(history.history["loss"], history.history["val_loss"], history.history["accuracy"], history.history["val_accuracy"]), columns=["loss", "val_loss", "accuracy", "val_accuracy"])

In [31]:
results.iplot()

In [29]:
df["y_hat"] = le.inverse_transform(np.argmax(model_sa.predict(X), axis=1))

ValueError: ignored