https://vgpena.github.io/classifying-tweets-with-keras-and-tensorflow/

En el anterior enlace, tenéis un ejemplo sobre cómo, a partir de tweets con un label específico (un sentimiento, positivo o negativo): 

1. Genera un conjunto de entrenamiento. El conjunto de entrenamiento se forma a partir de tweets completos convertidos a un array de un tamaño específico.
2. Ese array (X_train de tamaño N) tiene un label que representa el sentimiento (y_train)
3. Como todas las frases tienen un tamaño N, la entrada de la red neuronal será de tamaño N y la salida de la red será de tamaño 2, usando activación softmax (porque hay dos clases).

Se pide: 

- Realizar un clasificador de reviews para el dataset de IMDB de la carpeta data_exercise/

**Cuando se use la importación "keras.x", reemplázala por "tensorflow.keras.x"**

In [1]:
# Your code
from tensorflow.keras.preprocessing.text import Tokenizer
import json
import tensorflow.keras
import tensorflow.keras.preprocessing.text as kpt
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
import numpy as np
import pandas as pd

# Load the IMDB reviews into a DataFrame; the CSV provides a text column
# (`review`) and a label column (`sentiment`).
training_df = pd.read_csv("./data/IMDB_Dataset.csv")
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
training_df["sentiment"] = training_df["sentiment"].map({'positive': 1, 'negative': 0})
# Row-wise view of the data: one (review, sentiment) pair per row.
training = training_df.to_numpy()

# Review texts (first column) as a plain Python list.
train_x = list(training[:, 0])
# Sentiment labels (second column) as a NumPy array.
train_y = np.asarray(list(training[:, 1]))


In [3]:
type(train_x)

list

In [4]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [5]:
training.shape

(50000, 2)

In [6]:

# only work with the 5000 most popular words found in our dataset
max_words = 5000

In [7]:
# create a new Tokenizer
tokenizer = Tokenizer(num_words=max_words)

In [8]:
type(train_x)

list

In [9]:

# Build the tokenizer's vocabulary from the raw review texts.
tokenizer.fit_on_texts(train_x)

In [10]:

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index

In [11]:

# Persist the word -> ID vocabulary as JSON so it can be reloaded later.
with open('dictionary.json', 'w') as dictionary_file:
    dictionary_file.write(json.dumps(dictionary))

In [12]:

def convert_text_to_index_array(text):
    """Translate a text into the list of vocabulary IDs of its words.

    The text is split into tokens with ``kpt.text_to_word_sequence`` and
    each token is looked up in the module-level ``dictionary``. The result
    has one entry per token, so its length varies from text to text. A
    token that is missing from ``dictionary`` raises ``KeyError``.
    """
    tokens = kpt.text_to_word_sequence(text)
    indices = []
    for token in tokens:
        indices.append(dictionary[token])
    return indices

In [13]:
# Encode every review as the list of vocabulary IDs of its words.
allWordIndices = [convert_text_to_index_array(review) for review in train_x]

In [14]:
# Every review is now a list of word IDs. The lists have different lengths,
# so ask for an object array explicitly: NumPy >= 1.24 raises ValueError when
# it has to infer an array from ragged nested sequences.
allWordIndices = np.asarray(allWordIndices, dtype=object)

In [15]:
# Encode each indexed review as one row of a binary matrix (mode='binary'
# marks which word IDs occur in the review). This rebinds `train_x`, which
# held the raw review texts until now.
# NOTE(review): per the Keras docs the matrix has `num_words` columns and IDs
# at or above that limit are dropped -- confirm for the installed version.
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [16]:
# treat the labels as categories
train_y = tensorflow.keras.utils.to_categorical(train_y, 2)

# EL MODELO

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation

# Feed-forward classifier: the input has one feature per vocabulary slot
# (max_words), followed by two hidden Dense layers with dropout after each,
# and a 2-unit softmax output (one unit per sentiment class).
model = Sequential([
    Dense(512, input_shape=(max_words,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='sigmoid'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [18]:
# Configure training: categorical cross-entropy pairs with the 2-unit softmax
# output and the two-column categorical labels; accuracy is the reported metric.
model.compile(loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'])

In [19]:
# Train for 5 epochs in mini-batches of 32, holding out a tenth of the
# samples for validation.
# NOTE(review): per the Keras docs the validation slice is taken from the end
# of the arrays before shuffling -- if the CSV is ordered by label the
# validation set may be unbalanced; confirm.
model.fit(train_x, train_y,
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.1,
  shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f80fc6ac400>

In [20]:
# Save the model: the output of `to_json()` goes to model.json.
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

# The weights are written separately to model_pnl.h5.
model.save_weights('model_pnl.h5')

### PROBAR EL MODELO CON ALGUN REVIEW (INPUT POR PANTALLA)

In [21]:

# Fresh tokenizer used only for `sequences_to_matrix`, so it does not need to
# be fitted: word IDs come from the saved dictionary. Its `num_words` must
# equal the value used for training so the matrix width matches the model's
# input layer, hence `max_words` instead of a repeated literal.
tokenizer = Tokenizer(num_words=max_words)
# Class names indexed by label value (0 = negative, 1 = positive), for
# human-friendly printing.
labels = ['negative', 'positive']

In [22]:
# read in our saved dictionary
with open('dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

In [23]:
# this utility makes sure that all the words in your input
# are registered in the dictionary
# before trying to turn them into a matrix.
def convert_text_to_index_array(text):
    """Translate a text into vocabulary IDs, skipping unknown words.

    The text is split into tokens with ``kpt.text_to_word_sequence``. Tokens
    found in the module-level ``dictionary`` contribute their ID to the
    result, in order; any other token is reported on stdout and left out.

    Returns:
        The list of IDs of the known tokens (possibly empty).
    """
    indices = []
    for token in kpt.text_to_word_sequence(text):
        if token not in dictionary:
            print("'%s' not in training corpus; ignoring." % (token))
            continue
        indices.append(dictionary[token])
    return indices

In [24]:

# Read the saved model structure (JSON text). The context manager closes the
# file even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

In [25]:
loaded_model_json

'{"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 5000], "dtype": "float32", "sparse": false, "ragged": false, "name": "dense_input"}}, {"class_name": "Dense", "config": {"name": "dense", "trainable": true, "batch_input_shape": [null, 5000], "dtype": "float32", "units": 512, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout", "trainable": true, "dtype": "float32", "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": "float32", "units": 256, "activation": "sigmoid", "use_bias": true, "kernel_initializer

In [None]:
# and create a model from that
#model = model_from_json(loaded_model_json)

In [26]:
# and weight your nodes with your saved values
model.load_weights('model_pnl.h5')

In [27]:
# Interactive prediction loop: keep asking for sentences until the user
# submits an empty line.
while True:
    evalSentence = input ('Input a sentence to be evaluated, or Enter to quit: ')
    if not evalSentence:
        break
    # Encode the sentence the same way as the training reviews:
    # word IDs first, then a single-row binary matrix.
    testArr = convert_text_to_index_array(evalSentence)
    inp = tokenizer.sequences_to_matrix([testArr], mode='binary')
    # Model output for the single input row.
    pred = model.predict(inp)
    # Report the highest-scoring class and its score as a percentage.
    best = np.argmax(pred)
    print("%s sentiment; %f%% confidence" % (labels[best], pred[0][best] * 100))

positive sentiment; 99.628371% confidence
negative sentiment; 82.875359% confidence
negative sentiment; 99.999595% confidence
positive sentiment; 99.245435% confidence
negative sentiment; 83.996415% confidence
negative sentiment; 81.927466% confidence
negative sentiment; 64.099753% confidence
negative sentiment; 74.808854% confidence
positive sentiment; 98.161948% confidence
negative sentiment; 99.926132% confidence
