In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
test_df = pd.read_csv('../Data/test_processed.csv')
train_df = pd.read_csv('../Data/train_processed.csv')

### Hiperparámetros a mejorar.
- 1 - test_size del split.
- 2 - Batch_size del dataframe to dataset.
- 3 - Las columnas a utilizar.
- 4 - Dimension de las columnas embedding.
- 5 - El epoch del fit.

In [3]:
# Divido el dataframe en train, validation y test.
train, test = train_test_split(train_df, test_size = 0.2)
train, val = train_test_split(train, test_size = 0.2)

In [4]:
# A partir del dataframe creo data de tensorflow.
def df_to_dataset(dataframe, shuffle = True, batch_size = 32):
    dataframe = dataframe.copy()
    
    if 'target' in dataframe.columns:
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    else:
        ds = tf.data.Dataset

    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    
    ds = ds.batch(batch_size)

    return ds

In [5]:
# El batch size fue elegido al azar.
batch_size = 32
train_ds = df_to_dataset(train, batch_size = batch_size)
val_ds = df_to_dataset(val, shuffle = False, batch_size = batch_size)
test_ds = df_to_dataset(test, shuffle = False, batch_size = batch_size)

In [6]:
train.head(1)

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,text_clean,hashtags,mentions,links,...,stop_word_count,punctuation_count,hashtag_count,mention_count,link_count,caps_count,caps_ratio,location_clean,keyword_target,location_clean_target
1369,1369,1973,bush%20fires,"iPhone: -27.499212,153.011072",@marcoarment Middle of winter in Sydney we hav...,1,@marcoarment Middle of winter in Sydney we hav...,no,marcoarment,no,...,12,3,1,1,1,3,0.021898,Others,0.72,0.413344


In [7]:
train.columns

Index(['Unnamed: 0', 'id', 'keyword', 'location', 'text', 'target',
       'text_clean', 'hashtags', 'mentions', 'links', 'text_len', 'word_count',
       'stop_word_count', 'punctuation_count', 'hashtag_count',
       'mention_count', 'link_count', 'caps_count', 'caps_ratio',
       'location_clean', 'keyword_target', 'location_clean_target'],
      dtype='object')

In [8]:
def obtenerColumnas(dataframe):
    # Elijo las columnas a utilizar (todas por ahora).
    columnas_features = []

    # Columnas numéricas.
    for columna in ['id', 'text_len', 'word_count', 'stop_word_count', 'punctuation_count', 'hashtag_count', 'mention_count', 'link_count', 'caps_count', 'caps_ratio', 'keyword_target', 'location_clean_target']:
        columnas_features.append(feature_column.numeric_column(columna))
    
    # Columnas categóricas.
    for columna in ['keyword', 'location', 'hashtags', 'mentions', 'links', 'location_clean']:
        categorical = feature_column.categorical_column_with_vocabulary_list(columna, dataframe[columna].unique())
        indicator = feature_column.indicator_column(categorical)
        columnas_features.append(indicator)

    # Columnas de textos.
    for columna in ['text', 'text_clean']:
        texto = feature_column.categorical_column_with_vocabulary_list(columna, dataframe[columna].unique())
        texto_embedding = feature_column.embedding_column(texto, dimension = 8)
        columnas_features.append(texto_embedding)
    
    # Con las columnas creadas creo el input al modelo que voy a utilizar.
    return columnas_features


In [9]:
# Creo el modelo, lo compilo y lo entreno.
columnas_train = obtenerColumnas(train)
layer_entrenamiento = tf.keras.layers.DenseFeatures(columnas_train)

modelo = tf.keras.Sequential([ layer_entrenamiento,
                               layers.Dense(128, activation='relu'),
                               layers.Dense(128, activation='relu'),
                               layers.Dropout(.1),
                               layers.Dense(2) ])

modelo.compile(optimizer='adam', loss = tf.keras.losses.BinaryCrossentropy(from_logits = True), metrics = ['accuracy'])
modelo.fit(train_ds, validation_data = val_ds, epochs = 25)


Epoch 1/25
Consider rewriting this model with the Functional API.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



ValueError: in user code:

    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:809 train_function  *
        return step_function(self, iterator)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:799 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1261 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2794 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3217 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:792 run_step  **
        outputs = model.train_step(data)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:751 train_step
        loss = self.compiled_loss(
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:203 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:148 __call__
        losses = ag_call(y_true, y_pred)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:252 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1601 binary_crossentropy
        K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4825 binary_crossentropy
        return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/ops/nn_impl.py:173 sigmoid_cross_entropy_with_logits
        raise ValueError("logits and labels must have the same shape (%s vs %s)" %

    ValueError: logits and labels must have the same shape ((None, 2) vs (None, 1))


In [None]:
loss, accuracy = modelo.evaluate(test_ds)
print("Accuracy", accuracy)

In [None]:
predicciones = []
for index, row in test_df.iterrows(): 
    input_dict = { name: tf.convert_to_tensor([value]) for name, value in row.items() }
    prediccion = modelo.predict(input_dict)
    predicciones.append(prediccion)
    print(prediccion)

In [None]:
input_dict

In [None]:
input_dict