In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
test_df = pd.read_csv('../Data/test_processed.csv')
train_df = pd.read_csv('../Data/train_processed.csv')

### Hiperparámetros a mejorar.
- 1 - test_size del split.
- 2 - Batch_size del dataframe to dataset.
- 3 - Las columnas a utilizar.
- 4 - Dimension de las columnas embedding.
- 5 - El epoch del fit.

In [3]:
# Divido el dataframe en train, validation y test.
train, test = train_test_split(train_df, test_size = 0.2)
train, val = train_test_split(train, test_size = 0.2)

In [4]:
# A partir del dataframe creo data de tensorflow.
def df_to_dataset(dataframe, shuffle = True, batch_size = 32):
    dataframe = dataframe.copy()
    
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))

    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    
    ds = ds.batch(batch_size)

    return ds

In [5]:
# El batch size fue elegido al azar.
batch_size = 32
train_ds = df_to_dataset(train, batch_size = batch_size)
val_ds = df_to_dataset(val, shuffle = False, batch_size = batch_size)
test_ds = df_to_dataset(test, shuffle = False, batch_size = batch_size)

In [6]:
train.head(1)

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,text_clean,hashtags,mentions,links,...,stop_word_count,punctuation_count,hashtag_count,mention_count,link_count,caps_count,caps_ratio,location_clean,keyword_target,location_clean_target
870,870,1259,blood,The World,Ain't no hoe in my blood,1,Ain't no hoe in my blood,no,no,no,...,3,1,1,1,1,1,0.041667,Others,0.142857,0.413344


In [7]:
def obtenerColumnas(dataframe):
    # Elijo las columnas a utilizar (todas por ahora).
    columnas_features = []

    # Columnas numéricas.
    for columna in ['id', 'text_len', 'word_count', 'stop_word_count', 'punctuation_count', 'hashtag_count', 'mention_count', 'link_count', 'caps_count', 'caps_ratio', 'keyword_target', 'location_clean_target']:
        columnas_features.append(feature_column.numeric_column(columna))
    
    # Columnas categóricas.
    for columna in ['keyword', 'location', 'hashtags', 'mentions', 'links', 'location_clean']:
        categorical = feature_column.categorical_column_with_vocabulary_list(columna, dataframe[columna].unique())
        indicator = feature_column.indicator_column(categorical)
        columnas_features.append(indicator)

    # Columnas de textos.
    for columna in ['text', 'text_clean']:
        texto = feature_column.categorical_column_with_vocabulary_list(columna, dataframe[columna].unique())
        texto_embedding = feature_column.embedding_column(texto, dimension = 8)
        columnas_features.append(texto_embedding)
    
    # Con las columnas creadas creo el input al modelo que voy a utilizar.
    return columnas_features


In [8]:
# Creo el modelo, lo compilo y lo entreno.
columnas_train = obtenerColumnas(train)
layer_entrenamiento = tf.keras.layers.DenseFeatures(columnas_train)

modelo = tf.keras.Sequential([ layer_entrenamiento,
                               layers.Dense(128, activation='relu'),
                               layers.Dense(128, activation='relu'),
                               layers.Dropout(.1),
                               layers.Dense(1) ])

modelo.compile(optimizer='adam', loss = tf.keras.losses.BinaryCrossentropy(from_logits = True), metrics = ['accuracy'])
modelo.fit(train_ds, validation_data = val_ds, epochs = 25)


Epoch 1/25
Consider rewriting this model with the Functional API.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25


Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f8eeffb7790>

In [9]:
loss, accuracy = modelo.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.6520026326179504


In [10]:
predicciones = []
for index, row in test_df.iterrows(): 
    input_dict = { name: tf.convert_to_tensor([value]) for name, value in row.items() }
    prediccion = modelo.predict(input_dict)
    predicciones.append(prediccion[0][0])
    print(prediccion[0][0])

Consider rewriting this model with the Functional API.
6.7180986
13.13041
10.058476
8.178694
5.791718
2.9632137
5.9398456
-2.898669
-3.4107537
-2.7642806
-4.671399
-2.4656134
-4.1080728
-3.9767969
-3.0537581
5.538938
-3.5467753
6.405724
-14.551204
-1.307729
-3.2305636
-0.9520155
-1.5948889
3.4738681
-3.9180882
-4.8906984
0.01245451
2.5705245
-4.023645
4.796805
-0.39296824
-1.9532816
12.103967
4.368081
1.4900835
0.8118641
4.29405
-0.7777913
-2.823801
2.6553173
-4.5695324
5.642249
-1.6604044
4.9563885
-3.5943305
-6.2697487
1.7331746
-23.109068
-5.1313
-12.748105
-8.887977
-9.35116
-4.9284563
-23.743336
-15.698949
-8.097541
-8.203421
-6.9725585
-8.144545
-15.47391
4.9554644
3.7985485
0.07462585
1.9311001
10.831869
4.115891
-0.39790946
11.641422
3.962044
0.958461
-1.5808015
7.022686
8.304105
9.765349
8.036247
21.943495
-3.5083535
-11.660657
-3.0969493
3.0557792
-5.3352304
1.9807482
-4.675483
-3.011129
21.355206
-1.099525
0.12666053
-3.418531
9.699689
10.159002
-8.826945
-5.955783
-3.308813

1.3736057
0.22020578
0.83489746
0.91109043
-0.008190036
1.6612854
1.2706912
-0.63953745
-1.0741594
2.0350668
-1.8698928
0.8280284
0.44684613
-2.1610982
-0.980436
-1.2007608
1.8786132
-0.90546054
-3.036424
0.508526
-2.6319497
2.0063765
0.84281117
0.86096996
-0.5595753
0.80743796
-3.199617
-0.80579084
0.015746444
-1.2378483
1.1297503
-1.2316997
-0.7204767
-2.7727857
-1.9843245
-1.1859131
-3.7043169
-0.5164206
-2.0633614
-4.740512
-0.17204565
-3.7831824
-3.6731677
0.21974981
-1.5835493
-1.3071575
0.06264219
1.4134552
-1.2745156
-3.3155475
-1.3353553
13.1844225
0.5221191
0.78601915
-1.4341733
0.8570786
0.31125027
-2.64533
0.83364075
-1.814677
0.95508367
-1.1425104
1.080754
0.43512565
0.0826273
-11.426095
1.3620877
-3.3219995
-0.9044999
-0.09076065
1.1799891
0.40732372
0.5722701
-1.4187093
-0.020085782
1.0408072
0.17768618
-6.2484913
0.37890667
-2.1569102
-2.1976733
-2.2953908
-1.787467
-0.7375982
-2.1587791
-0.46266854
-3.2170472
-5.712794
-2.567095
-1.4789271
-4.29851
5.956538
5.1771317
-

-2.4738944
-2.7406824
-1.8114996
-1.595299
-1.5972891
0.16119927
0.43374723
-0.6514823
1.0377071
0.0034998655
0.35449213
0.6085049
-0.61734504
-6.561562
-0.024457037
0.42694372
1.4199722
1.2236929
0.16004664
-1.7362218
-0.95128447
-0.5933171
-1.1045344
-1.390233
-1.0334685
-0.8343928
-0.29597682
-0.991908
-1.8197334
-0.095614254
-1.1089754
0.0354349
0.40608698
0.75431293
-1.5157094
-0.2769639
-1.6307762
-2.2846425
-5.0271807
-2.3592637
-17.71297
-0.8928714
-4.33168
-4.4334707
-1.5157182
-1.3461578
-1.540765
-2.3269632
7.2840886
0.046242893
-5.0748725
-1.7281682
-1.0098678
-1.1772708
0.011284649
-0.31443185
-2.1800723
-1.1484704
-1.7573786
-1.1796273
-1.8348169
-0.48970813
-1.0991449
-0.07962185
1.2461665
0.08247149
0.55982673
2.5117233
0.122181
1.2578375
-0.36299276
1.0238285
0.37955648
-1.058304
0.2637446
1.5540917
-0.52506906
-1.950716
-0.29862058
0.17357028
0.5072486
-1.1813934
-1.0790684
7.6240897
-3.7239716
-0.43623686
0.57487243
-0.7545968
0.5094274
6.57928
-2.871655
-2.7869368
-

-2.7814174
-3.927016
-1.3332894
-4.253393
-1.9185271
-2.1612275
-2.25246
-2.7056954
-2.1450331
-2.894117
-2.7781782
-3.3249614
-2.8316166
-2.5993707
-1.8909082
-2.152207
-2.6084104
-2.737757
-2.659706
-3.2013793
-2.198226
-1.536545
-0.6837857
-1.133956
-2.296306
-0.37104684
1.5504022
-1.0359991
-1.6746638
-2.5314586
-2.2984433
-1.0359828
-2.2211256
-1.1845226
-2.8723485
-2.2144446
-0.8214235
-0.44666773
-0.10196406
-0.23597664
-1.6127496
0.40967315
-1.0215471
7.035737
-1.3438246
0.4333046
7.030919
0.5260927
-1.1021203
-0.3353706
5.58431
-2.4575312
-2.0425391
-1.6735694
-1.5391886
-3.8629
-1.9558151
-2.6075237
-0.77885884
-1.4444413
-1.6060889
-1.0440326
-2.5594833
-1.5409739
-2.5895536
-1.8926933
-3.4460654
-4.416719
-2.1002562
-3.9031491
-3.8718288
-1.6315272
-3.1593788
-1.5743318
-2.3067875
-3.4942985
-3.0919063
-2.6531007
-2.7833886
-3.4944003
-3.4525192
-2.2647233
-1.488208
-3.1061265
-3.1812344
-2.6084225
-2.962027
-1.7962971
-3.3746884
-2.2614498
-3.727236
-2.941789
-3.8910673
-3

-2.9538238
-3.8156803
-2.9978209
-3.5393567
-1.0436385
-1.3400929
-2.2912452
-2.183148
-0.46316153
-3.1887643
-1.5495944
-1.5404842
-2.8484828
-1.6043248
-0.75568837
-1.4554873
-1.0471172
-2.401205
-1.5705469
7.5902243
-2.0918853
-3.6941125
-1.9606364
-3.336696
-2.4209616
-1.4009085
-1.527622
-1.6531265
-1.694675
-4.286338
-3.210748
-3.0751083
-3.7609322
-4.074553
-3.706876
-2.5053978
6.573
-3.4510465
-3.4287696
-2.2569144
-2.8310678
-2.3715026
-3.9267335
-3.210278
-2.4588983
6.2103076
-3.5190902
6.1878104
-2.0787659
6.20597
-4.164229
-4.680271
-5.2869425
-2.9184983
-2.3608804
-4.282072
-3.0570047
-3.1548827
-1.9435351
-3.4847443
-4.730365
-3.049597
-2.4668376
-3.9251678
-3.5557153
-3.8674147
-3.3972883
-3.2741814
-2.1103969
-3.3865871
-2.4836895
-3.0655098
-1.5641139
-3.5802484
-2.833752
-3.005734
0.35704285
-5.001194
-3.7799687
-2.973812
-2.6519666
-2.8077688
-2.5428777
-2.5390184
-2.4565997
-2.9529798
-2.376951
-5.5336533
-4.8416533
-3.107797
-3.864257
-3.253752
-3.7932942
-3.845641

In [22]:
predicciones_dt = test_df['id'].to_frame()
predicciones_dt['target'] = predicciones
predicciones_dt['target'] = predicciones_dt['target'].apply(lambda x: 1 if x > 0 else 0)
predicciones_dt.to_csv('tensorFlow-struct-opt.csv', index = False)
predicciones_dt.head(20)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,1
7,22,0
8,27,0
9,29,0


In [12]:
len(predicciones)

3263

In [13]:
len(test_df)

3263