In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import string
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from tensorflow.keras.models import Sequential
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.layers import Dense, Input, Dropout, Conv1D, Flatten, GlobalMaxPool1D, Reshape
from nltk.corpus import stopwords
from collections import defaultdict
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from bert import tokenization

In [None]:
# Load the training table from './forHiper' (path is relative to the notebook's working directory).
trainCSV = pd.read_csv('./forHiper')

In [None]:
# Feature matrix: every column except the identifiers, the raw text fields and the label.
x = trainCSV.drop(columns=['id', 'text', 'keyword', 'location', 'target'])
# Label vector. Plain column selection (instead of DataFrame.pop) leaves trainCSV
# untouched, so this cell can be re-run without a KeyError on 'target'.
y = trainCSV['target']

In [None]:
# Hold out 1% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.01,
    random_state=42,
)

## Busco hiperparámetros

GridSearch para batch_size, epochs y optimizador.

In [None]:
def build_model(optimizer):
  """Build and compile a binary classifier with two hidden layers (64 and 32 units).

  The input width is read from the notebook-level `X_train` at call time.

  Args:
    optimizer: Optimizer handed straight to `model.compile`.

  Returns:
    The compiled `Sequential` model (binary cross-entropy loss, accuracy metric).
  """
  n_features = X_train.shape[1]
  network = Sequential([
      Dense(64, input_shape=(n_features,), activation='relu'),
      Dropout(0.1),
      Dense(32, activation='relu'),
      Dropout(0.1),
      Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary target
  ])
  network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return network

In [None]:
# Search space for the first grid search: batch size, number of epochs and optimizer.
parameters = {
    'batch_size': [8, 16, 32],
    'epochs': [5, 10, 50, 100, 200, 500],
    'optimizer': ['adadelta', 'rmsprop', 'adam'],
}

In [None]:
# Wrap the Keras builder as a scikit-learn estimator and configure a 10-fold,
# accuracy-scored grid search over `parameters`. The fit itself is left disabled.
estimator = KerasClassifier(build_fn=build_model, verbose=0)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
# grid_search.fit(X_train, y_train)
# grid_search.best_params_

GridSearch para cantidad de neuronas por capa.

In [None]:
#Capas
def build_model(l1, l2):
  """Build and compile the binary classifier with configurable hidden-layer widths.

  Args:
    l1: Number of units in the first hidden Dense layer.
    l2: Number of units in the second hidden Dense layer.

  Returns:
    The compiled `Sequential` model ('adam' optimizer, binary cross-entropy, accuracy).
  """
  input_dim = X_train.shape[1]  # feature count comes from the notebook-level X_train
  layers = (
      Dense(l1, input_shape=(input_dim,), activation='relu'),
      Dropout(0.1),
      Dense(l2, activation='relu'),
      Dropout(0.1),
      Dense(1, activation='sigmoid'),
  )
  net = Sequential()
  for layer in layers:
    net.add(layer)
  net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return net

In [None]:
# Search space for the second grid search: units in each of the two hidden layers.
parameters = {
    'l1': [8, 16, 32, 64, 128, 256],
    'l2': [8, 16, 32, 64, 128, 256],
}

In [None]:
# Same 10-fold, accuracy-scored grid search, now over the layer widths, with the
# batch size and epoch count fixed on the estimator. The fit is left disabled.
estimator = KerasClassifier(
    build_fn=build_model,
    verbose=0,
    batch_size=16,
    epochs=10,
)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
# grid_search.fit(X_train, y_train)
# grid_search.best_params_

GridSearch para valor de dropout.

In [None]:
#Dropouts
def build_model(d1, d2):
    """Build and compile the binary classifier with configurable dropout rates.

    Hidden layers are fixed at 256 and 32 units; only the dropout rates vary.

    Args:
        d1: Dropout rate applied after the first hidden layer.
        d2: Dropout rate applied after the second hidden layer.

    Returns:
        The compiled `Sequential` model ('adam' optimizer, binary cross-entropy, accuracy).
    """
    feature_count = X_train.shape[1]  # read from the notebook-level X_train
    classifier = Sequential([
        Dense(256, input_shape=(feature_count,), activation='relu'),
        Dropout(d1),
        Dense(32, activation='relu'),
        Dropout(d2),
        Dense(1, activation='sigmoid'),
    ])
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Search space for the third grid search: dropout rate after each hidden layer.
parameters = dict(
    d1=[0.01, 0.1, 0.2, 0.25],
    d2=[0.01, 0.1, 0.2, 0.25],
)

In [None]:
# Same 10-fold, accuracy-scored grid search, now over the dropout rates, training
# each candidate for 200 epochs with batch size 16. The fit is left disabled.
estimator = KerasClassifier(
    build_fn=build_model,
    verbose=0,
    batch_size=16,
    epochs=200,
)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
# grid_search.fit(X_train, y_train)
# grid_search.best_params_

## Entrenamiento de modelo

In [None]:
def get_compiled_model():
    """Build and compile the final feed-forward binary classifier.

    Hidden-layer sizes (64, 16), (64, 32) and (128, 16) were noted as working well.

    Returns:
        The compiled `Sequential` model: Input -> Dense(64) -> Dropout(0.2) ->
        Dense(16) -> Dropout(0.1) -> Dense(1, sigmoid).
    """
    n_features = X_train.shape[1]  # read from the notebook-level X_train
    stack = [
        Input(shape=(n_features,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dropout(0.1),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential()
    for piece in stack:
        classifier.add(piece)
    # 'adadelta' was the alternative optimizer noted for this model.
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

In [None]:
# Instantiate the final network and print its layer-by-layer summary.
model = get_compiled_model()
model.summary()

In [None]:
# model.fit(X_train, y_train, epochs=10, batch_size=16)

In [None]:
# results = model.evaluate(X_test, y_test, batch_size=16)
# print("test loss, test acc:", results)

In [None]:
# testCSV = pd.read_csv('./testWithFeatures.csv')
# x_predict = testCSV.drop(['id', 'text', 'keyword', 'location'], axis=1)
# x_predict.head()

In [None]:
# predictions = model.predict_classes(x_predict)

In [None]:
# b = predictions.ravel()
# pd.Series(b).value_counts()

In [None]:
# submissionDf = pd.read_csv('./sample_submission.csv')
# submissionDf['target'] = b

In [None]:
# submissionDf.head(10)

In [None]:
# Disabled together with the cells above that would create `submissionDf`;
# left active, this line raises NameError because `submissionDf` is never defined.
# submissionDf.to_csv('submission.csv', index=False)

## BERT encoding

In [None]:
# !wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
# Expose `tf.io.gfile` under the older attribute name `tf.gfile`.
# NOTE(review): presumably required by `bert.tokenization`, which appears to use `tf.gfile` — confirm.
tf.gfile = tf.io.gfile

In [None]:
# %%time
# module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# train = pd.read_csv('./train.csv', dtype={'id': np.int16, 'target': np.int8})
# test = pd.read_csv('./test.csv', dtype={'id': np.int16})

In [None]:
# test['text'] = test['text'].fillna('')

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into the three fixed-length integer arrays fed to BERT.

    Each text is tokenized, truncated so that the "[CLS]" and "[SEP]" markers
    still fit, converted to vocabulary ids and right-padded with 0 up to max_len.

    Args:
        texts: Iterable of strings to encode (one row per text).
        tokenizer: Object exposing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2 so that "[CLS]" and "[SEP]" fit.

    Returns:
        Tuple `(token_ids, masks, segment_ids)` of integer arrays, each of
        shape `(number_of_texts, max_len)`. `masks` is 1 on real tokens and 0
        on padding; `segment_ids` is all zeros (every text is a single segment).

    Raises:
        ValueError: If `max_len` is smaller than 2.
    """
    if max_len < 2:
        # With less room than the two special tokens, the slice below would go
        # negative and the rows would come out longer than max_len.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Truncate, leaving room for the two special tokens.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        # The mask tells the model which positions are real tokens (1) and which are padding (0).
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        # Segment ids separate multiple sequences; all zeros because each text is one sequence.
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Explicit dtype and reshape keep an empty input as an integer (0, max_len) array.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
# vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# train_input = bert_encode(train.text.values, tokenizer, max_len=160)
# test_input = bert_encode(test.text.values, tokenizer, max_len=160)
# train_labels = train.target.values

In [None]:
# train_input

## Defino modelo

In [None]:
def build_model1(bert_layer, max_len=512):
    """Build a BERT classifier: one sigmoid unit on top of the first-position embedding.

    Args:
        bert_layer: Callable layer that takes `[word_ids, mask, segment_ids]`
            and returns two outputs; the second is indexed here as
            (batch, position, hidden).
        max_len: Length of each of the three integer input sequences.

    Returns:
        The compiled `Model` (Adam with learning rate 1e-5, binary
        cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only position 0 of every sequence as the sentence-level representation.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

def build_model2(bert_layer, max_len=512):
    """Build a BERT classifier with a small dense head (16 -> 8 units, dropout 0.2).

    Args:
        bert_layer: Callable layer that takes `[word_ids, mask, segment_ids]`
            and returns two outputs; the second is indexed here as
            (batch, position, hidden).
        max_len: Length of each of the three integer input sequences.

    Returns:
        The compiled `Model` (Adam with learning rate 1e-5, binary
        cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only position 0 of every sequence as the sentence-level representation.
    clf_output = sequence_output[:, 0, :]
    out_first_layer = Dense(16, activation='relu')(clf_output)
    out_first_dropout = Dropout(0.2)(out_first_layer)
    out_second_layer = Dense(8, activation='relu')(out_first_dropout)
    out_second_dropout = Dropout(0.2)(out_second_layer)
    out = Dense(1, activation='sigmoid')(out_second_dropout)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

def build_model3(bert_layer, max_len=512):
    """Build a BERT classifier with a wider dense head (128 -> 64 units, dropout 0.2).

    Args:
        bert_layer: Callable layer that takes `[word_ids, mask, segment_ids]`
            and returns two outputs; the second is indexed here as
            (batch, position, hidden).
        max_len: Length of each of the three integer input sequences.

    Returns:
        The compiled `Model` (Adam with learning rate 1e-5, binary
        cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only position 0 of every sequence as the sentence-level representation.
    clf_output = sequence_output[:, 0, :]
    out_first_layer = Dense(128, activation='relu')(clf_output)
    out_first_dropout = Dropout(0.2)(out_first_layer)
    out_second_layer = Dense(64, activation='relu')(out_first_dropout)
    out_second_dropout = Dropout(0.2)(out_second_layer)
    out = Dense(1, activation='sigmoid')(out_second_dropout)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

def build_model4(bert_layer, max_len=512):
    """Build a BERT classifier with a 1-D convolutional head over the first-position embedding.

    The embedding vector is treated as a one-channel sequence, passed through
    two Conv1D layers (32 and 64 filters, kernel size 3), global-max-pooled and
    fed to a Dense(16) + Dropout(0.2) stage before the sigmoid output.

    Args:
        bert_layer: Callable layer that takes `[word_ids, mask, segment_ids]`
            and returns two outputs; the second is indexed here as
            (batch, position, hidden).
        max_len: Length of each of the three integer input sequences.

    Returns:
        The compiled `Model` (Adam with learning rate 1e-5, binary
        cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only position 0 of every sequence as the sentence-level representation.
    clf_output = sequence_output[:, 0, :]
    # Add a trailing channel axis; -1 lets Keras infer the hidden size instead
    # of hard-coding 1024, so any encoder width works.
    reshape = Reshape((-1, 1))(clf_output)
    out_first_filter = Conv1D(32, kernel_size=3, activation='relu')(reshape)
    out_second_filter = Conv1D(64, kernel_size=3, activation='relu')(out_first_filter)
    out_max_pooling = GlobalMaxPool1D()(out_second_filter)
    # The pooled tensor is already 2-D, so Flatten changes nothing; it is kept
    # so the layer list stays the same as before.
    out_flatten = Flatten()(out_max_pooling)
    out_first_layer = Dense(16, activation='relu')(out_flatten)
    out_second_dropout = Dropout(0.2)(out_first_layer)
    # A second Dense(8) + Dropout(0.2) stage was tried here and is disabled.
    out = Dense(1, activation='sigmoid')(out_second_dropout)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

## Entrenamiento

In [None]:
# model = build_model2(bert_layer, max_len=160) # best_score model! 
# model.summary()


In [None]:
# checkpoint = ModelCheckpoint('model_train.h5', monitor='val_loss', save_best_only=True)

# train_history = model.fit(
#     train_input, train_labels,
#     validation_split=0.2,
#     epochs=3,
#     callbacks=[checkpoint],
#     batch_size=16
# )

In [None]:
# model.load_weights('modelv_train.h5')  # NOTE(review): the checkpoint cell above saves 'model_train.h5' — confirm this filename

In [None]:
# test_pred = model.predict(test_input)

In [None]:
# submission = pd.read_csv('./sample_submission.csv')
# submission.head()

In [None]:
# submission['target'] = test_pred.round().astype(int)
# submission.to_csv('submission.csv', index=False)