In [None]:
%run "Funciones.py"
%matplotlib inline

## Entrenamiento con redes neuronales

---

### Modelo base

In [None]:
# Load the real and fake news corpora as pandas DataFrames.
real, fake = (pd.read_csv(csv_path) for csv_path in ("Data/True.csv", "Data/Fake.csv"))

In [None]:
# Attach the target label to each frame: 0 for real news, 1 for fake news.
real['fake?'], fake['fake?'] = 0, 1

In [None]:
# Preview the first rows of the fake-news frame, including the new 'fake?' column.
fake.head()

In [None]:
# Stack the real and fake frames into a single dataset and shuffle the row order.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
news = pd.concat([real, fake], ignore_index=True)
# Seed the shuffle so the row order is reproducible, matching the seeded
# train/test splits further down.
# NOTE(review): assumes `shuffle` is sklearn.utils.shuffle (provided by
# Funciones.py), which accepts `random_state` — confirm.
news = shuffle(news, random_state=42)
news

In [None]:
# Prepend the headline to the article body, then keep only the text and label
# columns by dropping the ones the model does not use.
news = (
    news
    .assign(text=news['title'] + " " + news['text'])
    .drop(columns=['title', 'subject', 'date'])
)
news.sample(5)

In [None]:
# (rows, columns) of the combined dataset after dropping the unused columns.
news.shape

In [None]:
# Class balance: number of rows per label (0 = real, 1 = fake).
news['fake?'].value_counts()

In [None]:
# Hold out 30% of the data for testing, then set aside 20% of the remaining
# rows for validation. The split is done before tokenisation so the tokenizer
# can be fitted on the training rows only.
train, test = train_test_split(news, random_state=42, test_size=0.3)
train, validation = train_test_split(train, random_state=42, test_size=0.2)
print(*(len(part) for part in (train, validation, test)))

In [None]:
# Vocabulary size cap and the characters stripped from the text before it is
# split on spaces.
vocabulario_max = 20000
filtrar = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\''

# The tokenizer is fitted on the training texts only; validation and test
# texts are encoded later with this same fitted vocabulary.
tokenizer = Tokenizer(
    num_words=vocabulario_max,
    filters=filtrar,
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(train['text'])
word_index = tokenizer.word_index

In [None]:
# summarize what was learned
#print(tokenizer.word_counts)
#print(tokenizer.document_count)
#print(tokenizer.word_index)
#print(tokenizer.word_docs)

In [None]:
# Encode every split with the fitted tokenizer, one matrix row per document.
# texts_to_matrix is called with its default mode here.
train_matrix, val_matrix, test_matrix = (
    tokenizer.texts_to_matrix(np.array(split.text))
    for split in (train, validation, test)
)

In [None]:
# Check which container type texts_to_matrix returned.
type(train_matrix)

In [None]:
# (rows, columns) of the encoded training matrix.
train_matrix.shape

In [None]:
# Peek at the first 10 columns of the first two encoded training rows.
train_matrix[0:2,:10]

In [None]:
# (rows, columns) of the encoded validation matrix.
val_matrix.shape

In [None]:
# (rows, columns) of the encoded test matrix.
test_matrix.shape

In [None]:
# Model inputs: reuse the encoded matrices directly instead of np.copy-ing them.
# Each matrix is dense with one column per vocabulary slot, so copying all three
# doubled the memory footprint without any benefit; nothing in this notebook
# section mutates either the matrices or the x_* arrays.
x_train, x_val, x_test = train_matrix, val_matrix, test_matrix

# Targets: the 0/1 'fake?' label of each split as a NumPy array.
y_train = train['fake?'].values
y_val = validation['fake?'].values
y_test = test['fake?'].values

### Aquí comienza la configuración y el entrenamiento de la red base

In [None]:
# Baseline network: two hidden layers of 16 ReLU units each and a single
# sigmoid output unit for the binary fake / not-fake decision.
model = Sequential(name="modelo_base")
# Take the input width from the feature matrix rather than hard-coding 20000,
# so the model stays in sync if the vocabulary size is changed.
model.add(Dense(16, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Compile the model: binary cross-entropy loss, RMSprop optimizer, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs in batches of 512, evaluating on the validation split
# after every epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=20,
)

In [None]:
# `history.history` is a dict mapping each metric name to its per-epoch values.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
epoch_ticks = list(epochs)  # one tick per epoch, same form for both figures

# NOTE: both figures are written under 'Graficos/'; that directory must exist.

# Figure 1: training vs. validation loss.
plt.figure(figsize=(12, 8))
plt.title('Modelo de base')
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.xticks(ticks=epoch_ticks)
plt.xlabel('Epochs')  # was missing, so the x axis was unlabeled
plt.ylabel('Loss')
plt.legend()
plt.savefig('Graficos/07_Loss.png')

# Figure 2: training vs. validation accuracy.
plt.figure(figsize=(12, 8))
plt.title('Modelo de base')  # was missing; gives both figures the same title
plt.plot(epochs, acc, label='Training accuracy')
plt.plot(epochs, val_acc, label='Validation accuracy')
plt.xticks(ticks=epoch_ticks)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('Graficos/08_Accuracy.png')

### Testeo

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5, which keeps the same (n_samples, 1) shape and int32 dtype.
y_pred = (model.predict(x_test) > 0.5).astype("int32")

In [None]:
# Shape of the prediction array.
y_pred.shape

In [None]:
# Display the predictions flattened to 1-D. The result is not assigned, so
# y_pred itself keeps its original shape.
np.reshape(y_pred, y_pred.shape[0])

In [None]:
# First five ground-truth labels of the test split.
y_test[0:5]

In [None]:
# Labels were assigned as 0 = real and 1 = fake, and target_names is matched to
# the labels in that order. The original ['Fake', 'Not Fake'] therefore swapped
# the class names in the report. Passing labels explicitly pins the order.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['Not Fake', 'Fake']))

In [None]:
# Confusion matrix over the test split, with the class order fixed to [0, 1].
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

In [None]:
# Draw the confusion matrix as an annotated heatmap and save it to disk.
# Tick labels follow the [0, 1] label order used to build `cm`
# (0 = real / 'Not Fake', 1 = 'Fake'), so the axes no longer show bare 0/1.
class_names = ['Not Fake', 'Fake']
plt.figure(figsize=(10, 10))
hm = sns.heatmap(cm, annot=True, fmt='.0f',
                 xticklabels=class_names, yticklabels=class_names)
plt.ylabel('Verdaderos')
plt.title('Confusion Matrix - Base Model')  # fixed typo: was 'Cofusion'
plt.xlabel('Predichos')

# NOTE: the 'Graficos/' directory must already exist for the save to succeed.
fig = hm.get_figure()
fig.savefig('Graficos/09_HashMap.png')