## [75.06 / 95.58] Organización de Datos
## Trabajo Práctico 2: Competencia de Machine Learning
### Grupo 18: DATAVID-20

* 102732 - Bilbao, Manuel
* 101933 - Karagoz, Filyan
* 98684 - Markarian, Darío
* 100901 - Stroia, Lautaro

## 1. Importación general de librerias y set-up de datos.

In [23]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re as re
import os

#Para instalar gensim
#! pip3 install gensim
import gensim
from gensim.parsing.preprocessing import remove_stopwords

#Instalar tensorflow
#!pip3 install tensorflow
import tensorflow
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer



import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_rows = None #mostrar todas las filas del df
%matplotlib inline
plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid") # seteando tipo de grid en seaborn
pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs


In [24]:
test_set = pd.read_csv('test.csv')
train_set = pd.read_csv('train.csv')
train_set.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Limpieza de datos
**Definicion de funciones auxiliares**

In [25]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

## 2. Word Embeddings para NLP

### Obtenemos un set de vectores de palabras pre-entrenados de:
#### http://nlp.stanford.edu/data/wordvecs/glove.6B.zip

In [26]:
#! wget -P ~ 'http://nlp.stanford.edu/data/wordvecs/glove.6B.zip' && unzip ~/glove.6B.zip -d ~/

In [27]:
word2vec = {}
with open(os.environ['HOME']+'/glove.6B.200d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        word2vec[word] = vector

In [28]:
#Separamos la variable a predecir
Y = train_set['target']
train_set = train_set.drop('target',axis=1)

#Concateno sets de test y train para entrenar con todos los tweets de ambos sets
data = pd.concat([train_set,test_set],axis=0).reset_index(drop=True)

**Procesado de datos -> aplicar funciones de limpieza**

In [29]:
data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
data['text'] = data['text'].apply(lambda x: minusculas(x))
data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
data['text'] = data['text'].apply(lambda x: remove_stopwords(x))
data['text'] = data['text'].apply(lambda x: eliminar_url(x))

text_data = data['text']

In [30]:
#Separo los tweets en palabras (tokens)
tokenizer = Tokenizer()
#Actualiza vocab interno basado en una lista de textos (text_data)
tokenizer.fit_on_texts(text_data) 
word2index = tokenizer.word_index

In [31]:
#Cantidad de palabras distintas
len(word2index)

27497

In [32]:
#Armo secuencias de enteros a partir de los tokens
secuencias = tokenizer.texts_to_sequences(text_data)
secuencias[:5]

[[5334, 627, 116, 1653, 3357],
 [56, 88, 590, 7979, 7980, 1172],
 [1321, 1245, 1945, 546, 7981, 1515, 131, 1945, 546, 1408, 1017],
 [7, 4076, 1055, 131, 1408, 22],
 [20, 1246, 150, 5335, 2172, 137, 1055, 7982, 84]]

In [33]:
#Podemos observar que los vectores formados son de distinta longitud, por lo que habia
#que llevarlos a todos a la misma dimension agregando un padding
data_same_dim = tensorflow.keras.preprocessing.sequence.pad_sequences(secuencias,)
data_same_dim[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 5334,  627,  116, 1653, 3357],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,   56,   88,  590, 7979, 7980, 1172],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1321,
        1245, 1945,  546, 7981, 1515,  131, 1945,  546, 1408, 1017],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    7, 4076, 1055,  131, 1408,   22],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   20, 1246,  150, 5335, 2172,  137, 1055, 7982,   84]],
      dtype=int32)

In [34]:
#dimension de la matriz. Acordar que tenemos ambos datasets (test y train) concatenados, por eso
#tantas filas
data_same_dim.shape

(10876, 21)

In [35]:
padded_test = data_same_dim[train_set.shape[0]:]
padded_train = data_same_dim[:train_set.shape[0]]

In [36]:
#Armo matriz de embeddings
embeddings = np.zeros((len(word2index)+1,200))
vec = []
for word, i in word2index.items():
    try:
       embeddings[i] = word2vec[word]
    except KeyError:
        continue
embeddings.shape

(27498, 200)

In [37]:
#Mostrando un ejemplo
embeddings[1]

array([ 2.55270004e-01,  3.36780012e-01, -5.23590028e-01, -2.40370005e-01,
        1.05619997e-01,  1.18989997e-01, -5.52529991e-01,  3.66450012e-01,
       -4.06459987e-01,  3.73580009e-01, -2.14589998e-01,  5.29079974e-01,
        4.40459996e-01,  8.75909999e-02, -1.44730002e-01, -1.64940000e-01,
       -2.73649991e-01,  2.56119996e-01, -5.50870001e-02,  9.07370001e-02,
        1.82710007e-01,  2.52329993e+00,  2.40480006e-01, -3.24369997e-01,
        5.53879976e-01, -2.04510003e-01,  1.98369995e-01, -1.71360001e-01,
       -1.49820000e-01,  1.20710000e-01,  9.07389969e-02, -7.63079971e-02,
       -4.71910000e-01,  2.12339997e-01, -3.11740011e-01, -6.76829964e-02,
       -2.80149996e-01, -5.18589988e-02, -5.04290015e-02,  3.36719990e-01,
       -1.72470003e-01, -7.40220025e-02, -1.03090003e-01,  3.96609992e-01,
       -1.92629993e-01, -7.38490000e-02,  8.28279972e-01, -4.21000004e-01,
       -9.29619968e-02,  5.76099992e-01, -1.36020005e-01,  1.32479995e-01,
       -6.70439973e-02,  

**Ahora, procedemos a entrenar esta matriz de embeddings mediante algoritmo de deep learning**

In [51]:
#Definimos el modelo secuencial
model1 = keras.Sequential()
embedding = keras.layers.Embedding(len(word2index)+1, 200, weights=[embeddings], input_length= data_same_dim.shape[1],
              trainable=False)
model1.add(embedding)
model1.add(keras.layers.Flatten())
#Le agrego 1 red neuronal a la capa "oculta"
model1.add(keras.layers.Dense(1, activation='sigmoid')) #Sigmoidea para mapear los resultados a un rango (0,1)
#Probe agregando mas capas con 1 red neuronal y el resultado no cambia.

#Compilamos el modelo1
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Resumen del modelo 1
print(model1.summary())

#Entrenamos el modelo
model1.fit(padded_train,Y,epochs=120, verbose=0) #a mas epochs, mejor accuracy

#Evaluamos el 1er modelo
loss, accuracy = model1.evaluate(padded_train, Y, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 21, 200)           5499600   
_________________________________________________________________
flatten_4 (Flatten)          (None, 4200)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 4201      
Total params: 5,503,801
Trainable params: 4,201
Non-trainable params: 5,499,600
_________________________________________________________________
None
Accuracy: 92.263234


In [54]:
#Preparando el submit de kaggle
submit = pd.DataFrame(test_set['id'],columns=['id'])
predictions = model1.predict(padded_test)
submit['target'] = predictions
submit['target'] = round(submit['target']).astype('int')
submit.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,0
3,9,1
4,11,1


In [56]:
#submit.to_csv('embeddings+DL.csv', index=False)