# Notebook oficial - TP Datos

In [None]:
# Importando librerias
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import warnings

import re
import nltk
import string
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
#Modelos
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the train and test datasets
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Report the shape of the training set
n_rows, n_cols = df_train.shape
print('La dimension del dataset es: ', n_rows, 'registros,', n_cols, 'columnas')

# Column reference:
#   id       - unique identifier of each tweet
#   keyword  - a keyword for the tweet (may be missing)
#   location - location the tweet was sent from (may be missing)
#   text     - the text of the tweet
#   target   - whether the tweet is about a real disaster (1) or not (0)

# Preview the first records (last expression of the cell, so the notebook renders it)
df_train.head(5)

In [None]:
# Set explicit dtypes. The columns shared by both frames get the same treatment;
# missing keywords are replaced by the placeholder "noKeyword".
for frame in (df_train, df_test):
    frame['id'] = frame['id'].astype(int)
    frame['keyword'] = frame['keyword'].fillna(value="noKeyword").astype('object')
    frame['location'] = frame['location'].astype('object')
    frame['text'] = frame['text'].astype('object')

# Only the training frame carries the label column
df_train['target'] = df_train['target'].astype('bool')

## Filtrado de datos

In [None]:
# Drop rows that are exact duplicates of an earlier row
df_train = df_train.drop_duplicates()

In [None]:
# Summary statistics for every column of the training frame, whatever its dtype
df_train.describe(include="all")

Hay algunos registros cuyo label es incorrecto; los corregimos a continuación.

In [None]:
# Ids of the training tweets whose target is treated as wrong and gets overwritten below
mislabelled_ids = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]

# Show the affected rows
df_train.loc[ df_train['id'].isin(mislabelled_ids)]

In [None]:
def relabel_target(tweet_id, target):
    """Return the corrected label for one tweet.

    Ids listed in the module-level ``mislabelled_ids`` always map to ``False``;
    any other id keeps the ``target`` it was given.
    """
    return False if tweet_id in mislabelled_ids else target

In [None]:
# Overwrite the target of every mislabelled tweet with False.
# A single boolean-mask assignment replaces the previous row-by-row apply(axis=1):
# same result, no Python-level loop over the whole frame.
is_mislabelled = df_train['id'].isin(mislabelled_ids)
df_train.loc[is_mislabelled, 'target'] = False

# Show the same rows again to check the new labels
df_train.loc[is_mislabelled]

In [None]:
#----------WARNING---------------#

# esta libreria solo se puede instalar si tenes JDK VERSION 8
# si tenes cualquier otra version NO SE PUEDE INSTALAR

# para instalarlo: 
# pip install pycontractions
from pycontractions import Contractions

In [None]:
# Build the contraction expander with the "glove-twitter-100" model key and load its models
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [None]:
# Work on copies; every text transformation below is applied to the *_cont frames
df_train_cont = df_train.copy()
df_test_cont = df_test.copy()

In [None]:
# Look at a tweet that contains a contraction
# (row position 99, column position 3 -- presumably 'text'; verify the column order)
df_train_cont.iloc[99,3]

In [None]:

# Expand the contractions of every tweet; the result of expand_texts is materialised into a list
df_train_cont['text'] = list( cont.expand_texts(df_train_cont['text'].to_list()) )

# Same expansion for the test tweets
df_test_cont['text'] = list( cont.expand_texts(df_test_cont['text'].to_list()) )

In [None]:
# Same cell as inspected before the expansion, to compare the result
df_train_cont.iloc[99,3]

#### reemplazo los urls presentes con "URL"

In [None]:
# Sample cell inspected before the URL replacement (row position 4732, column position 3)
df_train_cont.iloc[4732,3]

In [None]:
# Pattern followed by the URLs in the tweets: scheme://host/path, where host and
# path are built from word characters, '-', '.' or %XX escapes.
# Written as a raw string so that \w and \d reach the regex engine untouched instead
# of being parsed as (invalid) Python string escapes, which raises a warning.
pattern = r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/(?:[-\w.]|(?:%[\da-fA-F]{2}))+'

def replace_urls(text):
    """Return ``text`` with every substring matching ``pattern`` replaced by ``'URL'``.

    Text without a match is returned unchanged. A URL with no path component
    (nothing after the host) does not match the pattern and is left as is.
    """
    return re.sub(pattern, 'URL', text)

In [None]:
# Replace the URLs of every tweet with the 'URL' token, in both frames
df_train_cont['text'] = df_train_cont['text'].apply(replace_urls)

df_test_cont['text'] = df_test_cont['text'].apply(replace_urls)

In [None]:
# Same sample cell after the URL replacement
df_train_cont.iloc[4732,3]

#### cuento la cantidad de palabras que tiene cada tweet

In [None]:
def word_count(text):
    """Return how many words ``text`` contains.

    A word is a maximal run of word characters (letters, digits, underscore),
    so punctuation and whitespace act as separators; ``"it's"`` counts as two.
    ``re.findall`` yields the same tokens as the regexp tokenizer used before,
    without building a tokenizer object on every call.
    """
    return len(re.findall(r'\w+', text))

In [None]:
# Store the number of words of each tweet in a new 'word_count' column
df_train_cont['word_count'] = df_train_cont['text'].apply(word_count)
df_test_cont['word_count'] = df_test_cont['text'].apply(word_count)

In [None]:
# Preview the frame with the new 'word_count' column
df_train_cont.head(3)

In [None]:
# Sample cell before the 'URL' tokens are removed
df_train_cont.iloc[4732,3]

Nos interesaba contar las URLs como palabras, por eso decidimos eliminarlas después de crear el 'word_count'.

In [None]:
def remove_urls(text):
    """Return ``text`` with every occurrence of the literal token ``URL`` deleted."""
    return re.compile('URL').sub('', text)

In [None]:
# Drop the 'URL' tokens now that they have been counted in 'word_count'
df_train_cont['text'] = df_train_cont['text'].apply(remove_urls)
# Fixed: the test column is now derived from the test frame. It used to be built from
# df_train_cont['text'], which replaced the test tweets with training tweets.
df_test_cont['text'] = df_test_cont['text'].apply(remove_urls)

In [None]:
# Same sample cell after the 'URL' tokens were removed
df_train_cont.iloc[4732,3]

## Preprocesamiento de datos

### Limpieza de los text

In [None]:
def procesar_slang(tweet):
    """Strip mis-encoded character sequences from ``tweet`` and decode three HTML entities.

    The substitutions are regex replacements applied one after another in the
    listed order; the order matters because later rules see the output of the
    earlier ones (for example the specific ``...When`` rule runs before the
    generic rule that deletes the same prefix).
    """
    substitutions = (
        # special / mis-encoded characters
        (r"\x89Û_", ""),
        (r"\x89ÛÒ", ""),
        (r"\x89ÛÓ", ""),
        (r"\x89ÛÏWhen", "When"),
        (r"\x89ÛÏ", ""),
        (r"China\x89Ûªs", "China's"),
        (r"let\x89Ûªs", "let's"),
        (r"\x89Û÷", ""),
        (r"\x89Ûª", ""),
        (r"\x89Û\x9d", ""),
        (r"å_", ""),
        (r"\x89Û¢", ""),
        # cannot match any more: the previous rule already deleted its prefix
        (r"\x89Û¢åÊ", ""),
        (r"fromåÊwounds", "from wounds"),
        (r"åÊ", ""),
        (r"åÈ", ""),
        (r"JapÌ_n", "Japan"),
        (r"Ì©", "e"),
        (r"å¨", ""),
        (r"SuruÌ¤", "Suruc"),
        (r"åÇ", ""),
        (r"å£3million", "3 million"),
        (r"åÀ", ""),
        # HTML entities
        (r"&gt;", ">"),
        (r"&lt;", "<"),
        (r"&amp;", "&"),
    )

    for regex, replacement in substitutions:
        tweet = re.sub(regex, replacement, tweet)

    return tweet

In [None]:
def clean_text(text):
    """Lower-case ``text`` and strip markup-like noise from it.

    Steps, in order: lower-case; delete newlines; delete ``[...]`` spans;
    delete ``<...>`` spans; delete every token that contains a digit; delete
    the characters ``= > < , * ; _ : # @ & '``.

    All patterns are raw strings so that ``\\[``, ``\\w`` and ``\\d`` reach the
    regex engine untouched instead of being parsed as (invalid) Python string
    escapes, which raises a warning.
    """
    text = text.lower()
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"[=><,*;_:#@&']", '', text)

    return text

In [None]:
def pre_process_text(text):
    """Clean one tweet and return it re-tokenised as a space-separated string.

    The text goes through ``procesar_slang`` and ``clean_text``, is split with a
    ``TweetTokenizer`` (``reduce_len=True``, ``strip_handles=False``) and the
    resulting tokens are joined back with single spaces.
    """
    cleaned = clean_text(procesar_slang(text))
    tokens = TweetTokenizer(reduce_len=True, strip_handles=False).tokenize(cleaned)
    return ' '.join(tokens)

In [None]:
# First tweet before the text clean-up
df_train_cont.iloc[0,3]

In [None]:
# Clean and re-tokenise every tweet in both frames
df_train_cont['text'] = df_train_cont['text'].apply(pre_process_text)
df_test_cont['text'] = df_test_cont['text'].apply(pre_process_text)

In [None]:
# First tweet after the text clean-up
df_train_cont.iloc[0,3]

In [None]:
# para instalar textblob:  
#pip install -U textblob

#para instalar los datos para usar textblob: 
#python -m textblob.download_corpora

from textblob import Word
from textblob import TextBlob

In [None]:
def lematizar_texto(text):
    """Lemmatise every whitespace-separated token of ``text``.

    Each token is passed through ``Word.lemmatize`` with the tags ``"n"``,
    ``"v"`` and ``"a"`` in that order, then once more with no tag. The
    lemmatised tokens are returned joined by single spaces.
    """
    def _lemma(token):
        # Chain the tagged passes, feeding each result into the next one
        for tag in ("n", "v", "a"):
            token = Word(token).lemmatize(tag)
        return Word(token).lemmatize()

    return " ".join(_lemma(token) for token in text.split())

In [None]:
# Sample tweet before lemmatisation
df_train_cont.iloc[99,3]

In [None]:
# Lemmatise every tweet in both frames
df_train_cont['text'] = df_train_cont['text'].apply(lematizar_texto)
df_test_cont['text'] = df_test_cont['text'].apply(lematizar_texto)

In [None]:
# Same sample tweet after lemmatisation
df_train_cont.iloc[99,3]

------------------------------------------------------

## Modelo Final: Red neuronal

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers import Bidirectional

#### abrimos y preparamos el pre-trained embedding

In [None]:
# Read the pre-trained embedding file and build a dictionary that maps each word
# to its vector. Every line is "<word> <v1> <v2> ...", split on whitespace.
embeddings_dictionary = dict()

# The context manager closes the file even if parsing a line raises
with open('data/glove.twitter.27B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

preparamos el texto del train para entrenar el modelo

In [None]:
# Fit the tokenizer on the training tweets
word_tok = Tokenizer(filters='=><*_,;:#@&\'')

word_tok.fit_on_texts(df_train_cont['text'])


# Vocabulary length (number of unique words + 1)
vocab_length = len(word_tok.word_index) + 1

# Encode the tweets as sequences of word indices
embedded_train_text = word_tok.texts_to_sequences(df_train_cont['text'])

# Length of the longest encoded tweet.
# Measured on the encoded sequences themselves: the 'word_count' column was computed
# earlier with a different tokenizer and before the text clean-up, so it does not
# bound the sequence lengths, and pad_sequences truncates anything longer than maxlen.
longest_text = max(len(sequence) for sequence in embedded_train_text)

# Pad every tweet at the end so that all sequences have length 'longest_text'
padded_train_text = pad_sequences(embedded_train_text, longest_text, padding='post')

In [None]:
# Build the embedding weight matrix: one 100-dimensional row per tokenizer index.
# Only words that have a pre-trained vector are filled in; all other rows stay at zero.
weights_matrix = np.zeros((vocab_length, 100))

for token, row in word_tok.word_index.items():
    if token in embeddings_dictionary:
        weights_matrix[row] = embeddings_dictionary[token]

In [None]:
# Cast the target column to int for training
targets = df_train_cont['target'].astype(int)

In [None]:
# Embedding layer initialised with the pre-trained weights and left trainable
embedding_layer = Embedding(vocab_length, 100, weights=[weights_matrix], input_length=longest_text, trainable=True)

# Embedding -> bidirectional LSTM -> two small ReLU layers -> single sigmoid output
model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(6, dropout= 0.2)),
    Dense(units=3, activation='relu'),
    Dense(units=3, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output
model.summary()

In [None]:
# Train on the padded training sequences for 4 epochs
model.fit(padded_train_text, targets, epochs=4, verbose=1)

le doy formato al texto del test para poder predecir

In [None]:
# Encode the test tweets with the tokenizer fitted on train, then pad them at the end
# to the same length as the training sequences
embedding_test = word_tok.texts_to_sequences(df_test_cont['text'])
padded_test = pad_sequences(embedding_test, maxlen=longest_text, padding='post')

In [None]:
# Threshold the sigmoid output at 0.5 to obtain 0/1 classes.
# Sequential.predict_classes is no longer available in newer Keras/TensorFlow releases;
# for a single sigmoid unit this expression yields the same labels.
predictions = (model.predict(padded_test) > 0.5).astype("int32")

In [None]:
# 'predictions' holds one single-element row per tweet; flatten it into one plain
# list with all the predictions. (The previous bare `predictions.tolist()` call
# discarded its result and has been dropped.)
form_predictions = predictions.ravel().tolist()

In [None]:
# Build the two-column frame Kaggle expects (id, target) and save it as CSV
pred = pd.DataFrame({'id': df_test_cont['id'], 'target': form_predictions})

pred.to_csv('results/resultKeras.csv', index=False)