# Notebook oficial - TP Datos

In [1]:
# Importando librerias
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import warnings

import re
import nltk
import string
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
#Modelos
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the train and test datasets from CSV
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
# Report the dimensions of the training set
print('La dimension del dataset es: ',df_train.shape[0], 'registros,', df_train.shape[1],'columnas')
# Preview the first records
df_train.head(5)
# Data:
# id - unique identifier of each tweet
# keyword - a keyword for the tweet (may be missing)
# location - location the tweet was sent from (may be missing)
# text - the text of the tweet
# target - whether the tweet is about a real disaster (1) or not (0)

La dimension del dataset es:  7613 registros, 5 columnas


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Set explicit column dtypes
df_train['id'] = df_train['id'].astype(int)
# Missing keywords are replaced by the sentinel string "noKeyword"
df_train['keyword'] = df_train['keyword'].fillna(value = "noKeyword").astype('object')
df_train['location'] = df_train['location'].astype('object')
df_train['text'] = df_train['text'].astype('object')
df_train['target'] = df_train['target'].astype('bool')
# Same conversions for the test set (no 'target' conversion is applied to it)
df_test['id'] = df_test['id'].astype(int)
df_test['keyword'] = df_test['keyword'].fillna(value = "noKeyword").astype('object')
df_test['location'] = df_test['location'].astype('object')
df_test['text'] = df_test['text'].astype('object')

## Filtrado de datos

In [4]:
# Drop duplicated rows from the training set (in place)
df_train.drop_duplicates(inplace=True)

In [5]:
df_train.describe(include="all")

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7613,5080,7613,7613
unique,,222,3341,7503,2
top,,noKeyword,USA,11-Year-Old Boy Charged With Manslaughter of T...,False
freq,,61,104,10,4342
mean,5441.934848,,,,
std,3137.11609,,,,
min,1.0,,,,
25%,2734.0,,,,
50%,5408.0,,,,
75%,8146.0,,,,


Hay algunos registros cuyo label es incorrecto, los corregimos

In [6]:
# Ids of the training tweets whose target label is considered incorrect
mislabelled_ids = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]

# Show the affected rows
df_train.loc[ df_train['id'].isin(mislabelled_ids)]

Unnamed: 0,id,keyword,location,text,target
229,328,annihilated,,Ready to get annihilated for the BUCS game,True
301,443,apocalypse,,Short Reading\r\n\r\nApocalypse 21:1023 \r\n\r...,True
356,513,army,Studio,But if you build an army of 100 dogs and their...,True
1822,2619,crashed,,My iPod crashed..... \r\n#WeLoveYouLouis \r\n#...,True
2536,3640,desolation,"Quilmes , Arg",This desperation dislocation\r\nSeparation con...,True
2715,3900,devastated,PG Chillin!,Man Currensy really be talkin that talk... I'd...,True
3024,4342,dust%20storm,chicago,Going to a fest? Bring swimming goggles for th...,True
4068,5781,forest%20fires,,Campsite recommendations \r\nToilets /shower \...,True
4609,6552,injury,Saint Paul,My prediction for the Vikings game this Sunday...,True
4611,6554,injury,,Dante Exum's knee injury could stem Jazz's hop...,True


In [7]:
def relabel_target(tweet_id, target):
    """Return the corrected label for one tweet.

    Tweets whose id appears in the module-level ``mislabelled_ids`` list are
    forced to ``False``; any other tweet keeps the ``target`` it was given.
    """
    return False if tweet_id in mislabelled_ids else target

In [8]:
# Overwrite the target of every mislabelled tweet with False, using a
# boolean mask on the id column instead of a row-by-row apply.
is_mislabelled = df_train['id'].isin(mislabelled_ids)
df_train.loc[is_mislabelled, 'target'] = False

# Show the relabelled rows
df_train.loc[is_mislabelled]

Unnamed: 0,id,keyword,location,text,target
229,328,annihilated,,Ready to get annihilated for the BUCS game,False
301,443,apocalypse,,Short Reading\r\n\r\nApocalypse 21:1023 \r\n\r...,False
356,513,army,Studio,But if you build an army of 100 dogs and their...,False
1822,2619,crashed,,My iPod crashed..... \r\n#WeLoveYouLouis \r\n#...,False
2536,3640,desolation,"Quilmes , Arg",This desperation dislocation\r\nSeparation con...,False
2715,3900,devastated,PG Chillin!,Man Currensy really be talkin that talk... I'd...,False
3024,4342,dust%20storm,chicago,Going to a fest? Bring swimming goggles for th...,False
4068,5781,forest%20fires,,Campsite recommendations \r\nToilets /shower \...,False
4609,6552,injury,Saint Paul,My prediction for the Vikings game this Sunday...,False
4611,6554,injury,,Dante Exum's knee injury could stem Jazz's hop...,False


In [9]:
#----------WARNING---------------#

# This library can only be installed with JDK VERSION 8;
# with any other version it CANNOT BE INSTALLED.

# To install it:
# pip install pycontractions
from pycontractions import Contractions

In [10]:
# Build the contraction expander and load its models.
# NOTE(review): "glove-twitter-100" looks like a pre-trained model name, not a secret - confirm
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [11]:
# Work on copies so the text preprocessing below does not modify df_train / df_test
df_train_cont = df_train.copy()
df_test_cont = df_test.copy()

In [12]:
#revisamos un tweet que tiene alguna contraccion
df_train_cont.iloc[99,3]

"only had a car for not even a week and got in a fucking car accident .. Mfs can't fucking drive ."

In [13]:

# Expand the contractions in every tweet of the training set ...
df_train_cont['text'] = list( cont.expand_texts(df_train_cont['text'].to_list()) )

# ... and of the test set
df_test_cont['text'] = list( cont.expand_texts(df_test_cont['text'].to_list()) )

In [14]:
df_train_cont.iloc[99,3]

'only had a car for not even a week and got in a fucking car accident .. Mfs cannot fucking drive .'

#### reemplazo los urls presentes con "URL"

In [15]:
df_train_cont.iloc[4732,3]

'Check out my Lava lamp dude ???? http://t.co/To9ViqooFv'

In [16]:
# Pattern followed by the URLs in the tweets. It is a raw string so that the
# regex escapes (\w, \d) are not parsed as invalid Python string escapes.
pattern = r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/(?:[-\w.]|(?:%[\da-fA-F]{2}))+'

def replace_urls(text):
    """Replace every URL found in ``text`` with the literal token ``'URL'``.

    Only http/ftp/https URLs made of a host, a ``/`` and at least one path
    character are matched (see ``pattern``).

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        ``text`` with each matched URL replaced by ``'URL'``.
    """
    return re.sub(pattern, 'URL', text)

In [17]:
# Replace the URLs of both datasets with the 'URL' placeholder
df_train_cont['text'] = df_train_cont['text'].apply(replace_urls)
df_test_cont['text'] = df_test_cont['text'].apply(replace_urls)

In [18]:
df_train_cont.iloc[4732,3]

'Check out my Lava lamp dude ???? URL'

#### cuento la cantidad de palabras que tiene cada tweet

In [19]:
def word_count(text):
    r"""Count the words in ``text``.

    A word is a maximal run of regex word characters (``\w+``), the same
    pattern the original per-call tokenizer was built with. Counting the
    matches directly avoids constructing a tokenizer object for every tweet.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    int
        Number of ``\w+`` matches in ``text`` (0 for an empty string).
    """
    return len(re.findall(r'\w+', text))

In [20]:
# Store the number of words of every tweet in a new 'word_count' column
df_train_cont['word_count'] = df_train_cont['text'].apply(word_count)
df_test_cont['word_count'] = df_test_cont['text'].apply(word_count)

In [21]:
df_train_cont.head(3)

Unnamed: 0,id,keyword,location,text,target,word_count
0,1,noKeyword,,Our Deeds are the Reason of this #earthquake M...,True,13
1,4,noKeyword,,Forest fire near La Ronge Sask. Canada,True,7
2,5,noKeyword,,All residents asked to 'shelter in place' are ...,True,22


In [22]:
df_train_cont.iloc[4732,3]

'Check out my Lava lamp dude ???? URL'

Nos interesaba contar cada URL como una palabra, por eso las eliminamos recién después de crear la columna 'word_count'.

In [23]:
def remove_urls(text):
    """Return ``text`` with every occurrence of the token ``'URL'`` removed."""
    return text.replace('URL', '')

In [24]:
# Strip the 'URL' placeholder from both datasets.
df_train_cont['text'] = df_train_cont['text'].apply(remove_urls)
# The test set must be built from its OWN text column: reading from
# df_train_cont here would replace the test tweets with training tweets.
df_test_cont['text'] = df_test_cont['text'].apply(remove_urls)

In [25]:
df_train_cont.iloc[4732,3]

'Check out my Lava lamp dude ???? '

## Preprocesamiento de datos

### Limpieza de los text

In [26]:
# Ordered (regex, replacement) pairs applied by procesar_slang. Order matters:
# the specific patterns (e.g. "China\x89Ûªs") must run before the generic
# ones that delete the same mis-encoded characters.
_SLANG_REPLACEMENTS = (
    # mis-encoded special characters
    (r"\x89Û_", ""),
    (r"\x89ÛÒ", ""),
    (r"\x89ÛÓ", ""),
    (r"\x89ÛÏWhen", "When"),
    (r"\x89ÛÏ", ""),
    (r"China\x89Ûªs", "China's"),
    (r"let\x89Ûªs", "let's"),
    (r"\x89Û÷", ""),
    (r"\x89Ûª", ""),
    (r"\x89Û\x9d", ""),
    (r"å_", ""),
    (r"\x89Û¢", ""),
    # the previous entry already removes "\x89Û¢", so this one cannot match
    (r"\x89Û¢åÊ", ""),
    (r"fromåÊwounds", "from wounds"),
    (r"åÊ", ""),
    (r"åÈ", ""),
    (r"JapÌ_n", "Japan"),
    (r"Ì©", "e"),
    (r"å¨", ""),
    (r"SuruÌ¤", "Suruc"),
    (r"åÇ", ""),
    (r"å£3million", "3 million"),
    (r"åÀ", ""),
    # HTML entities
    (r"&gt;", ">"),
    (r"&lt;", "<"),
    (r"&amp;", "&"),
)


def procesar_slang(tweet):
    """Remove mis-encoded character sequences and unescape HTML entities.

    Every pair of ``_SLANG_REPLACEMENTS`` is applied to ``tweet`` with
    ``re.sub``, in order, and the resulting string is returned.
    """
    for regex, replacement in _SLANG_REPLACEMENTS:
        tweet = re.sub(regex, replacement, tweet)
    return tweet

In [27]:
def clean_text(text):
    r"""Lower-case ``text`` and strip noise from it.

    Steps, in order:
      1. lower-case the text;
      2. delete newline characters (nothing is inserted in their place, so
         words separated only by a newline end up joined);
      3. delete anything enclosed in square brackets;
      4. delete anything enclosed in angle brackets;
      5. delete every word that contains a digit;
      6. delete the characters ``= > < , * ; _ : # @ & '``.

    The regexes are raw strings so that ``\[``, ``\w`` and ``\d`` are not
    parsed as invalid Python string escapes.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"[=><,*;_:#@&']", '', text)

    return text

In [28]:
def pre_process_text(text):
    """Run the full text-cleaning pipeline on one tweet.

    The tweet goes through ``procesar_slang`` and ``clean_text``, is tokenized
    with a ``TweetTokenizer`` (``reduce_len=True``, ``strip_handles=False``)
    and the tokens are joined back with single spaces.
    """
    cleaned = clean_text(procesar_slang(text))
    tokens = TweetTokenizer(reduce_len=True, strip_handles=False).tokenize(cleaned)
    return ' '.join(tokens)

In [29]:
df_train_cont.iloc[0,3]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [30]:
# Clean and re-tokenize the text of both datasets
df_train_cont['text'] = df_train_cont['text'].apply(pre_process_text)
df_test_cont['text'] = df_test_cont['text'].apply(pre_process_text)

In [31]:
df_train_cont.iloc[0,3]

'our deeds are the reason of this earthquake may allah forgive us all'

In [32]:
# To install textblob:
#pip install -U textblob

# To install the data textblob needs:
#python -m textblob.download_corpora

from textblob import Word
from textblob import TextBlob

In [33]:
def lematizar_texto(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is lemmatized as a noun ("n"), then as a verb ("v"), then as
    an adjective ("a"), and finally once more with no explicit part of speech.
    The lemmas are joined back with single spaces.
    """
    def _lemma(token):
        for pos in ("n", "v", "a"):
            token = Word(token).lemmatize(pos)
        return Word(token).lemmatize()

    return " ".join(_lemma(token) for token in text.split())

In [34]:
df_train_cont.iloc[99,3]

'only had a car for not even a week and got in a fucking car accident .. mfs cannot fucking drive .'

In [35]:
# Lemmatize the text of both datasets
df_train_cont['text'] = df_train_cont['text'].apply(lematizar_texto)
df_test_cont['text'] = df_test_cont['text'].apply(lematizar_texto)

In [36]:
df_train_cont.iloc[99,3]

'only have a car for not even a week and get in a fuck car accident .. mf cannot fuck drive .'

------------------------------------------------------

## Modelo Final: Red neuronal

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers import Bidirectional

#### abrimos y preparamos el pre-trained embedding

In [38]:
# Read the pre-trained GloVe embedding into a dictionary that maps every
# word of the file to its vector.

embeddings_dictionary = dict()

# The context manager closes the file even if parsing a line raises.
with open('data/glove.twitter.27B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        # Each line is "<word> <v1> <v2> ...", separated by whitespace
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

Preparamos el texto del set de train para entrenar el modelo.

In [39]:
# Fit the Keras tokenizer on the preprocessed training tweets

word_tok = Tokenizer(filters='=><*_,;:#@&\'')

word_tok.fit_on_texts(df_train_cont['text'])


# Vocabulary length (number of unique words + 1)
vocab_length = len(word_tok.word_index) + 1

# Encode the tweets as sequences of word indexes
embedded_train_text = word_tok.texts_to_sequences(df_train_cont['text'])

# Number of words of the longest tweet.
# NOTE(review): 'word_count' was computed before the text was cleaned and with a
# different tokenization than word_tok, so a sequence may be longer than this
# value and get truncated by pad_sequences - confirm this is acceptable
longest_text = max( df_train_cont['word_count'] )

# Pad at the end so that every sequence has length 'longest_text'
padded_train_text = pad_sequences(embedded_train_text, longest_text, padding='post')

In [40]:
# Build the weights matrix: row `index` holds the pre-trained embedding of the
# word the tokenizer mapped to `index`. Only words seen in the training text
# are included.

# 100 columns, one per value of each embedding vector (the 100d GloVe file)
weights_matrix = np.zeros((vocab_length, 100))

for word, index in word_tok.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    
    # Words missing from the pre-trained embedding keep an all-zero row
    if embedding_vector is not None:
        weights_matrix[index] = embedding_vector

In [41]:
# Cast the boolean target to int
targets = df_train_cont['target'].astype(int)

In [42]:
# Sequential network: embedding -> bidirectional LSTM -> dense layers -> sigmoid
model = Sequential()

# Embedding layer initialised with the pre-trained weights; trainable=True lets
# training update those weights
embedding_layer = Embedding(vocab_length, 100, weights=[weights_matrix], input_length=longest_text, trainable=True)

model.add(embedding_layer)
# Bidirectional LSTM with 6 units per direction and dropout 0.2
model.add(Bidirectional(LSTM(6, dropout= 0.2)))

# Two small fully connected layers with ReLU activation
model.add(Dense(units=3, activation='relu'))
model.add(Dense(units=3, activation='relu'))

# Single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))

In [43]:
# Binary classification: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 34, 100)           1352900   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 12)                5136      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 39        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 4         
Total params: 1,358,091
Trainable params: 1,358,091
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train for 4 epochs on the padded training sequences
model.fit(padded_train_text, targets, epochs=4, verbose=1)

Le damos al texto del set de test el mismo formato para poder predecir.

In [45]:
# Encode the test tweets with the tokenizer fitted on the training text and
# pad them to the same length used for training
embedding_test = word_tok.texts_to_sequences(df_test_cont['text'])
padded_test = pad_sequences(embedding_test, longest_text, padding='post')

In [46]:
# Predict the class of every test tweet.
# NOTE(review): predict_classes is presumably tied to the older Keras version this
# notebook imports from - confirm it exists in the installed version
predictions = model.predict_classes(padded_test)

In [47]:
# `predictions` is an array with one single-element row per tweet; flatten it
# into one plain list holding a prediction per tweet.
form_predictions = predictions.ravel().tolist()

In [48]:
# Build the submission in the format Kaggle expects (id, target) and save it as CSV
pred = pd.DataFrame()
pred['id'] = df_test_cont['id']
pred['target'] = form_predictions

# NOTE(review): assumes the 'results' directory already exists - confirm
pred.to_csv('results/resultKeras.csv',index=False)