# Importación de librerías

In [348]:
import pandas as pd
import numpy as np
import re as re
import string
import nltk
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn import metrics 
from sklearn.svm import LinearSVC

import gensim
from gensim.parsing.preprocessing import remove_stopwords

[nltk_data] Downloading package wordnet to /home/dario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Lectura de los datos

Leemos los datos que usaremos para entrenar nuestro modelo.

## Lectura

In [349]:
data = pd.read_csv('train.csv', usecols = ['text','target'])

### Primer vistazo

In [350]:
data

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


# Limpieza de los datos

Realizamos una limpieza del texto que se corresponde a los tweets para obtener un mejor resultado a la hora de procesar los datos

## Definicion de funciones auxiliares

In [351]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

#Eliminar stopwords
def eliminar_stopwords(text):
    return remove_stopwords(text)

## Aplicamos las funciones a nuestros datos

In [352]:
data['text'] = data['text'].apply(eliminar_numeros)
data['text'] = data['text'].apply(eliminar_puntuacion)
data['text'] = data['text'].apply(minusculas)
data['text'] = data['text'].apply(eliminar_caracteres)
data['text'] = data['text'].apply(eliminar_url)
data['text'] = data['text'].apply(eliminar_stopwords)

### Salida

In [353]:
data

Unnamed: 0,text,target
0,deeds reason earthquake allah forgive,1
1,forest near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...
7608,giant cranes holding bridge collapse nearby ho...,1
7609,ariaahrary thetawniest control wild fires cali...,1
7610,m utckm s volcano hawaii httptcozdtoydebj,1
7611,police investigating ebike collided car little...,1


## Elimino duplicados

In [354]:
data[data.duplicated()]

Unnamed: 0,text,target
48,check httptcoroinsmejj httptcotjzjin httptcoyd...,0
115,ir icemoon aftershock httptcovampodgyw djicemo...,0
119,ir icemoon aftershock httptcothyzomvwu djicemo...,0
164,experts france begin examining airplane debris...,1
228,luka die annihilated alois trancy,0
...,...,...
7604,worldnews fallen powerlines glink tram update ...,1
7607,stormchase violent record breaking ef el reno ...,1
7609,ariaahrary thetawniest control wild fires cali...,1
7610,m utckm s volcano hawaii httptcozdtoydebj,1


Luego de una primera limpieza notamos que aparecen algunos registros duplicados, esto se debe a que el campo de texto variaba en algo en específico como por ejemplo el url o si la letra está en mayúscula o minúscula, etc.

In [355]:
data.drop_duplicates(inplace = True)

# Remuevo contracciones

In [356]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks"
}


def eliminar_contracciones(text):
    resultado = ""
    for word in text.split():
        if word in contractions.keys():
            resultado += contractions[word]
        else:
            resultado += word
        resultado += " "
    return resultado

### Ejemplo

In [357]:
text = "what's your name?"
eliminar_contracciones(text)

'what is your name? '

In [358]:
data['text'] = data['text'].apply(eliminar_contracciones)

In [359]:
data

Unnamed: 0,text,target
0,deeds reason earthquake allah forgive,1
1,forest near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...
7603,officials quarantine place alabama home possib...,1
7605,flip im walmart bomb evacuate stay tuned blow,1
7606,suicide bomber kills saudi security site mosqu...,1
7608,giant cranes holding bridge collapse nearby ho...,1


# Stemming

Stemming es un método para reducir una palabra a su stem. Aumenta el recall ya que, por ejemplo, ejemplo una consulta sobre "bibliotecas" también encuentra documentos en los que solo aparezca "bibliotecario" porque el stem de las dos palabras es el mismo.

In [360]:
def stemmer(text):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    
    stemmer = nltk.stem.PorterStemmer()
    
    return " ".join(stemmer.stem(token) for token in tokens) 

In [361]:
data['text'] = data['text'].apply(stemmer)

In [362]:
data

Unnamed: 0,text,target
0,deed reason earthquak allah forgiv,1
1,forest near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...
7603,offici quarantin place alabama home possibl eb...,1
7605,flip im walmart bomb evacu stay tune blow,1
7606,suicid bomber kill saudi secur site mosqu reut...,1
7608,giant crane hold bridg collaps nearbi home htt...,1


# Lemmatization

La lematización es un proceso lingüístico que consiste en, dada una forma flexionada (es decir, en plural, en femenino, conjugada, etc), hallar el lema correspondiente. Hay varias palabras diferentes en representación de una misma palabra, por ejemplo sabemos que canto, cantas, canta, cantamos, cantáis, cantan son distintas formas (conjugaciones) de un mismo verbo (cantar)

In [363]:
def lemmatizer(text):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    
    lemmatizer=nltk.stem.WordNetLemmatizer()
    
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [364]:
data['text'] = data['text'].apply(lemmatizer)

In [365]:
data

Unnamed: 0,text,target
0,deed reason earthquak allah forgiv,1
1,forest near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...
7603,offici quarantin place alabama home possibl eb...,1
7605,flip im walmart bomb evacu stay tune blow,1
7606,suicid bomber kill saudi secur site mosqu reut...,1
7608,giant crane hold bridg collaps nearbi home htt...,1


# Particionando los datos

Particionamos nuestros datos en dos ya que una parte de ellos será utilizada para entrenar a nuestro algorítmo y la otra será utilizada para probarlo. En nuestro caso las variables X respresentan a el texto de los tweets e Y el target asociado a los tweets. La intención de nuestro algorítmo es dado un determinado texto de un tweet(x) poder predecir su target(y)

In [366]:
X_train,X_test,y_train,y_test= train_test_split(data.text, data.target, test_size=0.1,random_state=1)

In [367]:
len(X_train)

6742

In [368]:
len(X_test)

750

# Bernoulli Naive Bayes (Naive Bayes Simple)

En primer lugar vamospor el camino de un modelo Naive Bayes Simple, tambi ́en conocido como Naive Bayes Bernoulli.En este caso un determinado token puede aparecer o no en un determinado documento. Bajo estaidea construiremos una matriz donde las columnas ser ́an los tokens extra ́ıdos de la totalidad detweets de nuestros datos y cada fila se corresponder ́a con un tweet. Si llamamos A a esta matrizentonces el elemento aij ser ́a 1 si el token j aparece en el tweet i o ser ́a 0 en caso contrario.

## Ejemplo

In [369]:
ejemplo = ['hola mi nombre mi federico', 'me gusta mi nombre', 'mi nombre nombre es federico']

In [370]:
vectEjemplo = CountVectorizer(analyzer='word', binary = True)

In [371]:
vectEjemplo.fit(ejemplo)

CountVectorizer(binary=True)

In [372]:
vectEjemplo.get_feature_names()

['es', 'federico', 'gusta', 'hola', 'me', 'mi', 'nombre']

In [373]:
ejemplo_dtm = vectEjemplo.transform(ejemplo)

In [374]:
ejemplo_array = ejemplo_dtm.toarray()

In [375]:
pd.DataFrame(ejemplo_array,columns = vectEjemplo.get_feature_names())

Unnamed: 0,es,federico,gusta,hola,me,mi,nombre
0,0,1,0,1,0,1,1
1,0,0,1,0,1,1,1
2,1,1,0,0,0,1,1


## Armando la matriz para nuestros datos

### Instanciando el vectorizador

In [376]:
vectBernoulliNB = CountVectorizer(analyzer='word',binary=True)

### Obtenemos una lista con todos los tokens obtenidos de los tweets

In [377]:
vectBernoulliNB.fit(X_train)

CountVectorizer(binary=True)

In [378]:
vectBernoulliNB.get_feature_names()[:30]

['aa',
 'aaaa',
 'aaaaaaallll',
 'aaaaaand',
 'aaarrrgghhh',
 'aaceorg',
 'aampb',
 'aampw',
 'aan',
 'aannnnd',
 'aar',
 'aaronthefm',
 'aashiqui',
 'ab',
 'aba',
 'abandon',
 'abandonedp',
 'abbandon',
 'abbott',
 'abbruchsimul',
 'abbswinston',
 'abbyairshow',
 'abc',
 'abcchicago',
 'abceyewit',
 'abcnew',
 'abcnorio',
 'abe',
 'aberdeen',
 'aberdeenfanpag']

### Armamos la matriz

Ahora  armemos  la  matriz  real  para  nuestros  datos  de  entrenamiento.  Sin  embargo,  como  lamayorıa de los tweets utilizan un conjunto reducido de tokens con respecto al total de tokens con el que trabajaremos, se espera que la mayoroıa de los elementos de la matriz sean cero y esto lo observaremos rapidamente a continuación

In [379]:
X_train_dtm_BernoulliNB = vectBernoulliNB.transform(X_train)
X_train_dtm_BernoulliNB 

<6742x16839 sparse matrix of type '<class 'numpy.int64'>'
	with 59055 stored elements in Compressed Sparse Row format>

In [380]:
X_train_Bernoulli_array = X_train_dtm_BernoulliNB.toarray()

In [381]:
pd.DataFrame(X_train_Bernoulli_array, columns = vectBernoulliNB.get_feature_names())

Unnamed: 0,aa,aaaa,aaaaaaallll,aaaaaand,aaarrrgghhh,aaceorg,aampb,aampw,aan,aannnnd,...,zone,zonesthank,zonewolf,zoom,zotar,zouma,zourryart,zrnf,zxatheti,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Entrenando el modelo

Usaremos el clasificador Bernoulli Naive Bayes para realizar la clasificación

In [382]:
clasificadorBernoulliNB = BernoulliNB()

In [383]:
%time clasificadorBernoulliNB.fit(X_train_dtm_BernoulliNB, y_train)

CPU times: user 6.25 ms, sys: 0 ns, total: 6.25 ms
Wall time: 5.32 ms


BernoulliNB()

## Probamos el modelo

### Sobre el set de train

In [384]:
prediccionesBernoulli_train = clasificadorBernoulliNB.predict(X_train_dtm_BernoulliNB)

In [385]:
metrics.accuracy_score(y_train,prediccionesBernoulli_train)

0.9040344111539602

### Sobre el set de test

In [386]:
X_test_dtm_BernoulliNB = vectBernoulliNB.transform(X_test)
X_test_dtm_BernoulliNB 

<750x16839 sparse matrix of type '<class 'numpy.int64'>'
	with 5212 stored elements in Compressed Sparse Row format>

In [387]:
prediccionesBernoulli_test = nb.predict(X_test_dtm_BernoulliNB)

In [388]:
metrics.accuracy_score(y_test,prediccionesBernoulli_test)

0.808

# Multinomial Naive Bayes

Si elegimos el modelo Naive Bayes Multinomial un determinado token este puede aparecer nveces en un determinado tweet. Bajo esta idea construiremos una matriz donde las columnas ser ́anlos tokens extra ́ıdos de la totalidad de tweets de nuestros datos y cada fila se corresponder ́a conun tweet. Si llamamos A a esta matriz entonces el elemento aij ser ́a la cantidad de apariciones deltoken j en el tweet i. 

## Ejemplo

Repitamos el ejemplo anterior pero ahora aplicando esta nueva forma de construcción para la matriz

Tomemos tres tweets cualesquiera.

In [389]:
ejemplo = ['hola mi nombre mi federico', 'me gusta mi nombre', 'mi nombre nombre es federico']

In [390]:
vectEjemplo2 = CountVectorizer(analyzer='word',binary=False)

In [391]:
vectEjemplo2.fit(ejemplo)

CountVectorizer()

In [392]:
vectEjemplo2.get_feature_names()

['es', 'federico', 'gusta', 'hola', 'me', 'mi', 'nombre']

In [393]:
ejemplo2_dtm = vectEjemplo2.transform(ejemplo)
ejemplo2_dtm

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [394]:
ejemplo_array = ejemplo2_dtm.toarray()

In [395]:
pd.DataFrame(ejemplo_array,columns = vectEjemplo2.get_feature_names())

Unnamed: 0,es,federico,gusta,hola,me,mi,nombre
0,0,1,0,1,0,2,1
1,0,0,1,0,1,1,1
2,1,1,0,0,0,1,2


### Armamos la matriz

Ahora  armemos  la  matriz  real  para  nuestros  datos  de  entrenamiento. Al igual que en el caso anterior se espera que la mayo parte de los elementos de la matriz sea cero.

### Instanciando el vectorizador

In [396]:
vectMultinomialNB  = CountVectorizer(analyzer='word',binary=False)

### Obtenemos una lista con todos los tokens obtenidos de los tweets

In [397]:
vectMultinomialNB.fit(X_train)

CountVectorizer()

In [398]:
vectMultinomialNB.get_feature_names()[:30]

['aa',
 'aaaa',
 'aaaaaaallll',
 'aaaaaand',
 'aaarrrgghhh',
 'aaceorg',
 'aampb',
 'aampw',
 'aan',
 'aannnnd',
 'aar',
 'aaronthefm',
 'aashiqui',
 'ab',
 'aba',
 'abandon',
 'abandonedp',
 'abbandon',
 'abbott',
 'abbruchsimul',
 'abbswinston',
 'abbyairshow',
 'abc',
 'abcchicago',
 'abceyewit',
 'abcnew',
 'abcnorio',
 'abe',
 'aberdeen',
 'aberdeenfanpag']

### Armamos la matriz

In [399]:
X_train_dtm_MultinomialNB = vectMultinomialNB.transform(X_train)
X_train_dtm_MultinomialNB

<6742x16839 sparse matrix of type '<class 'numpy.int64'>'
	with 59055 stored elements in Compressed Sparse Row format>

In [400]:
X_train_Multinomila_array=X_train_dtm_MultinomialNB.toarray()

In [401]:
pd.DataFrame(X_train_Multinomila_array ,columns = vectMultinomialNB.get_feature_names())

Unnamed: 0,aa,aaaa,aaaaaaallll,aaaaaand,aaarrrgghhh,aaceorg,aampb,aampw,aan,aannnnd,...,zone,zonesthank,zonewolf,zoom,zotar,zouma,zourryart,zrnf,zxatheti,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Entrenando el modelo

Usaremos el clasificador multinomial Naive Bayes para realizar  la clasificación

In [402]:
clasificadorMultinomialNB = MultinomialNB()

In [403]:
%time clasificadorMultinomialNB.fit(X_train_dtm_MultinomialNB, y_train)

CPU times: user 4.94 ms, sys: 97 µs, total: 5.04 ms
Wall time: 6.11 ms


MultinomialNB()

## Probamos el modelo

### Sobre el set de train

In [404]:
prediccionesMultinomial_train = clasificadorMultinomialNB.predict(X_train_dtm_MultinomialNB)

In [405]:
metrics.accuracy_score(y_train,prediccionesMultinomial_train)

0.915752002373183

### Sobre el set de test

In [406]:
X_test_dtm_MultinomialNB = vectMultinomialNB.transform(X_test)
X_test_dtm_MultinomialNB

<750x16839 sparse matrix of type '<class 'numpy.int64'>'
	with 5212 stored elements in Compressed Sparse Row format>

In [407]:
prediccionesMultinomial_test = clasificadorMultinomialNB.predict(X_test_dtm_MultinomialNB)

In [408]:
metrics.accuracy_score(y_test,prediccionesMultinomial_test)

0.804

# Preparando el submit de Kaggle

### Leemos los datos

In [423]:
submit= pd.read_csv('test.csv')

In [424]:
submit

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


### Limpiamos los datos

In [425]:
submit['text'] = submit['text'].apply(eliminar_numeros)
submit['text'] = submit['text'].apply(eliminar_puntuacion)
submit['text'] = submit['text'].apply(minusculas)
submit['text'] = submit['text'].apply(eliminar_url)
submit['text'] = submit['text'].apply(remove_stopwords)
submit['text'] = submit['text'].apply(stemmer)
submit['text'] = submit['text'].apply(lemmatizer)
submit['text'] = submit['text'].apply(eliminar_contracciones)

In [426]:
submit

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe
2,3,,,forest spot pond gee flee street save
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill china taiwan
...,...,...,...,...
3258,10861,,,earthquak safeti lo angel ûò safeti fasten xrwn
3259,10865,,,storm ri wors hurrican cityampoth hardest hit ...
3260,10868,,,green line derail chicago httptcoutbxlcbiuy
3261,10874,,,meg issu hazard weather outlook hwo httptcoxrb...


### Predecimos usando Bernoulli Naive Bayes

In [427]:
texts = vectBernoulliNB.transform(submit['text'])
predicciones_kaggle = clasificadorBernoulliNB.predict(texts)
submitRes1 = pd.DataFrame(submit['id'])
submitRes1['target'] = predicciones_kaggle
#submitRes1.to_csv('SUBMITS/submission-Beronoulli-bayes.csv',index=False)

### Predecimos usando Multinomial Naive Bayes

In [428]:
texts = vectMultinomialNB.transform(submit['text'])
predicciones_kaggle = clasificadorMultinomialNB.predict(texts)
submitRes2 = pd.DataFrame(submit['id'])
submitRes2['target'] = predicciones_kaggle
#submitRes2.to_csv('SUBMITS/submission-Multinomial-bayes.csv',index=False)