# Segunda semana - Clasificación de textos usando Naive Bayes

La segunda semana hablaron sobre otra técnica de clasificación: Naive Bayes.

In [1]:
from IPython.display import Image

## Leyendo los datos

In [2]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np

# Descargando tweets
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\jahaz\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Seleccionando una lista de tweets positivos y negativos
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
# Diviendo la lista de tweets en entrenamiento y test

train_positive_tweets = all_positive_tweets[:int(len(all_positive_tweets) * 0.8)]
train_negative_tweets = all_negative_tweets[:int(len(all_positive_tweets) * 0.8)]

test_positive_tweets = all_positive_tweets[int(len(all_positive_tweets) * 0.8):]
test_negative_tweets = all_negative_tweets[int(len(all_positive_tweets) * 0.8):]

In [5]:
# juntando los textos y generando una target (0: negativo, 1: positivo)

train_tweets = train_positive_tweets + train_negative_tweets
train_target = [1] * len(train_positive_tweets) + [0] * len(train_negative_tweets)

test_tweets = test_positive_tweets + test_negative_tweets
test_target = [1] * len(test_positive_tweets) + [0] * len(test_negative_tweets)

## Entrenamiento Naive Bayes

### Paso 0: recolectar datos

### Paso 1: Preprocesamiento de datos

In [6]:
# Descargando stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

import string
import re

nltk.download('stopwords')

def procesa_tweet(tweet):
    '''
    Limpiamos un tweet con información importante para entrenar los modelos
        input: tweet
        output: tweet limpio
    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # Eliminamos símbolos de tickets de mercado $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # Eliminamos estilos antiguos de retweet "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Eliminamos URL
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # Eliminamos hashtags
    # Solo eliminamos el símbolo hash # en la palabra
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    
    for word in tweet_tokens:
        if word not in stopwords_english and word not in string.punctuation:
            tweets_clean.append(stemmer.stem(word))
    
    return tweets_clean

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jahaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
print(train_tweets[0])
print(procesa_tweet(train_tweets[0]))

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


### Paso 2: Word Count

In [8]:
from tqdm.notebook import tqdm

def genera_frecuencias(lista_tweets, lista_target):
    ''' Generamos un diccionario con la cantidad de casos positivos
        y negativos para cada palabra 
    '''
    
    frecuencias = {}
    # Recorremos cada par tweet - target
    for tweet, target in tqdm(zip(lista_tweets, lista_target), total=len(lista_tweets)):
        # Tokenizamos el tweet
        for word in procesa_tweet(tweet=tweet):
            # Generamos el par (palabra, target)
            par = (word, target)
            # Actualizamos el diccionario de frecuencias
            if par in frecuencias:
                frecuencias[par] += 1
            else:
                frecuencias[par] = 1
    return frecuencias

In [9]:
frecuencias = genera_frecuencias(lista_tweets=train_tweets, lista_target=train_target)

HBox(children=(FloatProgress(value=0.0, max=8000.0), HTML(value='')))




In [10]:
frecuencias[(':)', 1)]

2847

In [11]:
frecuencias[(':)', 0)]

2

### Paso 3 y 4: Probabilidades y Lambda (log del ratio)

In [12]:
def calcula_prob(frecuencias):
    dict_prob = {}
    # Lista de palabras
    words = set([x[0] for x in frecuencias.keys()])
    V = len(words)
    
    N_pos = sum([v for k, v in frecuencias.items() if k[1] == 1])
    N_neg = sum([v for k, v in frecuencias.items() if k[1] == 0])

    for word in words:
        dict_prob[(word, 1)] = (frecuencias.get((word, 1), 0) + 1) / (N_pos + V)
        dict_prob[(word, 0)] = (frecuencias.get((word, 0), 0) + 1) / (N_neg + V)
    return dict_prob       

In [13]:
probabilidades_diccionario = calcula_prob(frecuencias)

In [14]:
probabilidades_diccionario[(':)', 1)]

0.07925641453776368

In [15]:
probabilidades_diccionario[(':)', 0)]

8.30357884248111e-05

In [16]:
sum([v for k, v in probabilidades_diccionario.items() if k[1] == 1])

1.0000000000000882

In [17]:
sum([v for k, v in probabilidades_diccionario.items() if k[1] == 0])

1.0000000000000886

In [18]:
def calcula_lambda(probabilidades_diccionario):
    diccionario_lambda = {}
    words = set([x[0] for x in probabilidades_diccionario.keys()])
    for word in words:
        diccionario_lambda[word] = np.log(probabilidades_diccionario[(word, 1)]) - np.log(probabilidades_diccionario[(word, 0)])
    return diccionario_lambda

In [19]:
diccionario_lambda = calcula_lambda(probabilidades_diccionario)

In [20]:
diccionario_lambda[':)']

6.861171928299829

### Paso 5: Calculando log prior

In [21]:
def calcula_log_prior(target):
    D_pos = np.sum(np.array(target) == 1)
    D_neg = np.sum(np.array(target) == 0)
    return np.log(D_pos) - np.log(D_neg)

In [22]:
log_prior = calcula_log_prior(train_target)

In [23]:
log_prior

0.0

## Inferencia

In [24]:
def predict_tweet(tweet, diccionario_lambda, return_class=True):
    tweet_procesado = procesa_tweet(tweet)
    
    p = 0
    
    for word in tweet_procesado:
        if word in diccionario_lambda:
            p += diccionario_lambda[word]
    
    if return_class:
        return int(p > 0)
    else:
        return p

In [25]:
predict_tweet('She smiled.', diccionario_lambda)

1

In [26]:
predict_tweet('She smiled.', diccionario_lambda, False)

1.5740278623499178

In [27]:
def predict_model(list_tweets, diccionario_lambda):
    y_predict = []
    for tweet in list_tweets:
        y_predict.append(predict_tweet(tweet, diccionario_lambda, True))
    return np.array(y_predict)

In [28]:
y_predict = predict_model(test_tweets, diccionario_lambda)

In [29]:
y_predict

array([1, 1, 1, ..., 0, 0, 0])

## Evaluación

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
accuracy_score(test_target, y_predict)

0.994

## Supuestos en Naive Bayes

## Análisis de Error

In [32]:
predict_tweet('my beloved grandmother : )', diccionario_lambda, True)

0

In [33]:
predict_tweet('my beloved grandmother :)', diccionario_lambda, True)

1

In [34]:
predict_tweet('This is not good, because your attitude is not even close to being nice', diccionario_lambda, True)

1

In [35]:
predict_tweet('I am happy because I did not go', diccionario_lambda, True)

1

In [36]:
predict_tweet('I am not happy because I did go', diccionario_lambda, True)

1

In [37]:
predict_tweet('This is a ridiculously powerful movie. The plot was gripping and I cried right through until the ending', 
              diccionario_lambda, True)

0