# Processamento de Linguagem Natural (PLN) - PARTE 1

In [95]:
#############################################################################################################
##### Notebook Processamento de Linguagem natural (PLN)
##### Baseado em:
## Natural Language Processing with Python (book)
##
##############################################################################################################
## Objetivos:
##   Mostrar aplicações de métodos de processamento de linguagem natural aprendidos em aula
###################################################################################################################

## Importação dos Dados

In [96]:
# !pip install datasets
import datasets
import pandas as pd
from datasets import load_dataset 

In [97]:
# Notebook-wide pandas display settings, applied in the order listed.
display_options = {
    'display.max_colwidth': None,  # never truncate the text inside a cell (full tweets / token lists)
    'display.max_columns': 100,    # render up to 100 columns before eliding with "..."
    'display.max_rows': 100,       # render up to 100 rows before eliding with "..."
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [98]:
# Load the "train" split of the tweets_hate_speech_detection dataset through
# the Hugging Face `datasets` library and show its summary (features / row count).
hf_dataset = datasets.load_dataset("tweets_hate_speech_detection", split="train")
print(hf_dataset)

# Convert to a pandas DataFrame; `dataset` is the name the following cells use.
dataset = hf_dataset.to_pandas()

Dataset({
    features: ['label', 'tweet'],
    num_rows: 31962
})


In [99]:
# Draw a 100-row sample (fixed random_state so the same rows come back on every
# run), keep only the tweet text and renumber the rows from 0.
sampled_rows = dataset.sample(n=100, random_state=21)
df = sampled_rows[["tweet"]].reset_index(drop=True)
df.tail()

Unnamed: 0,tweet
95,loving this conference call that i am on with one of my teammates.big things are happening within our team!
96,happy father's day to all dads! #fathersday #family #cooldad
97,great to be able to have a catchup with my lovely agent @user today and discuss happenings for the future! actor :)
98,this song brings back so many memories ð #memories #thegoodolddays
99,always be #whitesides


## Tokenização

> Método 1

In [100]:
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer

# Lowercase the tweets as a simple normalization step before tokenizing.
df['tweet'] = df['tweet'].str.lower()

# Tokenizers to compare; each key becomes the name of the output column.
tokenizers = {}
tokenizers["WhitespaceTokenizer"] = WhitespaceTokenizer()
tokenizers["WordPunctTokenizer"] = WordPunctTokenizer()
tokenizers["TreebankWordTokenizer"] = TreebankWordTokenizer()

# Add one column of token lists per tokenizer so they can be read side by side.
for name, tokenizer in tokenizers.items():
    df[name] = df["tweet"].apply(tokenizer.tokenize)

df.head()

Unnamed: 0,tweet,WhitespaceTokenizer,WordPunctTokenizer,TreebankWordTokenizer
0,when cinema is an expression of freedom or cinema reflecting societal issues but india is not ready to show itself a mirror #udtapunjab,"[when, cinema, is, an, expression, of, freedom, or, cinema, reflecting, societal, issues, but, india, is, not, ready, to, show, itself, a, mirror, #udtapunjab]","[when, cinema, is, an, expression, of, freedom, or, cinema, reflecting, societal, issues, but, india, is, not, ready, to, show, itself, a, mirror, #, udtapunjab]","[when, cinema, is, an, expression, of, freedom, or, cinema, reflecting, societal, issues, but, india, is, not, ready, to, show, itself, a, mirror, #, udtapunjab]"
1,how far is #europe swinging to the right? - #nyt #libcrib #uniteblue #fascism #trump #fear #ignorance #hate,"[how, far, is, #europe, swinging, to, the, right?, -, #nyt, #libcrib, #uniteblue, #fascism, #trump, #fear, #ignorance, #hate]","[how, far, is, #, europe, swinging, to, the, right, ?, -, #, nyt, #, libcrib, #, uniteblue, #, fascism, #, trump, #, fear, #, ignorance, #, hate]","[how, far, is, #, europe, swinging, to, the, right, ?, -, #, nyt, #, libcrib, #, uniteblue, #, fascism, #, trump, #, fear, #, ignorance, #, hate]"
2,for a lot of people it's their paner who has to compete against the twitter. but #true,"[for, a, lot, of, people, it's, their, paner, who, has, to, compete, against, the, twitter., but, #true]","[for, a, lot, of, people, it, ', s, their, paner, who, has, to, compete, against, the, twitter, ., but, #, true]","[for, a, lot, of, people, it, 's, their, paner, who, has, to, compete, against, the, twitter., but, #, true]"
3,rest in peace christina grimmie. i loved your voice and your youtube covers â¤ï¸ #restinlovechristina #shocked #restinpiecechristina,"[rest, in, peace, christina, grimmie., i, loved, your, voice, and, your, youtube, covers, â¤ï¸, #restinlovechristina, #shocked, #restinpiecechristina]","[rest, in, peace, christina, grimmie, ., i, loved, your, voice, and, your, youtube, covers, â, ¤, ï, ¸, #, restinlovechristina, #, shocked, #, restinpiecechristina]","[rest, in, peace, christina, grimmie., i, loved, your, voice, and, your, youtube, covers, â¤ï¸, #, restinlovechristina, #, shocked, #, restinpiecechristina]"
4,come on england!!! â½ï¸â½ï¸â½ï¸ #euro2016 #england #football,"[come, on, england!!!, â½ï¸â½ï¸â½ï¸, #euro2016, #england, #football]","[come, on, england, !!!, â, , ½ï, ¸, â, , ½ï, ¸, â, , ½ï, ¸, #, euro2016, #, england, #, football]","[come, on, england, !, !, !, â½ï¸â½ï¸â½ï¸, #, euro2016, #, england, #, football]"


> Método 2

In [101]:
import nltk
from nltk import tokenize

# word_tokenize depends on the Punkt sentence-tokenizer models. Without them a
# fresh environment fails with LookupError, so fetch them here the same way the
# other cells fetch 'wordnet' and 'stopwords'. Newer NLTK releases look for
# 'punkt_tab' while older ones look for 'punkt', so both are requested;
# nltk.download() reports failure through its return value instead of raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# word_tokenize first splits the text into sentences and then applies a
# Treebank-style word tokenizer to each sentence. That is why a sentence-final
# token such as "twitter." is split into "twitter" and "." here, while the
# plain TreebankWordTokenizer column keeps it as a single token.
df['nltk_word_tokenize'] = df['tweet'].apply(tokenize.word_tokenize)
df.tail()

Unnamed: 0,tweet,WhitespaceTokenizer,WordPunctTokenizer,TreebankWordTokenizer,nltk_word_tokenize
95,loving this conference call that i am on with one of my teammates.big things are happening within our team!,"[loving, this, conference, call, that, i, am, on, with, one, of, my, teammates.big, things, are, happening, within, our, team!]","[loving, this, conference, call, that, i, am, on, with, one, of, my, teammates, ., big, things, are, happening, within, our, team, !]","[loving, this, conference, call, that, i, am, on, with, one, of, my, teammates.big, things, are, happening, within, our, team, !]","[loving, this, conference, call, that, i, am, on, with, one, of, my, teammates.big, things, are, happening, within, our, team, !]"
96,happy father's day to all dads! #fathersday #family #cooldad,"[happy, father's, day, to, all, dads!, #fathersday, #family, #cooldad]","[happy, father, ', s, day, to, all, dads, !, #, fathersday, #, family, #, cooldad]","[happy, father, 's, day, to, all, dads, !, #, fathersday, #, family, #, cooldad]","[happy, father, 's, day, to, all, dads, !, #, fathersday, #, family, #, cooldad]"
97,great to be able to have a catchup with my lovely agent @user today and discuss happenings for the future! actor :),"[great, to, be, able, to, have, a, catchup, with, my, lovely, agent, @user, today, and, discuss, happenings, for, the, future!, actor, :)]","[great, to, be, able, to, have, a, catchup, with, my, lovely, agent, @, user, today, and, discuss, happenings, for, the, future, !, actor, :)]","[great, to, be, able, to, have, a, catchup, with, my, lovely, agent, @, user, today, and, discuss, happenings, for, the, future, !, actor, :, )]","[great, to, be, able, to, have, a, catchup, with, my, lovely, agent, @, user, today, and, discuss, happenings, for, the, future, !, actor, :, )]"
98,this song brings back so many memories ð #memories #thegoodolddays,"[this, song, brings, back, so, many, memories, ð, #memories, #thegoodolddays]","[this, song, brings, back, so, many, memories, ð, , #, memories, #, thegoodolddays]","[this, song, brings, back, so, many, memories, ð, #, memories, #, thegoodolddays]","[this, song, brings, back, so, many, memories, ð, #, memories, #, thegoodolddays]"
99,always be #whitesides,"[always, be, #whitesides]","[always, be, #, whitesides]","[always, be, #, whitesides]","[always, be, #, whitesides]"


## Stemming

Stemming reduz a palavra ao seu radical removendo afixos por meio de regras (o PorterStemmer usado abaixo remove apenas sufixos), podendo não gerar uma palavra válida (ex.: "issues" → "issu")

In [102]:
import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of every token in `tokens`, in the same order."""
    return [stemmer.stem(token) for token in tokens]


df['stemmed'] = df['nltk_word_tokenize'].apply(stem_tokens)

# Show the first tweet's tokens next to their stems.
print(df.loc[0, ['nltk_word_tokenize', 'stemmed']])

nltk_word_tokenize    [when, cinema, is, an, expression, of, freedom, or, cinema, reflecting, societal, issues, but, india, is, not, ready, to, show, itself, a, mirror, #, udtapunjab]
stemmed                         [when, cinema, is, an, express, of, freedom, or, cinema, reflect, societ, issu, but, india, is, not, readi, to, show, itself, a, mirror, #, udtapunjab]
Name: 0, dtype: object


## Lemmatization

Lemmatization devolve o lema (forma de dicionário) da palavra, considerando sua classe gramatical e seu significado (ex.: "camps" → "camp")

In [103]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # WordNet lexical database used by the lemmatizer

wnl = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token in `tokens`, in the same order.

    NOTE(review): lemmatize() is called without a `pos` argument, so per the
    NLTK documentation every token is treated as a noun; verb and adjective
    forms are therefore left unchanged.
    """
    return [wnl.lemmatize(token) for token in tokens]


df['lemmatized'] = df['nltk_word_tokenize'].apply(lemmatize_tokens)

# Row 71 has plural nouns ("camps", "girls"), which makes the effect visible.
print(df.loc[71, ['nltk_word_tokenize', 'lemmatized']])
print()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Masmok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


nltk_word_tokenize    [@, user, #, gogirl, #, summer, #, camps, #, cardiff, #, girls, 10-17, #, confident, book, now, pls]
lemmatized              [@, user, #, gogirl, #, summer, #, camp, #, cardiff, #, girl, 10-17, #, confident, book, now, pls]
Name: 71, dtype: object



## Stopwords

Stopwords são palavras comuns que geralmente não agregam significado relevante ao texto


In [104]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))


def drop_stopwords(tokens):
    """Return `tokens` without English stopwords (compared case-insensitively)."""
    return [token for token in tokens if token.lower() not in stop_words]


df['without_stopwords'] = df['nltk_word_tokenize'].apply(drop_stopwords)
df[['nltk_word_tokenize', 'without_stopwords']].head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Masmok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,nltk_word_tokenize,without_stopwords
0,"[when, cinema, is, an, expression, of, freedom, or, cinema, reflecting, societal, issues, but, india, is, not, ready, to, show, itself, a, mirror, #, udtapunjab]","[cinema, expression, freedom, cinema, reflecting, societal, issues, india, ready, show, mirror, #, udtapunjab]"
1,"[how, far, is, #, europe, swinging, to, the, right, ?, -, #, nyt, #, libcrib, #, uniteblue, #, fascism, #, trump, #, fear, #, ignorance, #, hate]","[far, #, europe, swinging, right, ?, -, #, nyt, #, libcrib, #, uniteblue, #, fascism, #, trump, #, fear, #, ignorance, #, hate]"
2,"[for, a, lot, of, people, it, 's, their, paner, who, has, to, compete, against, the, twitter, ., but, #, true]","[lot, people, 's, paner, compete, twitter, ., #, true]"
3,"[rest, in, peace, christina, grimmie, ., i, loved, your, voice, and, your, youtube, covers, â¤ï¸, #, restinlovechristina, #, shocked, #, restinpiecechristina]","[rest, peace, christina, grimmie, ., loved, voice, youtube, covers, â¤ï¸, #, restinlovechristina, #, shocked, #, restinpiecechristina]"
4,"[come, on, england, !, !, !, â½ï¸â½ï¸â½ï¸, #, euro2016, #, england, #, football]","[come, england, !, !, !, â½ï¸â½ï¸â½ï¸, #, euro2016, #, england, #, football]"


## Bag of Words

### TF-IDF - Caracterização das palavras

 O TF-IDF avalia a importância de uma palavra em um tweet, considerando a frequência da palavra no próprio tweet e em todo o corpus

 Quanto mais rara a palavra no corpus, maior será seu peso

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the lowercased tweet text, using the vectorizer's own tokenizer.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # single words and two-word sequences
    min_df=2,            # ignore terms found in fewer than 2 tweets
    max_df=0.5,          # ignore terms found in more than half of the tweets
)
tfidf_matrix = tfidf.fit_transform(df['tweet'])

# Densify the sparse matrix into a frame with one row per tweet and one column per term.
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df.head()

Unnamed: 0,about,affirmation,all,always,am,am thankful,amp,an,and,any,are,are you,as,at,back,be,beautiful,because,big,bihday,bihday to,business,but,can,can find,christina,christina grimmie,color,confident,cute,day,do,ever,family,feel,feeling,find,first,flower,for,for all,for the,from,fun,game,gay,getting,girls,glad,going,...,sun,sunday,team,thankful,thankful for,thankful positive,thanks,that,the,the sun,their,them,thinking,this,time,to,to be,to see,to the,to you,today,truth,udtapunjab,up,up with,user,user oh,user today,user user,waiting,warm,was,way,we,week,weekend,weeks,were,what,when,who,why,will,with,with my,would,year,yes,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178559,0.0,0.0,0.0,0.0,0.0,0.0,0.319507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28457,0.0,0.0,0.0,0.0,0.0,0.0,0.279421,0.0,0.0,0.499987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22623,0.0,0.372165,0.0,0.0,0.0,0.0,0.222136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.361062,0.361062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6113
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec - Caracterização das palavras

In [106]:
from gensim.models import Word2Vec

# Train word embeddings on the token lists produced by nltk's word_tokenize.
#
# vector_size: dimensionality of the word vectors
# window:      how many words before and after the target word are considered
# min_count:   ignore words whose frequency is below this value
# workers:     number of training threads; kept at 1 because the gensim docs
#              state that a deterministic run requires a single worker thread
#              (thread scheduling otherwise changes the training order)
# seed:        random seed, set explicitly so the run is repeatable
#              NOTE(review): the gensim docs also mention PYTHONHASHSEED for
#              reproducibility across interpreter launches; confirm if needed.
model = Word2Vec(
    sentences=df['nltk_word_tokenize'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=21,
)
word_vectors = model.wv

print("Vector for 'love':", word_vectors['love'])  # embedding of the word 'love'
print("Similar words to 'love':", word_vectors.most_similar('love'))


Vector for 'love': [-7.63257220e-03 -1.82980392e-03 -7.57275056e-03  8.24001990e-03
  6.30763918e-03  2.90567777e-03  8.59658886e-03  2.17109662e-03
 -9.71628353e-03  8.13894346e-03 -5.55999810e-03  6.30609831e-03
  5.76864090e-03 -4.06302221e-04 -7.59733934e-03 -1.65341236e-03
  6.61904039e-03 -8.42325576e-03  1.29675248e-03 -1.04370890e-02
  9.86397453e-03 -2.24699709e-03  9.89400595e-03 -6.11570990e-03
 -1.00384932e-02 -8.49537551e-03 -5.04959049e-03  4.24119737e-03
 -1.02953252e-03  9.18981992e-03  4.34712414e-03  3.76335159e-03
  3.81689821e-03  7.15161394e-03 -3.33116343e-03  8.76694545e-03
 -9.16175265e-03  1.62772636e-03 -1.46711606e-03 -1.20701699e-03
  6.59864303e-03 -3.26838368e-03 -2.88574793e-03 -2.19577560e-04
 -1.37135485e-05 -4.24654363e-03  4.93441476e-03 -6.44675083e-03
  8.60239659e-03  7.60134833e-04  2.98946304e-03  2.47849361e-03
 -4.60073526e-04  1.88129616e-03 -3.65304691e-03  4.99762502e-03
  9.57880926e-04 -3.86714796e-03 -9.73483548e-03 -9.09287576e-03
  1.95