In [1]:
import pandas as pd
import numpy as np

## 1. Ler arquivo `.csv` utilizando [pandas](http://pandas.pydata.org/)

In [2]:
text = pd.read_csv('example.csv', sep=',')
text.head()

Unnamed: 0,title,description,text
0,Graphical model,A graphical model or probabilistic graphical m...,"Generally, probabilistic graphical models use ..."
1,Modelo en grafo,"En teoría de probabilidades y en estadística, ...","En el caso más sencillo, la estructura de la r..."
2,Modèle graphique,Un modèle graphique est une représentation d'o...,Un modèle graphique est un graphe orienté ou n...


In [3]:
primeiro_texto = text.description[0]
primeiro_texto

'A graphical model or probabilistic graphical model (PGM) is a probabilistic model for which a graph expresses the conditional dependence structure between random variables. They are commonly used in probability theory, statistics—particularly Bayesian statistics—and machine learning.'

## 2. Separar palavras (_tagging_)

### 2.1 [Python split](https://docs.python.org/3/library/stdtypes.html#str.split)

In [4]:
primeiro_texto.split()

['A',
 'graphical',
 'model',
 'or',
 'probabilistic',
 'graphical',
 'model',
 '(PGM)',
 'is',
 'a',
 'probabilistic',
 'model',
 'for',
 'which',
 'a',
 'graph',
 'expresses',
 'the',
 'conditional',
 'dependence',
 'structure',
 'between',
 'random',
 'variables.',
 'They',
 'are',
 'commonly',
 'used',
 'in',
 'probability',
 'theory,',
 'statistics—particularly',
 'Bayesian',
 'statistics—and',
 'machine',
 'learning.']

### 2.2 [NLTK](http://www.nltk.org/)

> NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.

In [5]:
import nltk
nltk.word_tokenize(primeiro_texto)

['A',
 'graphical',
 'model',
 'or',
 'probabilistic',
 'graphical',
 'model',
 '(',
 'PGM',
 ')',
 'is',
 'a',
 'probabilistic',
 'model',
 'for',
 'which',
 'a',
 'graph',
 'expresses',
 'the',
 'conditional',
 'dependence',
 'structure',
 'between',
 'random',
 'variables',
 '.',
 'They',
 'are',
 'commonly',
 'used',
 'in',
 'probability',
 'theory',
 ',',
 'statistics—particularly',
 'Bayesian',
 'statistics—and',
 'machine',
 'learning',
 '.']

## 3. Filtro de palavras

In [6]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### 3.1 Filtro por _stopwords_

In [7]:
# Stopwords
from nltk.corpus import stopwords

'"I" is a stopword?', 'i' in stopwords.words('english')

('"I" is a stopword?', True)

In [8]:
'"ronaldo" is a stopword?', 'ronaldo' in stopwords.words('english')

('"ronaldo" is a stopword?', False)

In [9]:
import enchant

palavras = set(primeiro_texto.split())
english_stop_words = set(stopwords.words('english'))

palavras_filtradas = palavras - english_stop_words
palavras_filtradas

{'(PGM)',
 'A',
 'Bayesian',
 'They',
 'commonly',
 'conditional',
 'dependence',
 'expresses',
 'graph',
 'graphical',
 'learning.',
 'machine',
 'model',
 'probabilistic',
 'probability',
 'random',
 'statistics—and',
 'statistics—particularly',
 'structure',
 'theory,',
 'used',
 'variables.'}

### 3.2 Filtro por palavras em um dicionário

In [10]:
# Instalar pyenchant (que contém dicionários)
!pip install pyenchant



In [11]:
dictionary_en_US = enchant.Dict("en_US")

'"Hello" in en_US?', dictionary_en_US.check('hello')

('"Hello" in en_US?', True)

In [12]:
'"Cata-vento" in en_US?', dictionary_en_US.check('Cata-vento')

('"Cata-vento" in en_US?', False)

In [13]:
palavras_filtradas = set(filter(dictionary_en_US.check, palavras))
palavras_filtradas

{'A',
 'Bayesian',
 'They',
 'a',
 'are',
 'between',
 'commonly',
 'conditional',
 'dependence',
 'expresses',
 'for',
 'graph',
 'graphical',
 'in',
 'is',
 'learning.',
 'machine',
 'model',
 'or',
 'probabilistic',
 'probability',
 'random',
 'structure',
 'the',
 'used',
 'variables.',
 'which'}

## 4. Tagging

**Categorizing and Tagging Words**: http://www.nltk.org/book/ch05.html

In [14]:
import nltk

sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

tagged = nltk.pos_tag(tokens)
tagged

[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN'),
 ('...', ':'),
 ('Arthur', 'NNP'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('feel', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]

## 5. Separar treino e teste

In [16]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(text, test_size=0.33)

In [17]:
train

Unnamed: 0,title,description,text
1,Modelo en grafo,"En teoría de probabilidades y en estadística, ...","En el caso más sencillo, la estructura de la r..."
0,Graphical model,A graphical model or probabilistic graphical m...,"Generally, probabilistic graphical models use ..."


In [18]:
test

Unnamed: 0,title,description,text
2,Modèle graphique,Un modèle graphique est une représentation d'o...,Un modèle graphique est un graphe orienté ou n...
