In [1]:
import spacy

In [2]:
# Load the small English spaCy pipeline and keep it bound to `nlp`.
nlp = spacy.load('en_core_web_sm')
text = "Mary, don't slap the green witch."
# Lower-case the sentence, run it through the pipeline, and show each
# resulting token as a plain string.
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']


In [3]:
from nltk.tokenize import TweetTokenizer

In [4]:
# The tokenizer does not depend on the tweet, so build it first.
tokenizer = TweetTokenizer()
tweet = u"If we don’t stand up for democracy, we’ll see this type of gerrymandering happen in even more states. It’s why @DemRedistrict is fighting for #FairMaps. And it’s why the Senate must pass the Freedom to Vote Act to protect our communities from partisan gerrymandering."
# Tokenize the lower-cased tweet; the recorded output shows that @mentions
# and #hashtags are kept as single tokens.
lowered_tweet = tweet.lower()
print(tokenizer.tokenize(lowered_tweet))

['if', 'we', 'don', '’', 't', 'stand', 'up', 'for', 'democracy', ',', 'we', '’', 'll', 'see', 'this', 'type', 'of', 'gerrymandering', 'happen', 'in', 'even', 'more', 'states', '.', 'it', '’', 's', 'why', '@demredistrict', 'is', 'fighting', 'for', '#fairmaps', '.', 'and', 'it', '’', 's', 'why', 'the', 'senate', 'must', 'pass', 'the', 'freedom', 'to', 'vote', 'act', 'to', 'protect', 'our', 'communities', 'from', 'partisan', 'gerrymandering', '.']


In [5]:
def n_grams(text, n):
    """Return every contiguous window of length ``n`` taken from ``text``.

    Args:
        text: A sliceable sequence with a length, e.g. a list of tokens
            (windows are lists) or a string (windows are substrings).
        n: Window size; must be at least 1.

    Returns:
        A list of the slices ``text[i:i+n]`` in left-to-right order. The list
        is empty when ``n`` is larger than ``len(text)``.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    # Without this guard n == 0 yields len(text) + 1 empty slices and a
    # negative n yields truncated slices that are not n-grams at all.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [6]:
# Pre-tokenized sentence (same tokens spaCy produced earlier in the notebook).
text = [
    'mary', ',', 'do', "n't", 'slap',
    'the', 'green', 'witch', '.',
]
# Show all sliding windows of three consecutive tokens.
print(n_grams(text, n=3))

[['mary', ',', 'do'], [',', 'do', "n't"], ['do', "n't", 'slap'], ["n't", 'slap', 'the'], ['slap', 'the', 'green'], ['the', 'green', 'witch'], ['green', 'witch', '.']]


In [7]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Create one Porter stemmer instance, bound to `ps`, for reuse by later cells.
ps = PorterStemmer()

In [8]:
example_words = ["better"]

# Stem every example word and print one stem per line.
print(*map(ps.stem, example_words), sep="\n")

better


In [9]:
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the 'wordnet' NLTK data package; the call returns True once the
# package is available locally (see the recorded output of this cell).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/marc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
lemmatizer = WordNetLemmatizer()
# pos="a" asks for the adjective reading of the word; the recorded output
# for "better" is "good".
better_lemma = lemmatizer.lemmatize("better", pos="a")
print(better_lemma)

good


In [11]:
import spacy

In [12]:
sp = spacy.load('en_core_web_sm')
# Note: despite its name, `token` holds the whole pipeline result, which is
# then iterated item by item.
token = sp(u'better')
# Print the lemma of each item on its own line.
print(*(word.lemma_ for word in token), sep="\n")

well


In [13]:
import spacy

In [14]:
# Reload the English pipeline and parse the example sentence; later cells
# inspect `doc` token by token.
nlp = spacy.load('en_core_web_sm')
doc = nlp(
    u'Charles Spencer Chaplin was born on 16 April 1889 to Hannah Chaplin '
    u'(born Hannah Harriet Pedlingham Hill) and Charles Chaplin Sr .'
)

In [15]:
# One row per token: text, lemma, part-of-speech tag, shape pattern,
# is-alphabetic flag and is-stop-word flag, separated by single spaces.
for token in doc:
    fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*fields)

Charles Charles PROPN Xxxxx True False
Spencer Spencer PROPN Xxxxx True False
Chaplin Chaplin PROPN Xxxxx True False
was be AUX xxx True True
born bear VERB xxxx True False
on on ADP xx True True
16 16 NUM dd False False
April April PROPN Xxxxx True False
1889 1889 NUM dddd False False
to to ADP xx True True
Hannah Hannah PROPN Xxxxx True False
Chaplin Chaplin PROPN Xxxxx True False
( ( PUNCT ( False False
born bear VERB xxxx True False
Hannah Hannah PROPN Xxxxx True False
Harriet Harriet PROPN Xxxxx True False
Pedlingham Pedlingham PROPN Xxxxx True False
Hill Hill PROPN Xxxx True False
) ) PUNCT ) False False
and and CCONJ xxx True True
Charles Charles PROPN Xxxxx True False
Chaplin Chaplin PROPN Xxxxx True False
Sr Sr PROPN Xx True False
. . PUNCT . False False


In [16]:
# One row per token: text, dependency label, the head token's text and
# part-of-speech tag, then the list of the token's children.
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

Charles compound Chaplin PROPN []
Spencer compound Chaplin PROPN []
Chaplin nsubjpass born VERB [Charles, Spencer]
was auxpass born VERB []
born ROOT born VERB [Chaplin, was, on, to, .]
on prep born VERB [April]
16 nummod April PROPN []
April pobj on ADP [16, 1889]
1889 nummod April PROPN []
to prep born VERB [Chaplin]
Hannah compound Chaplin PROPN []
Chaplin pobj to ADP [Hannah, (, born, ), and, Sr]
( punct Chaplin PROPN []
born acl Chaplin PROPN [Hill]
Hannah compound Hill PROPN []
Harriet compound Hill PROPN []
Pedlingham compound Hill PROPN []
Hill dobj born VERB [Hannah, Harriet, Pedlingham]
) punct Chaplin PROPN []
and cc Chaplin PROPN []
Charles compound Sr PROPN []
Chaplin compound Sr PROPN []
Sr conj Chaplin PROPN [Charles, Chaplin]
. punct born VERB []


In [17]:
import spacy

In [18]:
# Reload the English pipeline and parse a short sentence for the
# noun-chunk example that follows.
nlp = spacy.load('en_core_web_sm')
sentence = u'Mary slapped the green witch'
doc = nlp(sentence)

In [19]:
# Print each noun chunk followed by its label.
for chunk in doc.noun_chunks:
    print(f"{chunk} - {chunk.label_}")

Mary - NP
the green witch - NP


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of four short sentences.
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]
# Normalize each sentence: lower-case it and strip every period.
processed_docs = [sentence.lower().replace(".", "") for sentence in documents]

In [21]:
# Learn the vocabulary from the normalized corpus, then encode the same
# corpus as a document-term count matrix.
count_vect = CountVectorizer().fit(processed_docs)
bow_rep = count_vect.transform(processed_docs)
# Last expression of the cell: display the learned term -> column-index map.
count_vect.vocabulary_

{'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}

In [22]:
# Show the count vectors of the first two documents; per the recorded
# output they are identical, i.e. this representation ignores word order.
for label, row in (
    ("BoW representation for 'dog bites man': ", bow_rep[0]),
    ("BoW representation for 'man bites dog: ", bow_rep[1]),
):
    print(label, row.toarray())

BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog:  [[1 1 0 0 1 0]]


In [23]:
# Encode a sentence the vectorizer was not fitted on; per the recorded
# output only "dog" (seen twice) contributes to the vector.
unseen_docs = ["dog and dog are friends"]
test = count_vect.transform(unseen_docs)
print("Bow representation for 'dog and dog are friends':", test.toarray())

Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


In [24]:
# binary=True records presence/absence instead of counts: per the recorded
# output the repeated "dog" is encoded as 1 rather than 2.
count_vect = CountVectorizer(binary=True).fit(processed_docs)
unseen_docs = ["dog and dog are friends"]
test = count_vect.transform(unseen_docs)
print("Bow representation for 'dog and dog are friends':", test.toarray())

Bow representation for 'dog and dog are friends': [[0 1 0 0 0 0]]


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of four short sentences (redefined for the n-gram example).
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]
# Normalize each sentence: lower-case it and strip every period.
processed_docs = [sentence.lower().replace(".", "") for sentence in documents]

In [26]:
# ngram_range=(1,3) makes the vocabulary contain unigrams, bigrams and
# trigrams (see the displayed mapping).
count_vect = CountVectorizer(ngram_range=(1,3)).fit(processed_docs)
bow_rep = count_vect.transform(processed_docs)
# Last expression of the cell: display the learned n-gram -> column-index map.
count_vect.vocabulary_

{'dog': 3,
 'bites': 0,
 'man': 12,
 'dog bites': 4,
 'bites man': 2,
 'dog bites man': 5,
 'man bites': 13,
 'bites dog': 1,
 'man bites dog': 14,
 'eats': 8,
 'meat': 17,
 'dog eats': 6,
 'eats meat': 10,
 'dog eats meat': 7,
 'food': 11,
 'man eats': 15,
 'eats food': 9,
 'man eats food': 16}

In [27]:
# Show the n-gram count vectors of the first two documents; per the
# recorded output they now differ, because the bigram and trigram columns
# depend on word order.
for label, row in (
    ("BoW representation for 'dog bites man': ", bow_rep[0]),
    ("BoW representation for 'man bites dog: ", bow_rep[1]),
):
    print(label, row.toarray())

BoW representation for 'dog bites man':  [[1 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0]]
BoW representation for 'man bites dog:  [[1 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0]]


In [28]:
# Encode an unseen sentence with the n-gram vectorizer; per the recorded
# output only the unigram "dog" (seen twice) matches the vocabulary.
unseen_docs = ["dog and dog are friends"]
test = count_vect.transform(unseen_docs)

print("Bow representation for 'dog and dog are friends':", test.toarray())

Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
