In [1]:
from textblob import TextBlob

# Quick demo: wrap a plain string in a TextBlob and tokenize it.
text = "Have a great rest of your Monday!!"
blob = TextBlob(text)
# Sentence tokenization: the doubled "!!" is split, so the second "!" shows up
# as its own one-character sentence in the output.
print(blob.sentences)
# Word tokenization: punctuation is dropped from the word list.
print(blob.words)



[Sentence("Have a great rest of your Monday!"), Sentence("!")]
['Have', 'a', 'great', 'rest', 'of', 'your', 'Monday']


## Natural Language Processing
* NLP is the processing of a text collection
    * Text collection: a corpus (corpora for plural)
    * Examples
        * Your responses to my poll question
        * Tweets
        * Facebook posts
        * Messaging conversations
        * The classics
        * Etc.
* NLP is notoriously difficult because natural language lacks mathematical precision
* Thankfully, there are a lot of really great Python libraries for NLP!!!
    * Let's start with TextBlob

In [2]:
from textblob import TextBlob

# Sample corpus for the remaining cells: two paragraphs about Gonzaga's academics.
text = """
The academic heart of Gonzaga's liberal arts tradition lies in its University Core Curriculum, which integrates philosophy, theology, mathematics, literature, writing and speaking. A common thread throughout all of these disciplines is the value of the written word; students at Gonzaga carry out extensive writing projects throughout all their courses of study.

Gonzaga offers 75 undergraduate majors and programs, 26 master’s degrees, a Ph.D. in Leadership Studies, a Doctor of Nursing Practice degree, and a Juris Doctor through the School of Law. Explore these academic opportunities through the course catalog. Additionally, undergraduates may participate in several distinctive academic programs. The average undergraduate class size is 22 and the student-to-faculty ratio is 12:1. Gonzaga also offers study abroad programs in over 60 countries, including our longest-running program, Gonzaga-in-Florence, Italy.
"""

# Rebinds `blob` (previously the one-sentence demo) to the longer text.
blob = TextBlob(text)

In [3]:
# part-of-speech (POS) tagging
# blob.tags is a list of (word, tag) tuples, e.g. ('The', 'DT'), ('Gonzaga', 'NNP')
print(blob.tags)

[('The', 'DT'), ('academic', 'JJ'), ('heart', 'NN'), ('of', 'IN'), ('Gonzaga', 'NNP'), ("'s", 'POS'), ('liberal', 'JJ'), ('arts', 'NNS'), ('tradition', 'NN'), ('lies', 'VBZ'), ('in', 'IN'), ('its', 'PRP$'), ('University', 'NNP'), ('Core', 'NNP'), ('Curriculum', 'NNP'), ('which', 'WDT'), ('integrates', 'VBZ'), ('philosophy', 'NN'), ('theology', 'NN'), ('mathematics', 'NNS'), ('literature', 'NN'), ('writing', 'VBG'), ('and', 'CC'), ('speaking', 'NN'), ('A', 'DT'), ('common', 'JJ'), ('thread', 'NN'), ('throughout', 'IN'), ('all', 'DT'), ('of', 'IN'), ('these', 'DT'), ('disciplines', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('value', 'NN'), ('of', 'IN'), ('the', 'DT'), ('written', 'VBN'), ('word', 'NN'), ('students', 'NNS'), ('at', 'IN'), ('Gonzaga', 'NNP'), ('carry', 'VBP'), ('out', 'RP'), ('extensive', 'JJ'), ('writing', 'NN'), ('projects', 'NNS'), ('throughout', 'IN'), ('all', 'DT'), ('their', 'PRP$'), ('courses', 'NNS'), ('of', 'IN'), ('study', 'NN'), ('Gonzaga', 'NNP'), ('offers', 'VBZ')

In [4]:
# noun phrases
# phrases come back lowercased (e.g. 'gonzaga'); extraction is not perfect --
# the output also contains entries like 'explore' and 'additionally'
print(blob.noun_phrases)

['academic heart', 'gonzaga', 'liberal arts tradition', 'core curriculum', 'integrates philosophy', 'common thread', 'gonzaga', 'gonzaga', 'undergraduate majors', 'master ’ s degrees', 'ph.d.', 'studies', 'practice degree', 'juris', 'explore', 'academic opportunities', 'course catalog', 'additionally', 'academic programs', 'average undergraduate class size', 'student-to-faculty ratio', 'gonzaga', 'gonzaga-in-florence', 'italy']


In [5]:
# sentiment analysis
# print each sentence followed by its Sentiment(polarity=..., subjectivity=...)
for sentence in blob.sentences:
    print(sentence)
    print(sentence.sentiment)
    print()
# polarity is -1.0 (negative) to 1.0 (positive)
# subjectivity is 0.0 (objective) to 1.0 (subjective)


The academic heart of Gonzaga's liberal arts tradition lies in its University Core Curriculum, which integrates philosophy, theology, mathematics, literature, writing and speaking.
Sentiment(polarity=0.0, subjectivity=0.0)

A common thread throughout all of these disciplines is the value of the written word; students at Gonzaga carry out extensive writing projects throughout all their courses of study.
Sentiment(polarity=-0.15, subjectivity=0.41666666666666663)

Gonzaga offers 75 undergraduate majors and programs, 26 master’s degrees, a Ph.D. in Leadership Studies, a Doctor of Nursing Practice degree, and a Juris Doctor through the School of Law.
Sentiment(polarity=0.1, subjectivity=0.0)

Explore these academic opportunities through the course catalog.
Sentiment(polarity=0.0, subjectivity=0.0)

Additionally, undergraduates may participate in several distinctive academic programs.
Sentiment(polarity=0.0, subjectivity=0.0)

The average undergraduate class size is 22 and the student-to-f

In [6]:
# language detection
# uses Google Translate on the backend (so it needs network access)
# NOTE: per the TextBlob changelog, detect_language() was deprecated in 0.16.0
# and removed in 0.18.0, so this cell only runs on older TextBlob versions.
print(blob.detect_language())

en


In [7]:
# language translation ("es" = Spanish)
# NOTE: per the TextBlob changelog, translate() was deprecated in 0.16.0 and
# removed in 0.18.0, so this cell only runs on older TextBlob versions.
print(blob.translate(to="es"))

El corazón académico de la tradición de las artes liberales de Gonzaga se encuentra en su plan de estudios universitario, que integra filosofía, teología, matemáticas, literatura, escritura y expresión oral. Un hilo común en todas estas disciplinas es el valor de la palabra escrita; los estudiantes de Gonzaga llevan a cabo extensos proyectos de escritura a lo largo de todos sus cursos de estudio.

Gonzaga ofrece 75 especializaciones y programas de pregrado, 26 maestrías, un doctorado. en Estudios de Liderazgo, un título de Doctor en Práctica de Enfermería y un Juris Doctor a través de la Facultad de Derecho. Explore estas oportunidades académicas a través del catálogo de cursos. Además, los estudiantes universitarios pueden participar en varios programas académicos distintivos. El tamaño promedio de las clases de pregrado es de 22 y la proporción de estudiantes por docente es de 12: 1. Gonzaga también ofrece programas de estudios en el extranjero en más de 60 países, incluido nuestro p

In [8]:
# inflection
# inflections are different forms of the same word (e.g. singular vs. plural)
# person vs. people
from textblob import Word

# Wrap each string in a Word while iterating, so the loop variable is never
# rebound inside the loop body.
# Note: singularize() on an already-singular word can mangle it
# (see "octopus" -> "octopu" in the output).
words = ["octopus", "goose", "deer"]
for word in map(Word, words):
    print("word:", word, "singular:", word.singularize(), "plural:", word.pluralize())

word: octopus singular: octopu plural: octopodes
word: goose singular: goose plural: geese
word: deer singular: deer plural: deer


In [10]:
# spell checking
word = Word("theyr")
# spellcheck() lists candidate corrections as (candidate, confidence) tuples
print(word.spellcheck())
# correct() gives back the highest-confidence candidate ("they" here)
print(word.correct())

[('they', 0.5713042216741622), ('their', 0.42869577832583783)]
they


In [12]:
# normalization
# getting words into a common form before analysis
# example: "program" covers "programs", "programmer", "programming",
# "programmatically", etc.

# stemming: strips a prefix and/or suffix to get the stem; the stem need not be
# a real word (see "programm" in the output)
for word in map(Word, ("programmer", "programming")):
    print(word.stem())

programm
program


In [14]:
# lemmatization: is like stemming, but uses the part of speech and meaning to
# provide a real word
# NOTE: per the TextBlob API docs, lemmatize() with no `pos` argument treats the
# word as a noun, which is why "programming" comes back unchanged here.
word = Word("programmer")
print(word.lemmatize())
word = Word("programming")
print(word.lemmatize())

# stem vs. lemma on the same word: the stem "varieti" is not a real word,
# the lemma "variety" is
word = Word("varieties")
print(word.stem())
print(word.lemmatize())

programmer
programming
varieti
variety


In [16]:
# word frequencies
# word_counts maps a word to how many times it occurs in the blob;
# a word that never occurs ("science") yields 0 rather than raising
print(blob.word_counts["academic"])
print(blob.word_counts["writing"])
print(blob.word_counts["science"])

3
2
0


In [17]:
# definitions
# uses WordNet db; the result is a list of definition strings for the word
word = Word("pie")
print(word.definitions)

['dish baked in pastry-lined pan often with a pastry top', 'a prehistoric unrecorded language that was the ancestor of all Indo-European languages']


In [18]:
# synonyms
# synsets are WordNet "synonym sets" for the word
# relies on `word` still being Word("pie") from the definitions cell above
print(word.synsets)

[Synset('pie.n.01'), Synset('proto-indo_european.n.01')]


In [19]:
# n-grams
# an n-gram is a sequence of n text items (e.g. n letters in a word or n words
# in a sentence)
# typically used to see what words (or letters) are written/typed/spoken together
# (in sequence)
# examples: code completion in an IDE like VS Code
# predictive text (e.g. messaging app, google search)

print(blob.ngrams()) # default n is 3

[WordList(['The', 'academic', 'heart']), WordList(['academic', 'heart', 'of']), WordList(['heart', 'of', 'Gonzaga']), WordList(['of', 'Gonzaga', "'s"]), WordList(['Gonzaga', "'s", 'liberal']), WordList(["'s", 'liberal', 'arts']), WordList(['liberal', 'arts', 'tradition']), WordList(['arts', 'tradition', 'lies']), WordList(['tradition', 'lies', 'in']), WordList(['lies', 'in', 'its']), WordList(['in', 'its', 'University']), WordList(['its', 'University', 'Core']), WordList(['University', 'Core', 'Curriculum']), WordList(['Core', 'Curriculum', 'which']), WordList(['Curriculum', 'which', 'integrates']), WordList(['which', 'integrates', 'philosophy']), WordList(['integrates', 'philosophy', 'theology']), WordList(['philosophy', 'theology', 'mathematics']), WordList(['theology', 'mathematics', 'literature']), WordList(['mathematics', 'literature', 'writing']), WordList(['literature', 'writing', 'and']), WordList(['writing', 'and', 'speaking']), WordList(['and', 'speaking', 'A']), WordList(['s