# TextBlob

In [1]:
# !pip install textblob
# !python -m textblob.download_corpora

In [2]:
# First, the import.
from textblob import TextBlob
from textblob import Word

# from textblob.wordnet import VERB

from textblob.classifiers import NaiveBayesClassifier 

In [3]:
# Create our first TextBlob object from a plain Python string.
wiki = TextBlob(
    "Python is a high-level, general-purpose programming language."
)

In [4]:
wiki

TextBlob("Python is a high-level, general-purpose programming language.")

In [5]:
wiki.words

WordList(['Python', 'is', 'a', 'high-level', 'general-purpose', 'programming', 'language'])

## Tokenization

In [6]:
# Three short sentences used for the tokenization examples.
zen = TextBlob(
    "Beautiful is better, than ugly. "
    "Explicit is better than implicit. "
    "Simple is better than complex."
)

In [7]:
zen.words # tokenization

WordList(['Beautiful', 'is', 'better', 'than', 'ugly', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex'])

In [8]:
zen.words.count('Better')  # count() ignores case by default, so 'Better' matches every 'better'

3

In [9]:
zen.sentences

[Sentence("Beautiful is better, than ugly."),
 Sentence("Explicit is better than implicit."),
 Sentence("Simple is better than complex.")]

## Words and noun phrase counts

### Using word_counts dictionary.

In [10]:
# Two sentences with repeated and mixed-case words for the counting examples.
monty = TextBlob(
    "We are no longer the Knights who say Delhi India. "
    "We are now the Knights who say Ekki ekki ekki PTANG."
)

In [11]:
# Typical text-preprocessing pipeline:
# 1. Import the data
# 2. Tokenize the data
# 3. Remove stopwords and punctuation
# 4. Normalize case, e.g. Hello, hello, HEllo, HeLLO, HeLlO -> hello
# 5. Lemmatization / stemming / custom stopword removal

In [12]:
monty.word_counts  # maps each word, converted to lower case, to the number of times it appears

defaultdict(int,
            {'we': 2,
             'are': 2,
             'no': 1,
             'longer': 1,
             'the': 2,
             'knights': 2,
             'who': 2,
             'say': 2,
             'delhi': 1,
             'india': 1,
             'now': 1,
             'ekki': 3,
             'ptang': 1})

In [13]:
monty.word_counts['ekki']

3

In [14]:
monty.word_counts['Ekki']

0

In [15]:
monty.word_counts['PTANG']

0

In [16]:
monty.word_counts['ptang']

1

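If you access the frequencies this way, look the word up in lower case: `word_counts` lower-cases every word before counting, so `'ekki'` returns 3 while `'Ekki'` and `'PTANG'` return 0. Words that are not found have a frequency of 0.

The second way is to use the `count()` method, which ignores case by default; pass `case_sensitive=True` to count exact matches only.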

In [23]:
monty.words.count('Ekki', case_sensitive=True)

1

## Words Inflection and Lemmatization

Each word in `TextBlob.words` or `Sentence.words` is a `Word` object (a subclass of `str`; `unicode` on Python 2) with useful methods, e.g. for word inflection.

In [17]:
# Sample text mixing plural nouns, verbs and proper nouns for the inflection examples.
sentence = TextBlob(
    'Uses 4 spaces per indentation levels. '
    'We are no longer the Knights who says Delhi India Germany Bharat.'
)

In [18]:
sentence.words

WordList(['Uses', '4', 'spaces', 'per', 'indentation', 'levels', 'We', 'are', 'no', 'longer', 'the', 'Knights', 'who', 'says', 'Delhi', 'India', 'Germany', 'Bharat'])

In [19]:
sentence.words.singularize()

WordList(['Use', '4', 'space', 'per', 'indentation', 'level', 'We', 'are', 'no', 'longer', 'the', 'Knight', 'who', 'say', 'Delhi', 'Indium', 'Germany', 'Bharat'])

In [None]:
sentence.words[2].singularize()

'space'

In [27]:
sentence.words.pluralize()

WordList(['Usess', '4s', 'spacess', 'pers', 'indentations', 'levelss', 'Wes', 'ares', 'noes', 'longers', 'thes', 'Knightss', 'whoes', 'sayss', 'Delhis', 'Indias', 'Germanys', 'Bharats'])

In [26]:
sentence.words[-4:-1].pluralize()

WordList(['Delhis', 'Indias', 'Germanys'])

In [None]:
# Lemmatization -- always returns a root word (lemma) that is a valid dictionary word, so the meaning is retained.

# Stemming -- trims suffixes such as "es", "ses", "ed" from the end of a token, so the resulting word may not have any meaning.

In [None]:
# Lemmatization takes a POS tag -- NOUN by default; pass a different tag to lemmatize with respect to it (e.g. with the verb tag 'v', "went" becomes "go").

Words can be lemmatized by calling the lemmatize method.

In [None]:
# Attach the POS tag 'n' to the word when constructing it.
w = Word("stripes", pos_tag="n")

In [None]:
w.lemmatize()

'stripe'

In [None]:
w = Word("went")

In [None]:
w.lemmatize('v')  # pass the POS tag 'v' so "went" is lemmatized as a verb

'go'

## POS tagging

Part-of-speech tags can be accessed through the tags property.

In [28]:
# Rebuild the sample text (without the extra comma) for the POS-tagging examples.
zen = TextBlob(
    "Beautiful is better than ugly. "
    "Explicit is better than implicit. "
    "Simple is better than complex."
)

In [29]:
zen.tags

[('Beautiful', 'NNP'),
 ('is', 'VBZ'),
 ('better', 'JJR'),
 ('than', 'IN'),
 ('ugly', 'RB'),
 ('Explicit', 'NNP'),
 ('is', 'VBZ'),
 ('better', 'JJR'),
 ('than', 'IN'),
 ('implicit', 'NN'),
 ('Simple', 'NN'),
 ('is', 'VBZ'),
 ('better', 'JJR'),
 ('than', 'IN'),
 ('complex', 'JJ')]

In [None]:
# Print every (word, tag) pair as "word => TAG", lower-casing the word.
for word, pos in zen.tags:
    print(f"{word.lower()} => {pos}")

beautiful => NNP
is => VBZ
better => JJR
than => IN
ugly => RB
explicit => NNP
is => VBZ
better => JJR
than => IN
implicit => NN
simple => NN
is => VBZ
better => JJR
than => IN
complex => JJ


## Noun Phrase Extraction

Noun phrases are accessed through the `noun_phrases` property.

In [30]:
# Sample paragraph used for noun-phrase extraction.
#
# The text is assembled from adjacent string literals, which Python joins at
# compile time. A backslash continuation *inside* one literal would instead
# keep the next line's leading indentation as part of the text, leaving long
# runs of spaces between words; here every word is separated by one space.
document = (
    'In computer science, artificial intelligence (AI), '
    'sometimes called machine intelligence, is intelligence '
    'demonstrated by machines, in contrast to the natural intelligence '
    'displayed by humans and animals. Computer science defines AI '
    'research as the study of "intelligent agents": any device that '
    'perceives its environment and takes actions that maximize its '
    'chance of successfully achieving its goals.[1] Colloquially, '
    'the term "artificial intelligence" is used to describe machines '
    'that mimic "cognitive" functions that humans associate with other '
    'human minds, such as "learning" and "problem solving".[2]'
)

In [31]:
# Wrap the paragraph in a TextBlob and print each extracted noun phrase on its own line.
text_blob_object = TextBlob(document)
for noun_phrase in text_blob_object.noun_phrases:
    print(noun_phrase)

computer science
artificial intelligence
ai
machine intelligence
natural intelligence
computer
science defines
ai
intelligent agents
colloquially
artificial intelligence
describe machines
human minds


## Spelling Correction
Use the correct() method to attempt spelling correction.

Spelling correction is based on Peter Norvig’s “How to Write a Spelling Corrector” as implemented in the pattern library. It is about 70% accurate.

In [33]:
# Deliberately misspelled text for the correction demo.
b = TextBlob(
    "I havv written goood speling!. speling corection is based. "
    "howw tooo writt a speling corect "
)

# Correct the blob one sentence at a time and print each result.
for i in b.sentences:
    print(i.correct())


I have written good spelling!.
spelling correction is based.
how took write a spelling correct


Word objects have a `Word.spellcheck()` method that returns a list of (word, confidence) tuples with spelling suggestions.

In [None]:
from textblob import Word
w = Word('falibility')

w.spellcheck()  # list of (suggestion, confidence) tuples; here the single suggestion has confidence 1.0

[('fallibility', 1.0)]

In [None]:
# Misspelled text again, this time inspected word by word.
b = TextBlob(
    "I havv goood speling!. speling corection is based. "
    "howw tooo writt a speling corect "
)

# Print the (word, confidence) suggestions for every word in the blob.
for i in b.words:
    print(i.spellcheck())

[('I', 1.0)]
[('have', 1.0)]
[('good', 1.0)]
[('spelling', 1.0)]
[('spelling', 1.0)]
[('correction', 1.0)]
[('is', 1.0)]
[('based', 1.0)]
[('how', 0.9924528301886792), ('howe', 0.004528301886792453), ('howl', 0.0030188679245283017)]
[('took', 0.5079787234042553), ('too', 0.4858156028368794), ('tool', 0.0062056737588652485)]
[('write', 0.4777777777777778), ('wrist', 0.37777777777777777), ('writ', 0.08333333333333333), ('writs', 0.06111111111111111)]
[('a', 1.0)]
[('spelling', 1.0)]
[('correct', 1.0)]


## Translation and Language Detection

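One of the most powerful capabilities of the TextBlob library is translating text from one language to another. On the backend, the TextBlob language translator uses the __Google Translate API__, so these cells need network access. Note: `translate()` and `detect_language()` were deprecated in TextBlob 0.16.0 and removed in 0.18.0, so the cells in this section only run on older TextBlob releases.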



https://cloud.google.com/translate/docs/languages

In [20]:
# English source sentence for the translation demo (no u'' prefix needed on Python 3).
en_blob = TextBlob('Simple is better than complex.')

In [22]:
en_blob.translate(to = 'es')  # es == Spanish

TextBlob("Lo simple es mejor que lo complejo.")

In [23]:
chinese_blob = TextBlob("美丽优于丑陋")

# Name the source language explicitly instead of relying on detection.
chinese_blob.translate(from_lang="zh-CN", to='en')  # zh-CN == Chinese (Simplified), en == English

TextBlob("Beauty is better than ugly")

You can also attempt to detect a TextBlob’s language using TextBlob.detect_language().

In [25]:
b = TextBlob("بسيط هو أفضل من مجمع")
b.detect_language()

'ar'

In [28]:
b = TextBlob("tumi kemon aachon")
b.detect_language()

'bn'

In [41]:
b = TextBlob("क्या हाल है")
b.detect_language()

'hi'

In [42]:
b.translate(from_lang="hi", to='en')

TextBlob("How are you")

In [29]:
# Short phrases in several different languages, used to exercise language
# detection followed by translation into English.
st =  ["क्या हाल है" ,"توهان ڪيئن آهيو" ,"আপনি কেমন আছেন" ,"நீங்கள் எப்படி" ,"இருக்கிறீர்கள்" ,"तिमीलाई कस्तो छ" ,"તમે કેમ છો" ,"तू कसा आहेस ","ന്തൊക്കെയുണ്ട്"]

In [30]:
# For each phrase: detect its language code, print it, then print the
# English translation obtained by passing that code as the source language.
for i in st:
    a = TextBlob(i)
    t = a.detect_language()  # language code string, e.g. 'hi'
    print(t)
    print(a.translate(from_lang=t, to='en'))

hi
How are you
sd
how are you
bn
How are you
ta
How are you
ta
You are
ne
How are you
gu
how are you
mr
How are you
ml
And so on


## n-grams

An n-gram is a contiguous sequence of n words in a sentence. For instance, for the sentence "I love watching football", the 2-grams are (I love), (love watching) and (watching football).

N-Grams can play a crucial role in text classification.

The `TextBlob.ngrams()` method returns a list of `WordList` objects, each holding n successive words.

In [46]:
blob = TextBlob("Now is better than never.")

In [47]:
blob.ngrams(n=3)

[WordList(['Now', 'is', 'better']),
 WordList(['is', 'better', 'than']),
 WordList(['better', 'than', 'never'])]

## WordNet Integration

WordNet is a database of English words that are linked together by their semantic relationships. It is like a supercharged dictionary/thesaurus with a graph structure.

TextBlob 0.7 now integrates __NLTK's WordNet__ interface, making it very simple to interact with WordNet.

### Synsets
As you know, synonyms are words that have similar meanings. A synonym set, or synset, is a group of synonyms. A synset, therefore, corresponds to an abstract concept.

In TextBlob, you can access the synsets that a word belongs to by accessing the synsets property of a Word object.

In [48]:
# Word is already imported at the top of the notebook.
word = Word("pass")
word.synsets

[Synset('base_on_balls.n.01'),
 Synset('pass.n.02'),
 Synset('pass.n.03'),
 Synset('pass.n.04'),
 Synset('pass.n.05'),
 Synset('pass.n.06'),
 Synset('pass.n.07'),
 Synset('pass.n.08'),
 Synset('pass.n.09'),
 Synset('pass.n.10'),
 Synset('bye.n.01'),
 Synset('pass.n.12'),
 Synset('pass.n.13'),
 Synset('crack.n.09'),
 Synset('pass.n.15'),
 Synset('passing.n.07'),
 Synset('pas.n.01'),
 Synset('pass.v.01'),
 Synset('travel_by.v.01'),
 Synset('legislate.v.01'),
 Synset('elapse.v.01'),
 Synset('pass.v.05'),
 Synset('run.v.03'),
 Synset('pass.v.07'),
 Synset('happen.v.01'),
 Synset('pass.v.09'),
 Synset('spend.v.01'),
 Synset('guide.v.05'),
 Synset('communicate.v.01'),
 Synset('evanesce.v.01'),
 Synset('pass.v.14'),
 Synset('exceed.v.02'),
 Synset('pass.v.16'),
 Synset('pass.v.17'),
 Synset('pass.v.18'),
 Synset('sink.v.03'),
 Synset('pass.v.20'),
 Synset('fall.v.21'),
 Synset('pass.v.22'),
 Synset('authorize.v.01'),
 Synset('die.v.01'),
 Synset('excrete.v.01'),
 Synset('passing.a.02')]

In [49]:
word.definitions

['(baseball) an advance to first base by a batter who receives four balls',
 '(military) a written leave of absence',
 '(American football) a play that involves one player throwing the ball to a teammate',
 'the location in a range of mountains of a geological formation that is lower than the surrounding peaks',
 'any authorization to pass or go somewhere',
 'a document indicating permission to do something without restrictions',
 'a flight or run by an aircraft over a target',
 'a bad or difficult situation or state of affairs',
 'a difficult juncture',
 'one complete cycle of operations (as by a computer)',
 'you advance to the next round in a tournament without playing an opponent',
 'a permit to enter or leave a military installation',
 'a complimentary ticket',
 'a usually brief attempt',
 '(sports) the act of throwing the ball to another member of your team',
 'success in satisfying a test or requirement',
 '(ballet) a step in dancing (especially in classical ballet)',
 'go acros

The synonyms contained within a synset are called lemmas. You can access the string versions of these synonyms via a Synset's lemma_names property.

## Converting to Upper and Lowercase
TextBlob objects are very similar to strings. You can convert them to upper case or lower case, change their values, and concatenate them together as well. In the following script, we convert the text from the TextBlob object to upper case:

In [None]:
# Upper-case the whole blob and print the result.
text = "I love to watch football, but I have never played it"
text_blob_object = TextBlob(text)
print(text_blob_object.upper())

I LOVE TO WATCH FOOTBALL, BUT I HAVE NEVER PLAYED IT


In [None]:
# Lower-case the whole blob and print the result.
text = "I LOVE TO WATCH FOOTBALL, BUT I HAVE NEVER PLAYED IT"
text_blob_object = TextBlob(text)
print(text_blob_object.lower())

i love to watch football, but i have never played it


## Sentiment Analysis

The sentiment property returns a named tuple of the form Sentiment(polarity, subjectivity). 

Polarity is a float value within the range [-1.0 to 1.0] where 

    0 indicates neutral, 
    +1 indicates a very positive sentiment and 
    -1 represents a very negative sentiment.

Subjectivity is a float value within the range [0.0 to 1.0] where 

    0.0 is very objective and 
    1.0 is very subjective. 


**Polarity score  [- 1, 1 ]** 

1.   closer to -1 -- negative sentiment
2.   closer to +1 - positive sentiment



**Subjectivity Score  [0,1 ]** 
 

1.   closer to 1 means more personal opinion
2.   closer to 0 means more factual information

In [126]:
TextBlob("so the two together did the job").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [35]:
# Two sample sentences to compare: a plain statement and one containing the word "good".
testimonial1 = TextBlob("so the two together did the job")
# testimonial2 = TextBlob("wood is dark brown in color ")
testimonial4 = TextBlob("today is a good day")

In [36]:
# Each sentiment is a Sentiment(polarity, subjectivity) named tuple.
# Note: the line labelled "Sentiment 2" reports testimonial4.
print('Sentiment 1: ', testimonial1.sentiment)
print('Sentiment 2: ', testimonial4.sentiment)

#print('Polarity: ', testimonial1.sentiment.polarity)

Sentiment 1:  Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment 2:  Sentiment(polarity=0.7, subjectivity=0.6000000000000001)


In [37]:
print('Polarity: ', testimonial1.sentiment.polarity)

Polarity:  0.0


In [38]:
print('subjectivity: ', testimonial1.sentiment.subjectivity)

subjectivity:  0.0


In [39]:
from textblob.classifiers import NaiveBayesClassifier

In [41]:
# Training data for the classifier: a list of (text, label) tuples, where
# the first item is the sentence and the second is its tag ('pos' or 'neg').
train = [
    ("I love this sandwich.", "pos"),
    ("this is an amazing place!", "pos"),
    ("I feel very good about these beers.", "pos"),
    ("this is my best work.", "pos"),
    ("what an awesome view", "pos"),
    ("I do not like this restaurant", "neg"),
    ("I am tired of this stuff.", "neg"),
    ("I can't deal with this", "neg"),
    ("he is my sworn enemy!", "neg"),
    ("my boss is horrible.", "neg"),
]

In [42]:
# Held-out (text, label) tuples used to measure classifier accuracy.
test = [
    ("the beer was good.", "pos"),
    ("I do not enjoy my job", "neg"),
    ("I ain't feeling dandy today.", "neg"),
    ("I feel amazing!", "pos"),
    ("Gary is a friend of mine.", "pos"),
    ("I can't believe I'm doing this.", "neg"),
]

In [43]:
cl = NaiveBayesClassifier(train)  # build the classifier from the (text, label) training tuples

In [5]:
# Loading Data from Files
# You can also load data from common file formats including CSV, JSON, and TSV.

# CSV files should be formatted like so:

# I love this sandwich.,pos
# This is an amazing place!,pos
# I do not like this restaurant,neg

In [47]:
# Classifying Text
# Call the classify(text) method to use the classifier.

cl.classify("This is best library!")

'pos'

In [48]:
# You can get the label probability distribution with the prob_classify(text) method.

In [49]:
prob_dist = cl.prob_classify("This one's a doozy.")

In [10]:
round(prob_dist.prob("pos"), 2)

0.63

In [51]:
round(prob_dist.prob("neg"), 2)

0.37

In [52]:
# Evaluating Classifiers
# To compute the accuracy on our test set, use the accuracy(test_data) method.

cl.accuracy(test)

0.8333333333333334

### Updating Classifiers with New Data

In [58]:
# Extra labelled (text, label) examples used to update the trained classifier.
new_data = [
    ("She is my best friend.", "pos"),
    ("I'm happy to have a new friend.", "pos"),
    ("Stay thirsty, my friend.", "pos"),
    ("He ain't from around here.", "neg"),
]

In [59]:
cl.update(new_data)

True

In [60]:
cl.accuracy(test)

1.0

In [61]:
# NOTE(review): the recorded output is 'pos' even though the sentence reads as
# negative -- presumably because "worst" never occurs in the small training
# data while the other words appear in positive examples; verify.
cl.classify("This is an worst library!")

'pos'