## Natural Language Processing

In [1]:
from textblob import TextBlob


In [2]:
# Sample text (two sentences) used by the TextBlob examples below.
text = 'Today is the first day of the rest of your long life. Make it count'
# Wrap the raw string in a TextBlob to get access to the NLP properties.
blob = TextBlob(text)
blob

TextBlob("Today is the first day of the rest of your long life. Make it count")

In [3]:
# Split the text into Sentence objects.
blob.sentences

[Sentence("Today is the first day of the rest of your long life."),
 Sentence("Make it count")]

In [4]:
# Tokenize into words; as the output shows, punctuation is not included.
blob.words


WordList(['Today', 'is', 'the', 'first', 'day', 'of', 'the', 'rest', 'of', 'your', 'long', 'life', 'Make', 'it', 'count'])

In [5]:
# Part-of-speech tagging: a list of (word, tag) tuples.
blob.tags

[('Today', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('day', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('rest', 'NN'),
 ('of', 'IN'),
 ('your', 'PRP$'),
 ('long', 'JJ'),
 ('life', 'NN'),
 ('Make', 'VB'),
 ('it', 'PRP'),
 ('count', 'VB')]

In [67]:
# Noun phrases found in the text.
blob.noun_phrases


WordList(['long life'])

#### Polarity indicates sentiment as a value from -1.0 (negative) to 1.0 (positive), with 0.0 being neutral. Subjectivity is a value from 0.0 (objective) to 1.0 (subjective).

In [68]:
# Sentiment of the whole text as a (polarity, subjectivity) named tuple.
blob.sentiment

Sentiment(polarity=0.1, subjectivity=0.3666666666666667)

In [69]:
# IPython magic: display floats with three decimal places from here on.
%precision 3

'%.3f'

In [70]:
# Polarity component only (shown with the 3-decimal display format).
blob.sentiment.polarity

0.100

In [71]:
# Subjectivity component only (shown with the 3-decimal display format).
blob.sentiment.subjectivity

0.367

In [72]:
# Report the sentiment of each sentence separately, one per line.
for sentence in blob.sentences:
    print(sentence.sentiment)

Sentiment(polarity=0.1, subjectivity=0.3666666666666667)
Sentiment(polarity=0.0, subjectivity=0.0)


#### By default TextBlob uses a PatternAnalyzer, which uses the same sentiment analysis as in the Pattern library.  TextBlob also has a NaiveBayesAnalyzer, which was trained on a movie review dataset.  Naive Bayes is a commonly used machine learning text-classification algorithm.  

#### The NaiveBayesAnalyzer returns a classification (`pos` or `neg`) together with the probabilities `p_pos` and `p_neg`. These are values between 0 and 1 (not percentages) that sum to 1.

In [73]:
from textblob.sentiments import NaiveBayesAnalyzer

In [74]:
# Re-create the blob over the same text, swapping the default sentiment
# analyzer for the Naive Bayes one.
nb_analyzer = NaiveBayesAnalyzer()
blob = TextBlob(text, analyzer=nb_analyzer)
blob

TextBlob("Today is the first day of the rest of your long life. Make it count")

In [75]:
# With the Naive Bayes analyzer the result is (classification, p_pos, p_neg).
blob.sentiment

Sentiment(classification='pos', p_pos=0.7655912735476013, p_neg=0.234408726452398)

In [76]:
# For each sentence, show its text followed by its Naive Bayes sentiment.
for sentence in blob.sentences:
    print(sentence, sentence.sentiment, sep='\n')

Today is the first day of the rest of your long life.
Sentiment(classification='pos', p_pos=0.8117212577992605, p_neg=0.1882787422007389)
Make it count
Sentiment(classification='neg', p_pos=0.43102968235521993, p_neg=0.5689703176447805)


## Language Detection using TextBlob
#### NOTE: TextBlob has deprecated detect_language() and translate(), and the calls below no longer work. Use the Google Translate API instead (see the next section).

In [78]:
# TextBlob's detect_language() is deprecated and no longer works; the call is
# kept only to demonstrate the failure.  The exception is caught and shown as
# "<Type>: <message>" so that "Restart Kernel -> Run All" can continue past
# this cell instead of stopping here.
try:
    print(blob.detect_language())
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

HTTPError: HTTP Error 400: Bad Request

In [79]:
# TextBlob's translate() is deprecated and no longer works; the call is kept
# only to demonstrate the failure.  The exception is caught and shown as
# "<Type>: <message>" so a full notebook run is not interrupted.
# Note: `spanish` is only defined if the call unexpectedly succeeds.
try:
    spanish = blob.translate(to='es')
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'list' object has no attribute 'strip'

In [80]:
# `spanish` exists only if the earlier blob.translate(to='es') call succeeded;
# otherwise this raises NameError.  Either way the error is caught and shown
# as "<Type>: <message>" so a full notebook run is not interrupted.
try:
    print(spanish.detect_language())
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

NameError: name 'spanish' is not defined

In [81]:
# Same deprecated translate() call with a different target language; kept
# only to demonstrate the failure.  The exception is caught and shown as
# "<Type>: <message>" so a full notebook run is not interrupted.
try:
    print(blob.translate(to='zh'))
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'list' object has no attribute 'strip'

# Language Detection and Translation using Google Translate

### Google Translate API
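#### Install from the Anaconda prompt (Windows) or Terminal (Mac):

    conda install googletrans

##### or

    pip install googletrans

#### If translation then fails (for example with `AttributeError: 'NoneType' object has no attribute 'group'`), try replacing the installed version with this pinned release, then restart the kernel:

    pip3 uninstall googletrans
    pip3 install googletrans==3.1.0a0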

In [82]:
import googletrans

# LANGUAGES maps language codes to language names, e.g. 'af' -> 'afrikaans'.
print(googletrans.LANGUAGES)

{'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'he': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'lo': 'lao', 'la': 'lat

In [83]:
from googletrans import Translator
# Single Translator instance used by the translation/detection cells below.
translator = Translator()

In [84]:
# `Translator` was imported and `translator` created in the previous cell,
# so neither is repeated here.

# Spanish -> English with the source language named explicitly.
ar = translator.translate('Hola que tal', src='es', dest='en')
print(ar)       # repr of the whole result object
print(ar.text)  # just the translated string

# No `src` given: the source language is left to the library to work out.
ar = translator.translate('이 문장은 한글로 쓰여졌습니다.', dest='en')
ar.text

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# Translate a Latin phrase; src='la' names the source language explicitly.
# No `dest` is passed, so the library's default target language is used.
ar = translator.translate('veritas lux mea', src='la')
ar.text

In [None]:
# Detect the language of a Korean sentence; `.lang` is the detected code.
lang = translator.detect('이 문장은 한글로 쓰여졌습니다.')
lang.lang

#### The `confidence` attribute of the result returned by `translator.detect()` indicates how confident the Google Translate API (Application Programming Interface) is in the language it detected for the given text (not in a translation). It is a floating-point number between 0 and 1, where 0 indicates low confidence and 1 indicates high confidence.

In [None]:
# Confidence reported for the language detection above.
lang.confidence

In [None]:
# Detect the language of a phrase, then map the detected code to its
# human-readable name through the LANGUAGES dictionary.
lang = translator.detect("carpe diem")
googletrans.LANGUAGES[lang.lang]

In [None]:
# Confidence reported for the "carpe diem" detection.
lang.confidence

## Word and WordLists support converting words to singular or plural forms

In [85]:
from textblob import Word

In [86]:
# A singular noun wrapped in a Word, used for the pluralize example.
dog = Word('dog')

In [87]:
# Singular -> plural.
dog.pluralize()

'dogs'

In [88]:
# A plural noun wrapped in a Word, used for the singularize example.
dogs = Word('dogs')

In [89]:
# Plural -> singular.
dogs.singularize()

'dog'

In [90]:
# A misspelled word to run through the spell checker in the next cells.
word = Word('eith')
# IPython magic: display floats with two decimal places from here on.
%precision 2

'%.2f'

In [91]:
# Candidate corrections as (word, confidence) tuples.
word.spellcheck()

[('with', 1.00), ('keith', 0.00), ('edith', 0.00)]

In [92]:
# Corrected spelling; here it matches the top-ranked spellcheck() candidate.
word.correct()

'with'

In [93]:
word = Word('strawberries')
# Stemming trims the ending; the result need not be a real word (see output).
word.stem()

'strawberri'

In [94]:
# Lemmatizing returns the word's base (dictionary) form instead.
word.lemmatize()

'strawberry'

In [95]:
word = Word('place')
# All dictionary definitions of the word, one string per sense.
word.definitions

['a point located with respect to surface features of some region',
 'any area set aside for a particular purpose',
 'an abstract mental location',
 'a general vicinity',
 'the post or function properly or customarily occupied or served by another',
 'a particular situation',
 'where you live at a particular time',
 'a job in an organization',
 'the particular portion of space occupied by something',
 'proper or designated social situation',
 'a space reserved for sitting (as in a theater or on a train or airplane)',
 'the passage that is being read',
 'proper or appropriate position or location',
 'a public square with room for pedestrians',
 'an item on a list or in a sequence',
 'a blank area',
 'put into a certain place or abstract location',
 'place somebody in a particular situation or location',
 'assign a rank or rating to',
 'assign a location to',
 'to arrange for',
 'take a place in a competition; often followed by an ordinal',
 'intend (something) to move towards a certain 

In [96]:
# Synsets (synonym sets) for "place"; the output includes both noun (.n.)
# and verb (.v.) senses.
word.synsets

[Synset('topographic_point.n.01'),
 Synset('place.n.02'),
 Synset('place.n.03'),
 Synset('place.n.04'),
 Synset('stead.n.01'),
 Synset('place.n.06'),
 Synset('home.n.01'),
 Synset('position.n.06'),
 Synset('position.n.01'),
 Synset('place.n.10'),
 Synset('seat.n.01'),
 Synset('place.n.12'),
 Synset('place.n.13'),
 Synset('plaza.n.01'),
 Synset('place.n.15'),
 Synset('space.n.07'),
 Synset('put.v.01'),
 Synset('place.v.02'),
 Synset('rate.v.01'),
 Synset('locate.v.03'),
 Synset('place.v.05'),
 Synset('place.v.06'),
 Synset('target.v.01'),
 Synset('identify.v.01'),
 Synset('place.v.09'),
 Synset('set.v.09'),
 Synset('place.v.11'),
 Synset('place.v.12'),
 Synset('invest.v.01'),
 Synset('station.v.01'),
 Synset('place.v.15'),
 Synset('place.v.16')]

In [97]:
import nltk
# Fetch the stop-word corpus into the local nltk_data directory; needed
# before stopwords.words() can be used.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/palumbo.giancarlo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [98]:
from nltk.corpus import stopwords

In [99]:
# English stop words as a plain list of lower-case strings.
stops = stopwords.words('english')
# NOTE(review): displaying the whole list produces very long output;
# consider showing a slice such as stops[:10] instead.
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [100]:
blob = TextBlob('Today is a beautiful day')
# The stop-word list is all lower-case, so compare on the lower-cased word;
# otherwise a capitalised stop word (e.g. "The" at the start of a sentence)
# would not be filtered out.  A set makes each membership test O(1).
stop_set = set(stops)
[word for word in blob.words if word.lower() not in stop_set]

['Today', 'beautiful', 'day']

#### `ngrams()` returns a list of WordList n-grams, of length three by default. N-grams can be used to identify letters or words that frequently appear adjacent to one another.

In [101]:
# No length is passed, so the default n-gram size (three words) is used.
blob.ngrams()

[WordList(['Today', 'is', 'a']),
 WordList(['is', 'a', 'beautiful']),
 WordList(['a', 'beautiful', 'day'])]