#### Basics of Natural Language Processing

[Ref](https://docs.python.org/3/library/re.html)

In [1]:
import re # regular expression
import nltk # natural language toolkit

In [2]:
# Fetch the NLTK data packages this notebook relies on. Per the log below,
# a package that is already present is reported as up to date.
nltk.download('wordnet')  # WordNet data; presumably what the WordNetLemmatizer below needs
nltk.download('punkt')  # presumably the tokenizer models behind nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag
nltk.download('stopwords')  # stop-word lists; not used by the cells shown here

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Basics of regular expressions

In [3]:
# Sample sentence mixing upper/lower-case letters, digits and punctuation;
# the string-method and regex cells below all operate on it.
text = "I am in Data Science, session BY Avantika!!! Starts @ 7.30 PM IST...."
text

'I am in Data Science, session BY Avantika!!! Starts @ 7.30 PM IST....'

In [4]:
# Convert every character to lower case (returns a new string; `text` itself is unchanged)
text.lower()

'i am in data science, session by avantika!!! starts @ 7.30 pm ist....'

In [5]:
# Convert every character to upper case (returns a new string; `text` itself is unchanged)
text.upper()

'I AM IN DATA SCIENCE, SESSION BY AVANTIKA!!! STARTS @ 7.30 PM IST....'

In [6]:
# Replace every upper-case ASCII letter (A-Z) with "x".
# The r'' prefix marks a raw string literal (backslashes are not treated as
# escapes); it is the conventional way to write regex patterns.
re.sub(r'[A-Z]','x',text)

'x am in xata xcience, session xx xvantika!!! xtarts @ 7.30 xx xxx....'

In [7]:
# Replace every lower-case ASCII letter (a-z) with "x"
re.sub(r'[a-z]','x',text)

'I xx xx Dxxx Sxxxxxx, xxxxxxx BY Axxxxxxx!!! Sxxxxx @ 7.30 PM IST....'

In [8]:
# Remove every digit (0-9); other characters, such as the "." in "7.30", are kept
re.sub(r'[0-9]','',text)

'I am in Data Science, session BY Avantika!!! Starts @ . PM IST....'

In [9]:
# Keep only upper-case letters: every other character ([^A-Z]) is replaced by a space
re.sub(r'[^A-Z]',' ',text)

'I       D    S                BY A           S             PM IST    '

In [10]:
# Keep only upper-case letters: every other character ([^A-Z]) is deleted outright
re.sub(r'[^A-Z]','',text)

'IDSBYASPMIST'

In [11]:
# Replace every character that is not an ASCII letter or digit with a space
re.sub(r'[^A-Za-z0-9]',' ',text)

'I am in Data Science  session BY Avantika    Starts   7 30 PM IST    '

### Tokenization
- list of words

In [12]:
# New sample sentence for the tokenization, stemming and lemmatization cells below.
# NOTE: this rebinds `text`, so the regex cells above would act on this
# sentence if they were re-run after this cell.
text = 'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.'
text

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.'

- convert all characters into lower case 
- remove special characters
- convert the text into word tokens

In [13]:
# Steps 1 and 2 in a single expression: lower-case the sentence first, then
# replace every character that is not a lower-case letter or a digit with a
# space (so punctuation followed by a space leaves two spaces behind).
text_clean = re.sub(r'[^a-z0-9]', ' ', text.lower())
print(text_clean)

a very  very  very slow moving  aimless movie about a distressed  drifting young man 


In [14]:
# Naive tokenization: split(' ') cuts at every single space, so each pair of
# adjacent spaces left by the cleaning step yields an empty string ''.
print(text_clean.split(' '))

['a', 'very', '', 'very', '', 'very', 'slow', 'moving', '', 'aimless', 'movie', 'about', 'a', 'distressed', '', 'drifting', 'young', 'man', '']


In [15]:
# Tokenize with NLTK instead; in the output below the empty strings are gone.
tokens = nltk.word_tokenize(text_clean)
print(tokens)

['a', 'very', 'very', 'very', 'slow', 'moving', 'aimless', 'movie', 'about', 'a', 'distressed', 'drifting', 'young', 'man']


In [16]:

 ' '.join(tokens)  # glue the tokens back together into one single-space-separated string

'a very very very slow moving aimless movie about a distressed drifting young man'

### Stemming

In [17]:
from nltk.stem import SnowballStemmer #, LancasterStemmer, PorterStemmer

In [18]:
# Build a Snowball stemmer configured for English
snowball = SnowballStemmer(language='english')

In [19]:
word ='caring cared care' # three inflections of one word; the stem hoped for is "care"

In [20]:
# NOTE: the whole string is passed as ONE token, so (per the output below) only
# its tail changes: 'care' -> 'car', while 'caring' and 'cared' are untouched.
# To stem each word, call stem() word by word, as the next cell does for `tokens`.
snowball.stem(word)

'caring cared car'

In [21]:
# Stem every token individually. A comprehension replaces the manual
# append loop and leaves no throwaway loop variable behind in the kernel.
stem_words = [snowball.stem(token) for token in tokens]

print(stem_words)
print(" ".join(stem_words))

['a', 'veri', 'veri', 'veri', 'slow', 'move', 'aimless', 'movi', 'about', 'a', 'distress', 'drift', 'young', 'man']
a veri veri veri slow move aimless movi about a distress drift young man


### Lemmatization
- reduces a word to its root (dictionary) form

In [22]:
# WordNet-based lemmatizer, reached through the `nltk.stem` package (the
# documented home of WordNetLemmatizer) instead of the `nltk.wordnet` alias.
lema = nltk.stem.WordNetLemmatizer()

WordNet part-of-speech codes accepted by the `pos` argument of `lemmatize`:

- n - noun
- v - verb
- r - adverb
- a - adjective

In [23]:
# Lemmatize 'caring' as a verb (pos='v'); the result shown below is 'care'
lema.lemmatize('caring',pos='v')

'care'

### POS tagging -> part-of-speech tagging

In [24]:
# Tag each token with its part of speech; the result (printed in the next cell) is a list of (token, tag) pairs
pos_list = nltk.pos_tag(tokens)

In [25]:
# Show the (token, tag) pairs; the tag abbreviations are explained in the tag table further down
print(pos_list)

[('a', 'DT'), ('very', 'RB'), ('very', 'RB'), ('very', 'RB'), ('slow', 'JJ'), ('moving', 'VBG'), ('aimless', 'JJ'), ('movie', 'NN'), ('about', 'IN'), ('a', 'DT'), ('distressed', 'JJ'), ('drifting', 'NN'), ('young', 'JJ'), ('man', 'NN')]


In [26]:
# Lemmatize every token using the part of speech assigned by the tagger.
# The lemmatizer takes the one-letter codes 'v', 'a', 'r' and 'n', so each tag
# is mapped through its first letter; any other tag falls back to noun ('n').
# NOTE: 'RP' (particle) also starts with 'R' and is therefore lemmatized as an
# adverb.
tag_initial_to_wordnet = {'V': 'v', 'J': 'a', 'R': 'r'}

# A comprehension keeps the loop variables local, so the `word` string defined
# for the stemming demo is not overwritten when this cell runs.
# tag[:1] (rather than tag[0]) also tolerates an empty tag.
lema_list = [
    lema.lemmatize(token, pos=tag_initial_to_wordnet.get(tag[:1], 'n'))
    for token, tag in pos_list
]

In [27]:
# Join the lemmatized tokens back into one space-separated sentence
" ".join(lema_list)

'a very very very slow move aimless movie about a distressed drifting young man'

    CC coordinating conjunction
    CD cardinal digit
    DT determiner
    EX existential there (like: "there is" ... think of it like "there exists")
    FW foreign word
    IN preposition/subordinating conjunction
    JJ adjective 'big'
    JJR adjective, comparative 'bigger'
    JJS adjective, superlative 'biggest'
    LS list marker 1)
    MD modal could, will
    NN noun, singular 'desk'
    NNS noun plural 'desks'
    NNP proper noun, singular 'Harrison'
    NNPS proper noun, plural 'Americans'
    PDT predeterminer 'all the kids'
    POS possessive ending parent's
    PRP personal pronoun I, he, she
    PRP$ possessive pronoun my, his, hers
    RB adverb very, silently,
    RBR adverb, comparative better
    RBS adverb, superlative best
    RP particle give up
    TO to go 'to' the store.
    UH interjection errrrrrrrm
    VB verb, base form take
    VBD verb, past tense took
    VBG verb, gerund/present participle taking
    VBN verb, past participle taken
    VBP verb, sing. present, non-3rd person take
    VBZ verb, 3rd person sing. present takes
    WDT wh-determiner which
    WP wh-pronoun who, what
    WP$ possessive wh-pronoun whose
    WRB wh-adverb where, when