In [None]:
!pip install spacy==3.0
!pip install nltk
!pip install textacy
!pip install thinc

# **Check impure data**

In [3]:
text = """
After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)
it got me thinking about the best match ups.
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)
Captain America<lb>"""

print(text)


After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)
it got me thinking about the best match ups.
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)
Captain America<lb>


In [4]:
import re

RE_SUSPICIOUS = re.compile(r'[&#:<>{}\[\]\\]')

## check how much special chars in corpus
def text_impurity(text, min_len=10):
    """returns the share of suspicious characters in a text"""
    if text == None or len(text) < min_len:
        return 0
    else:

      return len(RE_SUSPICIOUS.findall(text))/len(text)

print(text_impurity(text))

0.0990990990990991


# **Normalise text data , convert unicode into ASCII**

In [5]:
text = "The café “Saint-Raphaël” is loca-\nted on Côte dʼAzur."

In [6]:
import textacy
import textacy.preprocessing as tprep

if textacy.__version__ < '0.11':
    def normalize(text):
        text = tprep.normalize_hyphenated_words(text)
        text = tprep.normalize_quotation_marks(text)
        text = tprep.normalize_unicode(text)
        text = tprep.remove_accents(text)
        return text

else:
    # adjusted to textacy 0.11
    def normalize(text):
        text = tprep.normalize.hyphenated_words(text)
        text = tprep.normalize.quotation_marks(text)
        text = tprep.normalize.unicode(text)
        text = tprep.remove.accents(text)
        return text

In [7]:
print(normalize(text))

The cafe "Saint-Raphael" is located on Cote d'Azur.


In [8]:
text = """
2019-08-10 23:32: @pete/@louis - I don't have a well-designed 
solution for today's problem. The code of module AC68 should be -1. 
Have to think a bit... #goodnight ;-) 😩😬"""

In [9]:
tokens = re.findall(r'\w\w+', text)
print(*tokens, sep='|')

2019|08|10|23|32|pete|louis|don|have|well|designed|solution|for|today|problem|The|code|of|module|AC68|should|be|Have|to|think|bit|goodnight


In [10]:
RE_TOKEN = re.compile(r"""
               ( [#]?[@\w'’\.\-\:]*\w     # words, hash tags and email adresses
               | [:;<]\-?[\)\(3]          # coarse pattern for basic text emojis
               | [\U0001F100-\U0001FFFF]  # coarse code range for unicode emojis
               )
               """, re.VERBOSE)

def tokenize(text):
    return RE_TOKEN.findall(text)

tokens = tokenize(text)
print(*tokens, sep='|')

2019-08-10|23:32|@pete|@louis|I|don't|have|a|well-designed|solution|for|today's|problem|The|code|of|module|AC68|should|be|-1|Have|to|think|a|bit|#goodnight|;-)|😩|😬


# **Tokenize text**

In [None]:
!python -m spacy download en_core_web_sm

In [13]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [14]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fa3435067d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fa343519fb0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fa3438b1d70>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fa3438b1ec0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fa3434bce10>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fa343443b90>)]

In [15]:
nlp = spacy.load("en_core_web_sm")
text = "I love NLP , it is a branch of Artificial Intelligence"
doc = nlp(text)

In [16]:
for token in doc:
  print(token , end="|")


I|love|NLP|,|it|is|a|branch|of|Artificial|Intelligence|

In [19]:
import pandas as pd

def tokens_attributes(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df

In [20]:
tokens_attributes(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,I,I,True,True,PRON,nsubj,,O
1,love,love,False,True,VERB,ccomp,,O
2,NLP,NLP,False,True,PROPN,dobj,ORG,B
4,it,it,True,True,PRON,nsubj,,O
5,is,be,True,True,AUX,ROOT,,O
6,a,a,True,True,DET,det,,O
7,branch,branch,False,True,NOUN,attr,,O
8,of,of,True,True,ADP,prep,,O
9,Artificial,Artificial,False,True,PROPN,compound,ORG,B
10,Intelligence,Intelligence,False,True,PROPN,pobj,ORG,I


**Customize tokens**

In [21]:
text = "@Pete: choose low-carb #food #eat-smart. _url_ ;-) 😋👍"
nlp = spacy.load('en_core_web_sm') ###
doc = nlp(text)

for token in doc:
    print(token, end="|")

@Pete|:|choose|low|-|carb|#|food|#|eat|-|smart|.|_|url|_|;-)|😋|👍|

custmize tokens will consider below as one tokens

'low-carb' ,
'#food' ,
'#eat-smart' , '_url_'

In [22]:
import re ###
import spacy ###
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    
    # use default patterns except the ones matched by re.search
    prefixes = [pattern for pattern in nlp.Defaults.prefixes 
                if pattern not in ['-', '_', '#']]
    suffixes = [pattern for pattern in nlp.Defaults.suffixes
                if pattern not in ['_']]
    infixes  = [pattern for pattern in nlp.Defaults.infixes
                if not re.search(pattern, 'xx-xx')]

    return Tokenizer(vocab          = nlp.vocab, 
                     rules          = nlp.Defaults.tokenizer_exceptions,
                     prefix_search  = compile_prefix_regex(prefixes).search,
                     suffix_search  = compile_suffix_regex(suffixes).search,
                     infix_finditer = compile_infix_regex(infixes).finditer,
                     token_match    = nlp.Defaults.token_match)

nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = custom_tokenizer(nlp)

doc = nlp(text)
for token in doc:
    print(token, end="|")

@Pete|:|choose|low-carb|#food|#eat-smart|.|_url_|;-)|😋|👍|

In [108]:
# Normalize tokens with a dict
token_map = { 
             'NLP':'Natural Language Processing',
             'AI': 'Artificial Intelligence'}

def token_normalizer(tokens):
    return [token_map.get(t, t) for t in tokens]


tokens = "NLP is a branch of AI".split()
tokens = token_normalizer(tokens)

print(*tokens, sep='|')

Natural Language Processing|is|a|branch|of|Artificial Intelligence


# **Stop Words**

In [23]:
nlp = spacy.load('en_core_web_sm') 
text="I love NLP, it is a branch of Artificial Intelligence"
doc = nlp(text)

not_stopwords = [t for t in doc if not t.is_stop and not t.is_punct]
print(not_stopwords)

[love, NLP, branch, Artificial, Intelligence]


Exclude or include words as  STOP words

In [24]:
## Set any word as STOP word or NOT-STOP words
nlp.vocab['Computer'].is_stop = True
nlp.vocab['it'].is_stop = False

In [25]:
# now computer is a stopword
doc = nlp("It is Computer")
stopwords = [t for t in doc if  t.is_stop ]
print(stopwords)

[It, is, Computer]


# **Part of Speech (POS)**

In [51]:
text="I love NLP, it is a branch of Artificial Intelligence"

doc = nlp(text)
for token in doc:
  print(token.text , token.pos_  , end="| " ,  sep=' => ')

I => PRON| love => VERB| NLP => PROPN| , => PUNCT| it => PRON| is => AUX| a => DET| branch => NOUN| of => ADP| Artificial => PROPN| Intelligence => PROPN| 

In [52]:
## only NOUNS
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

[NLP, branch, Artificial, Intelligence]


In [57]:
## method showing how to extract tokens for adjectives and nouns from the sample sentence

import textacy
doc=nlp( "My best friend likes adventure games.")

tokens = textacy.extract.words(doc, 
            filter_stops = True,           # default True, no stopwords
            filter_punct = True,           # default True, no punctuation
            filter_nums = True,            # default False, no numbers
            include_pos = ['ADJ', 'NOUN'], # default None = include all
            exclude_pos = None,            # default None = exclude none
            min_freq = 1)                  # minimum frequency of words

print([t for t in tokens], sep='|')


[best, friend, adventure, games]


Extracting NOUN Phrase

In [60]:
print(*doc.noun_chunks, sep='|')

My best friend|adventure games


# **Named Entities Recognition**


In [92]:
text = "Mukesh Ambani, chairman of Reliance Industries, lives in Mumbai."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Mukesh Ambani, PERSON) (Reliance Industries, ORG) (Mumbai, GPE) 

In [93]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

In [94]:
def extract_entities(doc, include_types=None, sep='_'):

    ents = textacy.extract.entities(doc, 
             include_types=include_types, 
             exclude_types=None, 
             drop_determiners=True, 
             min_freq=1)
    
    return [sep.join([t.lemma_ for t in e])+'/'+e.label_ for e in ents]

In [95]:
print(extract_entities(doc, ['PERSON', 'GPE']))

['Mukesh_Ambani/PERSON', 'Mumbai/GPE']


In [96]:
def extract_noun_chunks(doc):
  return doc.noun_chunks

print(*extract_noun_chunks(doc), sep='|')

Mukesh Ambani|chairman|Reliance Industries|Mumbai


In [97]:
def extract_lemmas(doc, **kwargs):
    return [t.lemma_ for t in textacy.extract.words(doc, **kwargs)]

lemmas = extract_lemmas(doc, include_pos=['ADJ', 'NOUN'])
print(*lemmas, sep='|')

chairman


In [98]:
def extract_nlp(doc):
    return {
    'lemmas'          : extract_lemmas(doc, 
                                     exclude_pos = ['PART', 'PUNCT', 
                                        'DET', 'PRON', 'SYM', 'SPACE'],
                                     filter_stops = False),
    'adjs_verbs'      : extract_lemmas(doc, include_pos = ['ADJ', 'VERB']),
    'nouns'           : extract_lemmas(doc, include_pos = ['NOUN', 'PROPN']),
    'noun_phrases'    : extract_noun_chunks(doc),
    'adj_noun_phrases': extract_noun_chunks(doc),
    'entities'        : extract_entities(doc, ['PERSON', 'ORG', 'GPE', 'LOC'])
    }

In [99]:
text = "Mukesh Ambani, chairman of Reliance Industries, lives in Mumbai."
doc = nlp(text)
for col, values in extract_nlp(doc).items():
    print(f"{col}: {values}")

lemmas: ['Mukesh', 'Ambani', 'chairman', 'of', 'Reliance', 'Industries', 'live', 'in', 'Mumbai']
adjs_verbs: ['live']
nouns: ['Mukesh', 'Ambani', 'chairman', 'Reliance', 'Industries', 'Mumbai']
noun_phrases: <generator object at 0x7fa3436eab90>
adj_noun_phrases: <generator object at 0x7fa3436ea690>
entities: ['Mukesh_Ambani/PERSON', 'Reliance_Industries/ORG', 'Mumbai/GPE']


In [100]:
nlp_columns = list(extract_nlp(nlp.make_doc('')).keys())
print(nlp_columns)

['lemmas', 'adjs_verbs', 'nouns', 'noun_phrases', 'adj_noun_phrases', 'entities']


# **Database Connection Sample**

In [None]:
import sqlite3 
import pandas as pd

db_name = "mydb.db"
con = sqlite3.connect(db_name)
df = pd.read_sql("select * from tab", con)
con.close()

df['text'] = df['title'] + ': ' + df['text']