### Capturing Text Data

In [1]:
import os

# Read in a plain text file.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (the printed content is plain ASCII) — confirm.
with open(os.path.join("Data", "hieroglyph.txt"), "r", encoding="utf-8") as f:
    text = f.read()

# Printing after the file is closed; the content is already in memory.
print(text)

Hieroglyphic writing dates from c. 3000 BC, and is composed of hundreds of symbols. A hieroglyph can represent a word, a sound, or a silent determinative; and the same symbol can serve different purposes in different contexts. Hieroglyphs were a formal script, used on stone monuments and in tombs, that could be as detailed as individual works of art.



### Tabular Data

In [2]:
import pandas as pd

# Load the news dataset, then preview publisher and title for the first rows
news_path = os.path.join("Data", "news.csv")
df = pd.read_csv(news_path)
df[['publisher', 'title']].head()

Unnamed: 0,publisher,title
0,Livemint,Fed's Charles Plosser sees high bar for change...
1,IFA Magazine,US open: Stocks fall after Fed official hints ...
2,IFA Magazine,"Fed risks falling 'behind the curve', Charles ..."
3,Moneynews,Fed's Plosser: Nasty Weather Has Curbed Job Gr...
4,NASDAQ,Plosser: Fed May Have to Accelerate Tapering Pace


In [3]:
# Lowercase the title column in place, then preview the same two columns
title_lower = df['title'].str.lower()
df['title'] = title_lower
df[['publisher', 'title']].head()

Unnamed: 0,publisher,title
0,Livemint,fed's charles plosser sees high bar for change...
1,IFA Magazine,us open: stocks fall after fed official hints ...
2,IFA Magazine,"fed risks falling 'behind the curve', charles ..."
3,Moneynews,fed's plosser: nasty weather has curbed job gr...
4,NASDAQ,plosser: fed may have to accelerate tapering pace


### Online Resource

In [4]:
import requests
import json

# Fetch data from a REST API.
# timeout: without it the request can wait indefinitely on an unresponsive server.
# raise_for_status(): surface HTTP errors here instead of as a confusing
# JSON decoding / KeyError failure in the cells that read `res`.
r = requests.get("https://quotes.rest/qod.json", timeout=10)
r.raise_for_status()
res = r.json()
print(json.dumps(res, indent=4))

{
    "success": {
        "total": 1
    },
    "contents": {
        "quotes": [
            {
                "quote": "The human mind and body are truly extraordinary. They are the quintessence of excellence in motion. We talk, touch, see, hear, taste, smell, and feel. We dream, aspire, and become. All that we are is mind and body and spirit-that is our universe.",
                "length": "253",
                "author": "Lorii Myers",
                "tags": [
                    "human-body",
                    "inspire",
                    "life",
                    "mind"
                ],
                "category": "inspire",
                "language": "en",
                "date": "2020-11-28",
                "permalink": "https://theysaidso.com/quote/lorii-myers-the-human-mind-and-body-are-truly-extraordinary-they-are-the-quintes",
                "id": "dZWOGcDFcNB_jy_XhjmnDweF",
                "background": "https://theysaidso.com/img/qod/qod-inspire.jpg",
      

In [5]:
# Take the first quote object from the response and show its text and author
quotes = res["contents"]["quotes"]
q = quotes[0]
print(q["quote"], "\n--", q["author"])

The human mind and body are truly extraordinary. They are the quintessence of excellence in motion. We talk, touch, see, hear, taste, smell, and feel. We dream, aspire, and become. All that we are is mind and body and spirit-that is our universe. 
-- Lorii Myers


### Cleaning

In [6]:
import requests

# Download the Hacker News front page as raw HTML.
# timeout: avoid hanging forever on a stalled connection.
# raise_for_status(): fail fast on an HTTP error rather than "cleaning" an
# error page in the cells below.
r = requests.get('https://news.ycombinator.com', timeout=10)
r.raise_for_status()
print(r.text)

<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?RLwg8iqc2tR5TuQByAQO">
        <link rel="shortcut icon" href="favicon.ico">
          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
              <a href="newest">new</a> | <a href="front">past</a> | <a href=

In [7]:
import re

# Naive tag stripping: delete every non-greedy "<...>" match.
# HTML entities are left untouched (the output still shows "&nbsp;").
pattern = re.compile(r'<.*?>')
stripped = pattern.sub('', r.text)
print(stripped)


        
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      Tony Hsieh has died (yahoo.com)
        782 points by MrMcDowall 10 hours ago  | hide | 178&nbsp;comments              
      
                
      2.      MODNet: Is a Green Screen Necessary for Real-Time Human Matting? (github.com/zhkkke)
        16 points by homarp 40 minutes ago  | hide | 4&nbsp;comments              
      
                
      3.      TSMC: SoIC (tsmc.com)
        59 points by blopeur 4 hours ago  | hide | 10&nbsp;comments              
      
                
      4.      Is Probability Real? (arameb.com)
        77 points by EbTech 2 hours ago  | hide | 52&nbsp;comments              
      
                
      5.      Datasette-ripgrep: a regular expression search engine for your source code (

In [8]:
from bs4 import BeautifulSoup

# Parse the downloaded page with the html5lib backend and print only its text
page_html = r.text
soup = BeautifulSoup(page_html, "html5lib")
print(soup.get_text())


        
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      Tony Hsieh has died (yahoo.com)
        782 points by MrMcDowall 10 hours ago  | hide | 178 comments              
      
                
      2.      MODNet: Is a Green Screen Necessary for Real-Time Human Matting? (github.com/zhkkke)
        16 points by homarp 40 minutes ago  | hide | 4 comments              
      
                
      3.      TSMC: SoIC (tsmc.com)
        59 points by blopeur 4 hours ago  | hide | 10 comments              
      
                
      4.      Is Probability Real? (arameb.com)
        77 points by EbTech 2 hours ago  | hide | 52 comments              
      
                
      5.      Datasette-ripgrep: a regular expression search engine for your source code (simonwillison.net)
 

In [9]:
# Collect every <tr> row whose class is "athing" and inspect the first one
summaries = soup.find_all("tr", attrs={"class": "athing"})
summaries[0]

<tr class="athing" id="25235490">
      <td align="right" class="title" valign="top"><span class="rank">1.</span></td>      <td class="votelinks" valign="top"><center><a href="vote?id=25235490&amp;how=up&amp;goto=news" id="up_25235490"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://www.yahoo.com/lifestyle/tony-hsieh-zappos-luminary-revolutionized-045239863.html">Tony Hsieh has died</a><span class="sitebit comhead"> (<a href="from?site=yahoo.com"><span class="sitestr">yahoo.com</span></a>)</span></td></tr>

In [10]:
# Title text of the first story: the row's "storylink" anchor, whitespace-trimmed
first_link = summaries[0].find("a", class_="storylink")
first_link.get_text().strip()

'Tony Hsieh has died'

In [11]:
# Find all article rows and extract their titles.
# find() returns None when a row has no "storylink" anchor, so such rows are
# skipped instead of crashing with AttributeError on .get_text().
summaries = soup.find_all("tr", class_="athing")
story_links = (summary.find("a", class_="storylink") for summary in summaries)
articles = [link.get_text().strip() for link in story_links if link is not None]

print(len(articles), "Article summaries found. Sample:")
# Only show a sample when there is one; an empty list would raise IndexError.
if articles:
    print(articles[0])

30 Article summaries found. Sample:
Tony Hsieh has died


### Case Normalization

In [12]:
# Sample text for the normalization examples (adjacent literals concatenate
# into one string)
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)
print(text)

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?


In [13]:
# Case normalization: fold the whole string to lowercase
text = str.lower(text)
print(text)

the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?


### Punctuation Removal

In [14]:
# Way_01: replace every character that is not an ASCII letter or digit with a space
non_alnum = re.compile(r"[^a-zA-Z0-9]")
text = non_alnum.sub(" ", text)
print(text)

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  


In [15]:
# Way_02: delete every character listed in string.punctuation.
# `text` already went through Way_01 above, so there is no punctuation left
# to remove here and the printed result is unchanged.
import string
text = text.translate(str.maketrans("", "", string.punctuation))
print(text)

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  


### Tokenization

In [16]:
# Whitespace tokenization: split() with no separator treats runs of spaces as
# one delimiter, so the double spaces left by punctuation removal yield no
# empty tokens
words = str.split(text)
print(words)

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


In [17]:
# Show the kernel's current working directory; the relative "Data" paths used
# when loading files earlier resolve against it.
os.getcwd()

'D:\\Project\\Data Scientist with Python\\31_NLP_with_Python'

In [18]:
# A second sample containing an abbreviation ("Dr.") and a comma, used by the
# NLTK tokenizer examples
text = (
    "Dr. Smith graduated from the University of Washington. "
    "He later started an analytics firm called Lux, "
    "which catered to enterprise customers."
)
print(text)

Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.


In [19]:
from nltk.tokenize import word_tokenize

# Split the sample into word tokens with NLTK. As the output shows, sentence
# punctuation and the comma become separate tokens while "Dr." stays intact.
words = word_tokenize(text)
print(words)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']


In [20]:
from nltk.tokenize import sent_tokenize

# Split the sample into sentences with NLTK. As the output shows, the period
# in "Dr." is not treated as a sentence boundary.
sentences = sent_tokenize(text)
print(sentences)

['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']


In [21]:
from nltk.corpus import stopwords
# List the English stopwords shipped with NLTK
english_stopwords = stopwords.words('english')
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
# Start again from the raw sample text
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)

# Normalize: lowercase first, then turn every non-alphanumeric character into a space
lowered = text.lower()
text = re.sub(r'[^a-zA-Z0-9]', " ", lowered)

# Tokenize on whitespace
words = text.split()
print(words)

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


In [23]:
# Remove stopwords.
# The stopword list is loaded once and held in a set. The original
# comprehension called stopwords.words('english') again for every token and
# searched the returned list linearly each time.
english_stopwords = set(stopwords.words('english'))
words = [word for word in words if word not in english_stopwords]
print(words)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']


### Sentence Parsing

In [24]:
import nltk
# Define a custom grammar
# Each line of the string is one production rule; quoted items are the literal
# words the grammar accepts. A PP can attach either to an NP ("Det N PP") or
# to a VP ("VP PP"), which is why the sentence below gets two parse trees.
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
# Build a chart parser over the grammar; `parser` is reused by the
# visualization code further down.
parser = nltk.ChartParser(my_grammar)

# Tokenize the sentence and print every parse tree the grammar allows
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


### Stemming and Lemmatization

In [25]:
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems.
# One PorterStemmer is created up front and reused; the original constructed
# a new stemmer object for every word in the list.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['first', 'time', 'see', 'second', 'renaiss', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definit', 'watch', 'part', '2', 'chang', 'view', 'matrix', 'human', 'peopl', 'one', 'start', 'war', 'ai', 'bad', 'thing']


In [26]:
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form.
# lemmatize() is called without a pos argument, so it uses the library default.
# One WordNetLemmatizer is created and reused; the original constructed a new
# lemmatizer object for every word.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'started', 'war', 'ai', 'bad', 'thing']


In [27]:
# Lemmatize verbs by specifying pos='v'.
# The input is the existing `lemmed` list, and the result is stored back under
# the same name, so this cell refines the previous lemmatization output.
# One WordNetLemmatizer is created and reused instead of one per word.
verb_lemmatizer = WordNetLemmatizer()
lemmed = [verb_lemmatizer.lemmatize(w, pos='v') for w in lemmed]
print(lemmed)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'start', 'war', 'ai', 'bad', 'thing']


In [28]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Sample sentence pair for part-of-speech tagging
text = (
    "Dr. Smith graduated from the University of Washington. "
    "He later started an analytics firm called Lux, "
    "which catered to enterprise customers."
)

# Tokenize first, then tag each token with its part of speech
sentence = word_tokenize(text)
tagged = pos_tag(sentence)
print(tagged)

[('Dr.', 'NNP'), ('Smith', 'NNP'), ('graduated', 'VBD'), ('from', 'IN'), ('the', 'DT'), ('University', 'NNP'), ('of', 'IN'), ('Washington', 'NNP'), ('.', '.'), ('He', 'PRP'), ('later', 'RB'), ('started', 'VBD'), ('an', 'DT'), ('analytics', 'NNS'), ('firm', 'NN'), ('called', 'VBD'), ('Lux', 'NNP'), (',', ','), ('which', 'WDT'), ('catered', 'VBD'), ('to', 'TO'), ('enterprise', 'VB'), ('customers', 'NNS'), ('.', '.')]


# Visualize Parse Trees
# Relies on `parser`, the chart parser built from the custom grammar in the
# Sentence Parsing section; that cell must have run first.
# NOTE(review): tree.draw() presumably opens a separate GUI window per tree
# and may block until it is closed — confirm it works in the target environment.
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    tree.draw()

from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

# Named-entity chunking pipeline: tokenize -> POS-tag -> chunk.
# The chunked result is the cell's displayed value.
tokens = word_tokenize('Antonio joined Company in Mumbai')
tagged_tokens = pos_tag(tokens)
ne_chunk(tagged_tokens)