# NLTK Basics

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups

In [2]:
text_data = fetch_20newsgroups()

In [3]:
type(text_data)

sklearn.utils._bunch.Bunch

In [4]:
raw_text = text_data.data[:4]

In [5]:
print(raw_text)

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n", "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 1

In [6]:
clean_text_1 = []

def to_lower_case(data):
    for words in data:
        clean_text_1.append(str.lower(words))

In [7]:
to_lower_case(raw_text)

In [8]:
clean_text_1[0]

"from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?\nnntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day. it was a 2-door sports car, looked to be from the late 60s/\nearly 70s. it was called a bricklin. the doors were really small. in addition,\nthe front bumper was separate from the rest of the body. this is \nall i know. if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nthanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----\n\n\n\n\n"

### sent_tokenize, word_tokenize

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fross\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### sentence tokenization - sent_tokenize

In [11]:
sent_tok = []
for sent in clean_text_1:
    sent = sent_tokenize(sent)
    sent_tok.append(sent)
    
sent_tok

[["from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?",
  'nntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day.',
  'it was a 2-door sports car, looked to be from the late 60s/\nearly 70s.',
  'it was called a bricklin.',
  'the doors were really small.',
  'in addition,\nthe front bumper was separate from the rest of the body.',
  'this is \nall i know.',
  'if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
  'thanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----'],
 ['from: guykuo@carson.u.washington.edu (guy kuo)\nsubject: si clock poll - final call\nsummary: final call for si clock reports\nkeywords: si,acceleration,clock,upgrade\narticle-i.d.',
  ': shelley.1qvfo9innc3s\norganization

### word tokenization - word_tokenize

In [12]:
clean_text_2 = [word_tokenize(_) for _ in clean_text_1]
clean_text_2

[['from',
  ':',
  'lerxst',
  '@',
  'wam.umd.edu',
  '(',
  'where',
  "'s",
  'my',
  'thing',
  ')',
  'subject',
  ':',
  'what',
  'car',
  'is',
  'this',
  '!',
  '?',
  'nntp-posting-host',
  ':',
  'rac3.wam.umd.edu',
  'organization',
  ':',
  'university',
  'of',
  'maryland',
  ',',
  'college',
  'park',
  'lines',
  ':',
  '15',
  'i',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'i',
  'saw',
  'the',
  'other',
  'day',
  '.',
  'it',
  'was',
  'a',
  '2-door',
  'sports',
  'car',
  ',',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  '60s/',
  'early',
  '70s',
  '.',
  'it',
  'was',
  'called',
  'a',
  'bricklin',
  '.',
  'the',
  'doors',
  'were',
  'really',
  'small',
  '.',
  'in',
  'addition',
  ',',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  '.',
  'this',
  'is',
  'all',
  'i',
  'know',
  '

In [13]:
sum([len(_) for _ in clean_text_2])

901

### removing symbols

In [14]:
import re

In [15]:
clean_text_3 = []

for words in clean_text_2:
    clean = []
    for w in words:
        res = re.sub(r'[^\w\s]', '', w)
        if res != '':
            clean.append(res)
    clean_text_3.append(clean)

In [16]:
clean_text_3

[['from',
  'lerxst',
  'wamumdedu',
  'where',
  's',
  'my',
  'thing',
  'subject',
  'what',
  'car',
  'is',
  'this',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'of',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'i',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'i',
  'saw',
  'the',
  'other',
  'day',
  'it',
  'was',
  'a',
  '2door',
  'sports',
  'car',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  '60s',
  'early',
  '70s',
  'it',
  'was',
  'called',
  'a',
  'bricklin',
  'the',
  'doors',
  'were',
  'really',
  'small',
  'in',
  'addition',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  'this',
  'is',
  'all',
  'i',
  'know',
  'if',
  'anyone',
  'can',
  'tellme',
  'a',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'of',
  'production',
  'where',
 

In [17]:
sum([len(_) for _ in clean_text_3])

700

### removing stopwords

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fross\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords

In [34]:
clean_text_4 = []

for words in clean_text_3:
    w = []
    for word in words:
        if not word in stopwords.words('english'):
            w.append(word)
    clean_text_4.append(w)

In [35]:
clean_text_4

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sports',
  'car',
  'looked',
  'late',
  '60s',
  'early',
  '70s',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summary',
  'final',
  'call',
  'si',
  'clock',
  'reports',
  'keywords',
  'si',
  'acceleration',
  'clock',

In [36]:
sum([len(_) for _ in clean_text_4])

415

### stemming

In [37]:
from nltk.stem.porter import PorterStemmer

In [38]:
port = PorterStemmer()

In [39]:
[port.stem(_) for _ in ['reading', 'washing', 'wash', 'Driving']]  # demonstration of how port.stem works

['read', 'wash', 'wash', 'drive']

In [40]:
clean_text_5 = []

for words in clean_text_4:
    w = []
    for word in words:
        w.append(word)
    clean_text_5.append(w)

In [41]:
clean_text_5

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sports',
  'car',
  'looked',
  'late',
  '60s',
  'early',
  '70s',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summary',
  'final',
  'call',
  'si',
  'clock',
  'reports',
  'keywords',
  'si',
  'acceleration',
  'clock',

In [42]:
sum([len(_) for _ in clean_text_5])

415

### lemmatization

In [43]:
from nltk.stem.wordnet import WordNetLemmatizer

In [44]:
wnet = WordNetLemmatizer()

In [45]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fross\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
lem = []

for words in clean_text_4:
    w = []
    for word in words:
        w.append(wnet.lemmatize(word))
    lem.append(w)

In [47]:
lem

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'line',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sport',
  'car',
  'looked',
  'late',
  '60',
  'early',
  '70',
  'called',
  'bricklin',
  'door',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'spec',
  'year',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summary',
  'final',
  'call',
  'si',
  'clock',
  'report',
  'keywords',
  'si',
  'acceleration',
  'clock',
  'upgr

In [48]:
sum([len(_) for _ in lem])

415