# Natural Language Processing

Notebook ini merupakan bagian dari buku **Fundamental Machine Learning menggunakan Python** oleh **Fahmi Noor Fiqri**. Notebook ini berisi contoh kode untuk **Bab 12 - Natural Language Processing.**

In [1]:
import re

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download every NLTK resource this notebook relies on.
# Recent NLTK releases look up the tokenizer / tagger / chunker models under
# new ids ("punkt_tab", "averaged_perceptron_tagger_eng",
# "maxent_ne_chunker_tab"). Both the legacy and the new ids are requested so
# the notebook runs after "Restart & Run All" on either generation of NLTK.
nltk.download([
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "universal_tagset",
    "maxent_ne_chunker",
    "maxent_ne_chunker_tab",
    "words",
    "snowball_data",
])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_

True

## Regex

In [3]:
# Pattern matching: extract every @mention, i.e. an "@" followed by one or
# more non-whitespace characters.
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being treated as an (invalid) Python string escape sequence.
re.findall(r'@[^\s]+', 'Follow Instagram dan Twitter saya, @fahminoorfiqri')

['@fahminoorfiqri']

In [4]:
# Text replacement: every non-overlapping run of exactly two digits is
# replaced with "01" (so "223" becomes "013" and "3234" becomes "0101").
pola_dua_digit = re.compile('[0-9]{2}')
pola_dua_digit.sub('01', 'Nomor rumah saya 10 di jalan 223 dan 3234')

'Nomor rumah saya 01 di jalan 013 dan 0101'

In [5]:
# Splitting text: break the sentence apart on whitespace.
# Raw string so "\s" is not parsed as a Python escape sequence; "+" makes a
# run of consecutive whitespace count as a single separator, so doubled
# spaces do not produce empty strings in the result.
re.split(r'\s+', 'Nama saya Fahmi')

['Nama', 'saya', 'Fahmi']

## Case Folding

In [6]:
# Case folding: print the text fully upper-cased, then fully lower-cased
teks = "saya SUKA buah"
print(teks.upper(), teks.lower(), sep="\n")

SAYA SUKA BUAH
saya suka buah


## Tokenization

In [7]:
# Sentence tokenization: split a document into its individual sentences
dokumen = "There's a lot of bats about five feet from that cave. Bats, so many bats."
list_kalimat = nltk.sent_tokenize(dokumen)  # top-level alias of nltk.tokenize.sent_tokenize
print(list_kalimat)

["There's a lot of bats about five feet from that cave.", 'Bats, so many bats.']


In [8]:
# Word tokenization: split one sentence into word-level tokens
# (the contraction "There's" comes out as the two tokens 'There' and "'s")
kalimat = "There's a lot of bats about five feet from that cave"
list_kata = nltk.word_tokenize(kalimat)  # top-level alias of nltk.tokenize.word_tokenize
print(list_kata)

['There', "'s", 'a', 'lot', 'of', 'bats', 'about', 'five', 'feet', 'from', 'that', 'cave']


## Stopword Removal

In [9]:
# Load NLTK's built-in English stopword list and show it
korpus_stopword = nltk.corpus.stopwords
list_stopwords = korpus_stopword.words('english')
print(list_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
# Stopword removal: tokenize the sentence, then drop every token that appears
# in the English stopword list.
# NOTE: the membership test is case-sensitive and the stopword entries are
# lower-case, so the capitalized token 'There' is kept in the result.
kalimat = "There's a lot of bats about five feet from that cave"
list_kata = nltk.tokenize.word_tokenize(kalimat)
list_kata = [kata for kata in list_kata if kata not in list_stopwords]
print(list_kata)

['There', "'s", 'lot', 'bats', 'five', 'feet', 'cave']


In [11]:
# Punctuation removal: keep only tokens made up entirely of letters/digits,
# which discards punctuation-bearing tokens such as "'s"
list_kata = [kata for kata in list_kata if kata.isalnum()]
print(list_kata)

['There', 'lot', 'bats', 'five', 'feet', 'cave']


## Stemming

In [12]:
# Stemming with the Snowball English stemmer (the revised Porter algorithm,
# often called "Porter2") -- not the original Porter stemmer.
# The stemmer also lower-cases its input, e.g. 'There' -> 'there'.
stemmer = nltk.stem.SnowballStemmer(language="english")
list_kata_stem = [stemmer.stem(kata) for kata in list_kata]
print(list_kata_stem)

['there', 'lot', 'bat', 'five', 'feet', 'cave']


## Lemmatization

In [13]:
# Lemmatization: map each token to its WordNet lemma using the lemmatizer's
# default part of speech (e.g. 'bats' -> 'bat', 'feet' -> 'foot')
lemmatizer = nltk.stem.WordNetLemmatizer()
list_kata_lemma = list(map(lemmatizer.lemmatize, list_kata))
print(list_kata_lemma)

['There', 'lot', 'bat', 'five', 'foot', 'cave']


## Part of Speech (POS) Tagging

In [14]:
# POS tagging with NLTK's default (averaged perceptron) tagger; the result is
# a list of (token, tag) pairs shown as the cell output
tokens = nltk.word_tokenize("Fahmi has a new computer")
nltk.pos_tag(tokens)

[('Fahmi', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('computer', 'NN')]

In [15]:
# POS tagging again, this time mapping the tags onto the coarse universal
# tagset (NOUN, VERB, DET, ADJ, ...)
tokens = nltk.word_tokenize("Fahmi has a new computer")
nltk.pos_tag(tokens, tagset="universal")

[('Fahmi', 'NOUN'),
 ('has', 'VERB'),
 ('a', 'DET'),
 ('new', 'ADJ'),
 ('computer', 'NOUN')]

In [16]:
# Chunking: group POS-tagged tokens into noun phrases (NP) and show the
# resulting tree
tokenize = nltk.tokenize.word_tokenize("The quick brown fox jumps over the lazy dog")
tagged = nltk.pos_tag(tokenize)

# NP = optional determiner, any number of adjectives, then a singular noun
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)

chunked = cp.parse(tagged)
# Render the tree as text in the cell output. Tree.draw() would open a
# separate Tk window instead, which blocks "Run All" until it is closed and
# fails on kernels without a display.
chunked.pretty_print()

## Count Vectorizer

In [17]:
# Unigram count vectorizer: one column per distinct word, cell = occurrence count
vectorizer = CountVectorizer(ngram_range=(1,1))

document = ["Fahmi has a new lovely computer", "My cat stuck on a tree"]
doc_transformed = vectorizer.fit_transform(document)

print(doc_transformed.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement (requires scikit-learn >= 1.0). It returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

[[0 1 1 1 1 0 1 0 0 0]
 [1 0 0 0 0 1 0 1 1 1]]
['cat', 'computer', 'fahmi', 'has', 'lovely', 'my', 'new', 'on', 'stuck', 'tree']


In [18]:
# Bigram count vectorizer: one column per pair of adjacent word tokens.
# The single-letter word "a" does not appear in the vocabulary, which is why
# pairs such as 'has new' and 'on tree' show up as bigrams.
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words=None, analyzer='word')

document = ["Fahmi has a new lovely computer", "My cat stuck on a tree"]
doc_transformed = vectorizer.fit_transform(document)

print(doc_transformed.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement (requires scikit-learn >= 1.0). It returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

[[0 1 1 1 0 1 0 0]
 [1 0 0 0 1 0 1 1]]
['cat stuck', 'fahmi has', 'has new', 'lovely computer', 'my cat', 'new lovely', 'on tree', 'stuck on']


## TF-IDF Vectorizer

In [19]:
# Unigram TF-IDF vectorizer: same vocabulary idea as the count vectorizer, but
# each cell holds a TF-IDF weight instead of a raw count
vectorizer = TfidfVectorizer(ngram_range=(1,1))

document = ["Fahmi has a new lovely computer", "My cat stuck on a tree above"]
doc_transformed = vectorizer.fit_transform(document)

print(doc_transformed.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement (requires scikit-learn >= 1.0). It returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

[[0.         0.         0.4472136  0.4472136  0.4472136  0.4472136
  0.         0.4472136  0.         0.         0.        ]
 [0.40824829 0.40824829 0.         0.         0.         0.
  0.40824829 0.         0.40824829 0.40824829 0.40824829]]
['above', 'cat', 'computer', 'fahmi', 'has', 'lovely', 'my', 'new', 'on', 'stuck', 'tree']
