In [1]:
import nltk

In [5]:
# Download NLTK's 'punkt_tab' tokenizer data package.
# NOTE(review): presumably required by the tokenizers used later in this notebook — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [19]:
# Sample sentence containing two contractions, used to compare the tokenizers.
text = "I can't understand why you wouldn't believe me!"

In [18]:
# Word-level tokenization.
# Use word_tokenize when negation has to be captured accurately.

word_tokenize(text)

['I',
 'ca',
 "n't",
 'understand',
 'why',
 'you',
 'would',
 "n't",
 'believe',
 'me',
 '!']

In [None]:
# A simple splitting scheme.
# For in-depth analysis, word_tokenize() is used more often.
# Advantage: fast, and the result is easy to read.
punct_tokenizer = WordPunctTokenizer()
punct_tokenizer.tokenize(text)

['I',
 'can',
 "'",
 't',
 'understand',
 'why',
 'you',
 'wouldn',
 "'",
 't',
 'believe',
 'me',
 '!']

In [20]:
# Hyphenated words are kept as a single token.
# Contractions such as "doesn't" are split into the word and its contracted part.
tree_tokenizer = TreebankWordTokenizer()
tree_tokenizer.tokenize(text)

['I',
 'ca',
 "n't",
 'understand',
 'why',
 'you',
 'would',
 "n't",
 'believe',
 'me',
 '!']

In [21]:
# Multi-sentence sample (contractions, hyphenated words, an abbreviation, a currency
# amount and a signed decimal). NOTE: rebinds `text`, replacing the earlier one-sentence sample.
text = """I can't believe it's already 2025! She said, This is a state-of-the-art technology. 
The well-known scientist doesn't agree with the theory. However, they've made significant progress. 
Dr. Smith earned $50,000 last year. The temperature was -5.5°C yesterday."""

In [None]:
# Splits the text into sentences using built-in rules rather than cutting at every period.

nltk.sent_tokenize(text)

["I can't believe it's already 2025!",
 'She said, This is a state-of-the-art technology.',
 "The well-known scientist doesn't agree with the theory.",
 "However, they've made significant progress.",
 'Dr. Smith earned $50,000 last year.',
 'The temperature was -5.5°C yesterday.']

In [None]:
# Stopwords: download the NLTK stopword corpus and load the English list as a set.

from nltk.corpus import stopwords

nltk.download('stopwords')

# A set gives fast membership tests in the filtering step.
stop = set(stopwords.words('english'))
print(stop)

{'just', 'ain', 'where', 'ma', 'weren', 'doing', 'their', 'hasn', 'have', 'then', 'we', 'most', 'a', 'themselves', 'for', 'by', "weren't", 'hadn', 'is', 'below', 'haven', 'while', 'does', 'this', 'isn', "aren't", 'who', 'y', 'because', 'the', 'as', 'with', 'which', 'its', 'o', "hasn't", 'mightn', 'did', 'off', "they've", "should've", 'nor', "we've", 'were', "they'll", 'it', "it's", 'than', 'no', 'out', 'above', "hadn't", 'wouldn', "they'd", "you're", 'aren', 'are', "i'd", 'needn', 'or', "won't", "i'll", "she'll", 'hers', 'more', 'd', 'once', 'don', 'do', 'will', 'why', 'am', "i'm", 'such', 'through', 'shan', 'what', "wouldn't", 'herself', 'again', 'how', "isn't", 'to', 'very', 'should', "we'd", 'here', "wasn't", 'now', "she'd", "it'd", 'up', 'himself', 'being', 'these', 'between', 's', 'of', 'theirs', 'myself', 'but', "haven't", "he'll", 'couldn', 'under', 'further', "you've", 'him', 'she', "we're", 'me', 'our', 'own', 'any', 'so', 'before', "he's", "they're", "she's", 'didn', 'having'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# NLTK's stopwords are all lowercase, so each token must be lowercased before the lookup.
# A token is kept only when its lowercase form is longer than one character
# and that lowercase form is not found in `stop`.
# Stopwords are therefore dropped; every other token is appended to clean_token
# with its original casing.

sen = 'I want to go to shopping and a I want to buy some of snack'

tokens = word_tokenize(sen)

clean_token = []
for tok in tokens:
    lowered = tok.lower()
    # Skip single-character tokens and stopwords.
    if len(lowered) <= 1 or lowered in stop:
        continue
    clean_token.append(tok)

# First line prints every token (stopwords still present); second prints the filtered list.
print("불용어 : ", tokens)
print("불용어 미포함 : ", clean_token)

불용어 :  ['I', 'want', 'to', 'go', 'to', 'shopping', 'and', 'a', 'I', 'want', 'to', 'buy', 'some', 'of', 'snack']
불용어 미포함 :  ['want', 'go', 'shopping', 'want', 'buy', 'snack']


In [None]:
# Stemming, one of the normalization techniques.
# It is a rule-based algorithm, so words it has no matching rule for can be handled incorrectly and the result may be inaccurate.
# To normalize, tokenize into words with word_tokenize() and then apply stemming to each word.

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


In [None]:

# Porter stemmer instance.
s = PorterStemmer()
# Multi-sentence sample paragraph to be tokenized into words.
text = """I can't believe it's already 2025! She said, This is a state-of-the-art technology. 
The well-known scientist doesn't agree with the theory. However, they've made significant progress. 
Dr. Smith earned $50,000 last year. The temperature was -5.5°C yesterday."""

# Tokenize into words and show the token list.
words = word_tokenize(text)
print(words)

['I', 'ca', "n't", 'believe', 'it', "'s", 'already', '2025', '!', 'She', 'said', ',', 'This', 'is', 'a', 'state-of-the-art', 'technology', '.', 'The', 'well-known', 'scientist', 'does', "n't", 'agree', 'with', 'the', 'theory', '.', 'However', ',', 'they', "'ve", 'made', 'significant', 'progress', '.', 'Dr.', 'Smith', 'earned', '$', '50,000', 'last', 'year', '.', 'The', 'temperature', 'was', '-5.5°C', 'yesterday', '.']


In [31]:
# Print each token next to the stem produced by the stemmer `s`.
for i in words:
    print(f"{i} => {s.stem(i)}")

I => i
ca => ca
n't => n't
believe => believ
it => it
's => 's
already => alreadi
2025 => 2025
! => !
She => she
said => said
, => ,
This => thi
is => is
a => a
state-of-the-art => state-of-the-art
technology => technolog
. => .
The => the
well-known => well-known
scientist => scientist
does => doe
n't => n't
agree => agre
with => with
the => the
theory => theori
. => .
However => howev
, => ,
they => they
've => 've
made => made
significant => signific
progress => progress
. => .
Dr. => dr.
Smith => smith
earned => earn
$ => $
50,000 => 50,000
last => last
year => year
. => .
The => the
temperature => temperatur
was => wa
-5.5°C => -5.5°c
yesterday => yesterday
. => .


In [32]:
# Lemmatization, another normalization technique.
# It maps the various inflected forms found in a sentence to the word's headword (lemma).
# In other words, it extracts the base form of a word.


from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


True

In [None]:
# Define the verb forms to lemmatize.
# NOTE: rebinds `words`, replacing the token list built in the stemming cells.

lemmatizer = WordNetLemmatizer()
words = ['walk', 'walked', 'walking', 'eat', 'ate', 'eating']



In [None]:
# lemmatizer.lemmatize(i, pos='v'): loop over the verbs and pass each word together with
# pos='v', which tells the lemmatizer that the word is used as a verb.
# Knowing the part of speech lets the lemmatizer keep that information and return the correct lemma.

for i in words:
    lemma = lemmatizer.lemmatize(i, pos='v')
    print(f"{i} => {lemma}")

walk => walk
walked => walk
walking => walk
eat => eat
ate => eat
eating => eat


In [38]:
# Edit Distance
# Expresses how different two strings are as a number, by treating the difference as a distance.

import nltk
from nltk.metrics import edit_distance



In [43]:
# Print the edit distance between "come" and "he".
print(edit_distance("come", "he"))

3


In [63]:
import urllib
import urllib.request

import matplotlib.pyplot as plt
import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stopword corpus is available before it is loaded.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Fetch the PyTorch landing page, split its visible text on whitespace, drop
# English stopwords and single-character tokens, and plot the 30 most frequent
# remaining tokens.
url = 'https://pytorch.org/'

# `urllib.request` is a submodule and must be imported explicitly (see the import
# cell); `import urllib` alone only works when another library happens to load it.
# The `with` block closes the HTTP response once the page text has been extracted.
# NOTE(review): the names `responce` and `bs4` are kept as-is because cells outside
# this view may reference them.
with urllib.request.urlopen(url=url) as responce:
    bs4 = BeautifulSoup(responce, 'html.parser').get_text()

# str.split() with no arguments splits on any run of whitespace and already
# returns a list, so no element-by-element copy is needed.
tokens = bs4.split()

stop = set(stopwords.words('english'))

# Keep tokens whose lowercase form is longer than one character and is not a
# stopword; the original casing is kept, so counts are case-sensitive.
clean_token = []
for tok in tokens:
    lowered = tok.lower()
    if len(lowered) > 1 and lowered not in stop:
        clean_token.append(tok)

Freq_dist_nltk = nltk.FreqDist(clean_token)
Freq_dist_nltk.plot(30, cumulative=False)

['PyTorch', 'Skip', 'to', 'main', 'content', 'Hit', 'enter', 'to', 'search', 'or', 'ESC', 'to', 'close', 'Search', 'Close', 'Search', 'search', 'Menu', 'Learn', 'Get', 'Started', 'Tutorials', 'Learn', 'the', 'Basics', 'PyTorch', 'Recipes', 'Intro', 'to', 'PyTorch', '–', 'YouTube', 'Series', 'Webinars', 'Community', 'Landscape', 'Join', 'the', 'Ecosystem', 'Community', 'Hub', 'Forums', 'Developer', 'Resources', 'Community', 'Events', 'PyTorch', 'Contributor', 'Awards', 'PyTorch', 'Ambassadors', 'Projects', 'PyTorch', 'vLLM', 'DeepSpeed', 'Host', 'Your', 'Project', 'RAY', 'Docs', 'PyTorch', 'Domains', 'Blog', '&', 'News', 'Blog', 'Announcements', 'Case', 'Studies', 'Events', 'Newsletter', 'About', 'PyTorch', 'Foundation', 'Members', 'Governing', 'Board', 'Technical', 'Advisory', 'Council', 'Cloud', 'Credit', 'Program', 'Staff', 'Contact', 'Brand', 'Guidelines', 'JOIN', 'github', 'search', 'Get', 'Started', 'Choose', 'Your', 'Path:', 'Install', 'PyTorch', 'Locally', 'or', 'Launch', 'Insta