In [None]:
"""
자연어처리 : 음성인식, 요약, 번역, 감성분석, 분류, 질의응답, 챗봇 등 
환경구성 : 아나콘다(머신러닝, 시각화, 데이터분석 nltk 등) + 텐서플로우, 젠심, 파이토치, konlpy 등 

1) 텍스트 전처리(토큰화, 정제, 어간추출, 불용어제거, 정수인코딩, 패딩)
2) 텍스트의 수치표현(BoW, DTM/TDM, TF-IDF)
3) 유사도(문서 / 단어 / 문장)
4) 머신/딥러닝 모델 생성 
"""

In [2]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [4]:
from konlpy.tag import Okt

In [5]:
okt=Okt()

In [None]:
#konlpy.org

In [6]:
print(okt.morphs(u'단독입찰보다 복수입찰의 경우'))

['단독', '입찰', '보다', '복수', '입찰', '의', '경우']


In [7]:
print(okt.nouns(u'유일하게 항공기 체계 종합개발 경험을 갖고 있는 KAI는')) # 명사 추출

['항공기', '체계', '종합', '개발', '경험']


In [8]:
print(okt.pos(u'이것도 되나욬ㅋㅋ'))

[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나욬', 'Noun'), ('ㅋㅋ', 'KoreanParticle')]


In [9]:
from konlpy.tag import *

In [10]:
okt=Okt()
han=Hannanum()
kkma=Kkma()

In [11]:
okt.pos('아버지가방에들어가신다')

[('아버지', 'Noun'), ('가방', 'Noun'), ('에', 'Josa'), ('들어가신다', 'Verb')]

In [15]:
han.pos('아버지가방에들어가신다')

[('아버지가방에들어가', 'N'), ('이', 'J'), ('시ㄴ다', 'E')]

In [14]:
kkma.pos('아버지가방에들어가신다')

[('아버지', 'NNG'),
 ('가방', 'NNG'),
 ('에', 'JKM'),
 ('들어가', 'VV'),
 ('시', 'EPH'),
 ('ㄴ다', 'EFN')]

In [17]:
okt.pos("정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.")

[('정부', 'Noun'),
 ('가', 'Josa'),
 ('발표', 'Noun'),
 ('하는', 'Verb'),
 ('물가상승률', 'Noun'),
 ('과', 'Josa'),
 ('소비자', 'Noun'),
 ('가', 'Josa'),
 ('느끼는', 'Verb'),
 ('물가상승률', 'Noun'),
 ('은', 'Josa'),
 ('다르다', 'Adjective'),
 ('.', 'Punctuation')]

In [18]:
han.pos("정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.")

[('정부', 'N'),
 ('가', 'J'),
 ('발표', 'N'),
 ('하', 'X'),
 ('는', 'E'),
 ('물가상승률', 'N'),
 ('과', 'J'),
 ('소비자', 'N'),
 ('가', 'J'),
 ('느끼', 'P'),
 ('는', 'E'),
 ('물가상승률', 'N'),
 ('은', 'J'),
 ('다르', 'P'),
 ('다', 'E'),
 ('.', 'S')]

In [19]:
kkma.pos("정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.")

[('정부', 'NNG'),
 ('가', 'JKS'),
 ('발표', 'NNG'),
 ('하', 'XSV'),
 ('는', 'ETD'),
 ('물가', 'NNG'),
 ('상승률', 'NNG'),
 ('과', 'JC'),
 ('소비자', 'NNG'),
 ('가', 'JKS'),
 ('느끼', 'VV'),
 ('는', 'ETD'),
 ('물가', 'NNG'),
 ('상승률', 'NNG'),
 ('은', 'JX'),
 ('다르', 'VA'),
 ('다', 'EFN'),
 ('.', 'SF')]

In [23]:
from math import log 
#df('바나나')=2 -> 몇개의 문서에서 해당 단어가 나왔나
#tf*idf= 문서에서 각 단어의 중요도를 나타낸 행렬
# ->각 문서에서 중요 단어가 무엇인지 알고자 함 => 키워드 => 토픽모델링
# tfidf=tf*idf
# -> df가 크다 = 단어가 흔한 단어 ( 많은 문서에서 등장 함 -> 많은 문서에서 등장하면 오히려 중요도 떨어짐)
# -> df는 tf랑 역순 관계 임으로 역수를 취해서 tfidf로 적용
#                 N(문서 갯수)
# ->   tf * log ( ----------- ) -> log를 안쓰면 N에따라 idf값이 기하 급수적으로 커져서 log씌어줌 
#                     df+1
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]

In [29]:
vocab=list(set(w for doc in docs for w in doc.split()))   # docs에서 읽어진 정보는 doc으로, doc을 split 해서 w에 넣고 추출

In [30]:
vocab.sort()
vocab   # 코퍼스(말뭉치) : 전체 문서에 등장하는 단어 집합

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [32]:
import pandas as pd

In [33]:
N = len(docs) 

def tf(t, d):
  return d.count(t)

def idf(t):
  df = 0
  for doc in docs:
    df += t in doc          #문서에 단어가 잇으면 +1 해준다
  return log(N/(df+1))

def tfidf(t, d):
  return tf(t,d)* idf(t)

In [34]:
result = []

# 각 문서에 대해서 아래 연산을 반복
for i in range(N):
  result.append([])  #[    []       ]
  d = docs[i]  # '먹고싶은 사과'
  for j in range(len(vocab)):#9번반복
    t = vocab[j] #과일이
    result[-1].append(tf(t, d))

tf_ = pd.DataFrame(result, columns = vocab)

In [35]:
tf_  # tf 행렬 or DTM(문서단어행렬)  ->단어의 빈도수

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [38]:
#tf구하기 위해 호출
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
vec=CountVectorizer()

In [40]:
vec.fit_transform(docs).toarray()    # tf

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [41]:
vec.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [None]:
#tfidf행렬 구하기 위해 호출
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
result = []
for j in range(len(vocab)):  # vocab에는 9개 단어 들어가있음
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_
#idf가 크면 흔치않은 단어 작으면 흔한 단어

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [43]:
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [44]:
result = []
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_
#첫번째 문단 = 사과 가장 중요


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [None]:
"""
0.데이터 전처리
1.각 문서(영화)에서 중요 단어 추출
2.코사인 유사도 기반 가장 유사한 문서(영화) 검색
3.
"""

In [46]:
pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/80/6f/57d36f6507e432d7fc1956b2e9e8530c5c2d2bfcd8821bcbfae271cd6688/tensorflow-2.14.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.14.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.14.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.14.0 from https://files.pythonhosted.org/packages/ad/6e/1bfe367855dd87467564f7bf9fa14f3b17889988e79598bc37bf18f5ffb6/tensorflow_intel-2.14.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.14.0-cp311-cp311-win_amd64.whl.metadata (4.8 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.14.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downloading absl_py-2.0.0-py3-none-any

In [47]:
pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
     ---------------------------------------- 0.0/67.1 kB ? eta -:--:--
     ---------------------------------------- 67.1/67.1 kB 3.6 MB/s eta 0:00:00
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading simpful-2.11.0-py3-none-any.whl (32 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting miniful (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fst-pso, miniful
  Building wheel for fst-pso (setup.py): started
  Building wheel for fs

In [48]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [51]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [49]:
#nltk는 영어로된 단어 퇴큰화
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [None]:
"""
토큰 : 자연어 처리 작업을 수행하는 기본 단위, 일반적으로 단어(문장,문단,문자)가 토큰
토큰화 : 주어진 코퍼스를 토큰 단위로 나누는 작업
자연어 -> 토큰화 -> 세부 작업
"""

In [52]:
print('단어 토큰화1 :',word_tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

단어 토큰화1 : ['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [53]:
#단어 단위로 분류
print('단어 토큰화2 :',WordPunctTokenizer().tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

단어 토큰화2 : ['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [54]:
#소문자로 바뀜, 마침표 제거, ','도 제거..구두점 제거함, 
print('단어 토큰화3 :',text_to_word_sequence("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

단어 토큰화3 : ["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [55]:
#문장 단위로 끊어주고 리스트로 들어감
from nltk.tokenize import sent_tokenize

text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
print('문장 토큰화1 :',sent_tokenize(text))

문장 토큰화1 : ['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


In [56]:
#PH.D.은 마침표로 인식 안함 
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
print('문장 토큰화2 :',sent_tokenize(text))

문장 토큰화2 : ['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


In [57]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

okt = Okt()
kkma = Kkma()

print('OKT 형태소 분석 :',okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('OKT 품사 태깅 :',okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('OKT 명사 추출 :',okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))

OKT 형태소 분석 : ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
OKT 품사 태깅 : [('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
OKT 명사 추출 : ['코딩', '당신', '연휴', '여행']


In [58]:
print('꼬꼬마 형태소 분석 :',kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('꼬꼬마 품사 태깅 :',kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('꼬꼬마 명사 추출 :',kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))

꼬꼬마 형태소 분석 : ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
꼬꼬마 품사 태깅 : [('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
꼬꼬마 명사 추출 : ['코딩', '당신', '연휴', '여행']


In [None]:
#정제, 대소문자통합, 불용어 제거(등장 빈도수가 낮은 단어)

In [60]:
import re
text = "I was wondering if anyone out there could enlighten me on this car."

# 길이가 1~2인 단어들을 정규 표현식을 이용하여 삭제
shortword = re.compile(r'\W*\b\w{1,2}\b')  
print(shortword.sub('', text))
# \W*: \W는 알파벳 문자나 숫자가 아닌 문자(공백, 구두점, 특수 문자 등)를 의미합니다. *는 0회 이상 반복을 나타내므로 \W*는 0회 이상의 알파벳 문자나 숫자가 아닌 문자가 나타날 수 있다는 것을 의미합니다.

#\b: 단어 경계를 나타냅니다. \b는 단어의 시작 또는 끝을 나타냅니다. 이것을 사용하여 정확한 단어 경계를 지정할 수 있습니다.

#\w{1,2}: \w는 알파벳 문자나 숫자를 나타냅니다. {1,2}는 알파벳 문자나 숫자가 1자 이상에서 2자 이하로 나타날 수 있다는 것을 나타냅니다.
#즉, 단어 길이가 1에서 2자인 단어를 찾는 패턴입니다.

 was wondering anyone out there could enlighten this car.


In [None]:
#형태소 = 어간(stem, 단어 중요 의미) + 접사(부가의미)

#dogs => dog + s

In [61]:
import nltk

In [62]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


True

In [63]:
from nltk.stem import WordNetLemmatizer  #Lemmatizer = are, is, am => be -> 표제어 추출 해준다.

lemmatizer = WordNetLemmatizer()

words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

print('표제어 추출 전 :',words)
print('표제어 추출 후 :',[lemmatizer.lemmatize(word) for word in words])

표제어 추출 전 : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
표제어 추출 후 : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [67]:
lemmatizer.lemmatize("is","v") # is 라는 단어의 표제어를 추출하는데 is는 동사다
lemmatizer.lemmatize("are","v")
lemmatizer.lemmatize("watching","v")

'watch'

In [68]:
#어간(Stemming)

In [72]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

from nltk.tokenize import word_tokenize


stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

In [73]:
words = ['formalize', 'allowance', 'electricical']

In [74]:
print('어간 추출 후 :',[stemmer.stem(word) for word in words])

어간 추출 후 : ['formal', 'allow', 'electric']


In [75]:
print('어간 추출 후 :',[lancaster_stemmer.stem(word) for word in words])

어간 추출 후 : ['form', 'allow', 'elect']


In [78]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [79]:
from nltk.corpus import stopwords

In [80]:
stopwords.words('english')          #불용어들 

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [298]:
example = "Family is not an important thing. It's everything."
stop_words = set(stopwords.words('english')) 

word_tokens = word_tokenize(example)

result = []
for word in word_tokens: 
    if word not in stop_words: 
        result.append(word) 

print('불용어 제거 전 :',word_tokens) 
print('불용어 제거 후 :',result)

불용어 제거 전 : ['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
불용어 제거 후 : ['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


In [82]:
example = "고기를 아무렇게나 구우려고 하면 안 돼. 고기라고 다 같은 게 아니거든. 예컨대 삼겹살을 구울 때는 중요한 게 있지."
stop_words = "를 아무렇게나 구 우려 고 안 돼 같은 게 구울 때 는"

stop_words = set(stop_words.split(' '))
word_tokens = okt.morphs(example)

result = [word for word in word_tokens if not word in stop_words]

print('불용어 제거 전 :',word_tokens) 
print('불용어 제거 후 :',result)

불용어 제거 전 : ['고기', '를', '아무렇게나', '구', '우려', '고', '하면', '안', '돼', '.', '고기', '라고', '다', '같은', '게', '아니거든', '.', '예컨대', '삼겹살', '을', '구울', '때', '는', '중요한', '게', '있지', '.']
불용어 제거 후 : ['고기', '하면', '.', '고기', '라고', '다', '아니거든', '.', '예컨대', '삼겹살', '을', '중요한', '있지', '.']


In [189]:
import numpy as np

In [83]:
###영화 overview ###

In [284]:
data=pd.read_csv('archive/movies_metadata.csv')
data

  data=pd.read_csv('archive/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [316]:
docs=data['overview'][:5000]
len(docs)
docs.isna().sum()
#docs.dropna(inplace=True)
docs[0]


"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [286]:
docs.info()

<class 'pandas.core.series.Series'>
RangeIndex: 5000 entries, 0 to 4999
Series name: overview
Non-Null Count  Dtype 
--------------  ----- 
4979 non-null   object
dtypes: object(1)
memory usage: 39.2+ KB


In [318]:
docs=docs.astype(str)
#docs.drop(32)

In [365]:
# result=[]
# for i, example in enumerate(docs):
#     example=docs[i]
    
#     stop_words = set(stopwords.words('english'))
    
#     word_tokens = text_to_word_sequence(example)
    
    
#     for word in word_tokens:
#          if word not in stop_words:
#             result.append(word)
            
# print(result)
# result = []  # 결과를 저장할 빈 리스트

# for i, example in enumerate(docs):
#     example=docs[i]
    
#     stop_words = set(stopwords.words('english'))
#     word_tokens = text_to_word_sequence(example)
    
#     for word in word_tokens:
#         if word not in stop_words:
#             result.append(word)
            
#print(result)

result = []  # 모든 결과를 저장할 빈 리스트

for i,example in enumerate(docs):
    stop_words = set(stopwords.words('english'))
    word_tokens = text_to_word_sequence(example)
    
   
    filtered_words = [word for word in word_tokens if word not in stop_words]
    result.append(filtered_words)

df=pd.DataFrame(result)



#example = "Family is not an important thing. It's everything."
#stop_words = set(stopwords.words('english')) 

#word_tokens = word_tokenize(example)

#result = []
#for word in word_tokens: 
#    if word not in stop_words: 
#        result.append(word) 

#print('불용어 제거 전 :',word_tokens) 
#print('불용어 제거 후 :',result)

In [366]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,led,woody,andy's,toys,live,happily,room,andy's,birthday,brings,...,,,,,,,,,,
1,siblings,judy,peter,discover,enchanted,board,game,opens,door,magical,...,,,,,,,,,,
2,family,wedding,reignites,ancient,feud,next,door,neighbors,fishing,buddies,...,,,,,,,,,,
3,cheated,mistreated,stepped,women,holding,breath,waiting,elusive,good,man,...,,,,,,,,,,
4,george,banks,recovered,daughter's,wedding,receives,news,pregnant,george's,wife,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,calving,arctic,iceberg,releases,giant,praying,mantis,trapped,suspended,animation,...,,,,,,,,,,
4996,grieving,doctor,contacted,late,wife,patients,near,death,experiences,,...,,,,,,,,,,
4997,lestat,de,lioncourt,awakened,slumber,bored,existence,become,generations,new,...,,,,,,,,,,
4998,vietnam,veteran,leon,barlow,struggling,writer,personal,life,much,better,...,,,,,,,,,,


In [361]:
result=pd.Series(result)    
pd.DataFrame(result)

Unnamed: 0,0
0,"[led, woody, andy's, toys, live, happily, room..."
1,"[siblings, judy, peter, discover, enchanted, b..."
2,"[family, wedding, reignites, ancient, feud, ne..."
3,"[cheated, mistreated, stepped, women, holding,..."
4,"[george, banks, recovered, daughter's, wedding..."
...,...
4995,"[calving, arctic, iceberg, releases, giant, pr..."
4996,"[grieving, doctor, contacted, late, wife, pati..."
4997,"[lestat, de, lioncourt, awakened, slumber, bor..."
4998,"[vietnam, veteran, leon, barlow, struggling, w..."


In [359]:
result.explode()

0                led
0              woody
0             andy's
0               toys
0               live
            ...     
4999    relationship
4999           lance
4999            tran
4999            able
4999         improve
Length: 156596, dtype: object

In [360]:
data=result.explode()
data.groupby(data.index).apply(lambda x: pd.Series(x.values))

0     0              led
      1            woody
      2           andy's
      3             toys
      4             live
                ...     
4999  35    relationship
      36           lance
      37            tran
      38            able
      39         improve
Length: 156596, dtype: object

In [334]:
 result.apply(lambda x: pd.Series(x)).stack().reset_index(drop=True)

  result.apply(lambda x: pd.Series(x)).stack().reset_index(drop=True)


0                  led
1                woody
2               andy's
3                 toys
4                 live
              ...     
156589    relationship
156590           lance
156591            tran
156592            able
156593         improve
Length: 156594, dtype: object

In [330]:
tfidf=TfidfVectorizer()

In [354]:
tfidf_mat=tfidf.fit_transform(data)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [329]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet