#### spaCy 설치

In [1]:
# !conda install -y -c conda-forge spacy

In [2]:
# !pip install -U spacy

In [3]:
# !python -m spacy download en_core_web_sm

#### 형태소 분석과 표제어 추출

In [4]:
import spacy

# Load the small English pipeline. The model package must be installed first
# (python -m spacy download en_core_web_sm), otherwise this call fails.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Run the loaded pipeline on a sample sentence; `doc` holds the analysed tokens.
text = "Wikipedia is maintained by volunteers."
doc = nlp(text)

In [6]:
for tok in doc:
    # One line per token: surface text, lemma, coarse part of speech,
    # fine-grained tag, syntactic dependency label, and stop-word flag.
    fields = (
        tok.text,
        tok.lemma_,
        tok.pos_,
        tok.tag_,
        tok.dep_,
        tok.is_stop,
    )
    print(*fields)

Wikipedia Wikipedia PROPN NNP nsubjpass False
is be AUX VBZ auxpass True
maintained maintain VERB VBN ROOT False
by by ADP IN agent True
volunteers volunteer NOUN NNS pobj False
. . PUNCT . punct False


#### 용어

In [7]:
# Look up the human-readable description of a tag/label abbreviation.
spacy.explain('PROPN')

'proper noun'

#### 명사와 동사의 표제어로 단어 문서 행렬 만들기

In [15]:
def extract_nv(text, tag_prefixes=('N',)):
    """Return lowercased lemmas of the tokens whose fine-grained tag matches.

    The text is analysed with the module-level ``nlp`` pipeline and every
    token whose ``tag_`` starts with one of ``tag_prefixes`` is kept.

    Note: despite the ``nv`` in the name, the default keeps only tags that
    start with "N" (e.g. NN, NNS, NNP), exactly as the previous hard-coded
    check did. Pass ``tag_prefixes=('N', 'V')`` to also keep verb tags
    (VBZ, VBN, ...); that includes auxiliaries such as "is", which carry a
    VB* tag as well.

    Args:
        text: Raw string to analyse.
        tag_prefixes: Iterable of prefixes tested against ``token.tag_``.
            A plain string is read as a collection of one-character
            prefixes, so ``'NV'`` is equivalent to ``('N', 'V')``.

    Returns:
        list of str: the matching lemmas, lowercased, in document order.
    """
    # str.startswith needs a tuple; this also accepts lists and plain strings.
    prefixes = tuple(tag_prefixes)
    # startswith() is safe for tokens with an empty tag (for instance when the
    # pipeline has no tagger), where indexing tag_[0] would raise IndexError.
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.tag_.startswith(prefixes)
    ]


In [16]:
extract_nv('Apple is a company')

['apple', 'company']

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep the 500 most frequent terms, tokenizing with extract_nv instead of the
# default regular expression. token_pattern=None states explicitly that the
# regex is unused, which avoids scikit-learn's "token_pattern will not be
# used since tokenizer is not None" UserWarning.
# NOTE(review): lowercase is left at its default (True), so each review is
# lowercased before extract_nv sees it; that may affect how spaCy tags
# capitalised words. Consider lowercase=False, since extract_nv lowercases the
# lemmas itself. Not changed here so the counts shown in this notebook stay valid.
cv = CountVectorizer(max_features=500, tokenizer=extract_nv, token_pattern=None)

In [18]:
import pandas as pd
# Read the review data from the working directory, using the first
# spreadsheet column as the index, then preview the first rows.
df = pd.read_excel('imdb.xlsx', index_col=0)
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [19]:
# Build the document-term count matrix. cv's tokenizer (extract_nv) runs the
# spaCy pipeline on every review.
tdm = cv.fit_transform(df['review'])

In [20]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on older
# installations that do not have it yet.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per vocabulary term: the term itself and its total count over all
# documents (column sums of the document-term matrix).
wc = pd.DataFrame({
    '단어': feature_names,
    '빈도': tdm.sum(axis=0).flat
})

In [21]:
# Show the most frequent terms first; head() limits the display to five rows.
wc.sort_values('빈도', ascending=False).head()

Unnamed: 0,단어,빈도
281,movie,211
161,film,187
66,character,59
453,time,48
6,acting,37
