# Feature Extraction - English

## 이선우 (20223888)

In [1]:
# Three-sentence toy corpus; every vectorization example below is run on it.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

## 1. Bag of Words

### (1) NLTK & gensim

In [15]:
import nltk #토큰화
import string #Feature Extraction 할때

In [16]:
def tokenize(text):  # `from nltk import word_tokenize` would work just as well
    """Yield the lower-cased, Snowball-stemmed word tokens of *text*.

    Tokens made up solely of punctuation characters are dropped.

    Args:
        text: Raw document string.

    Yields:
        One stemmed token per remaining word, in document order.
    """
    text = text.lower()
    # A stemmer mechanically chops suffixes; nltk.stem ships several others.
    stemmer = nltk.stem.SnowballStemmer('english')
    punctuation = frozenset(string.punctuation)

    for token in nltk.word_tokenize(text):
        # Check every character: `token in string.punctuation` is a substring
        # test, so multi-character tokens such as "..." or "--" are not
        # substrings of that constant and would slip through to the stemmer.
        if all(char in punctuation for char in token):
            continue

        yield stemmer.stem(token)


In [17]:
# tokenize() is a generator, so unpack it into a list before printing.
for doc in corpus:
    print([*tokenize(doc)])

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']
['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']


In [18]:
# Materialise each generator once; the token lists are iterated repeatedly below.
tokenized_corpus = [[*tokenize(text)] for text in corpus]

In [19]:
# Confirm the stored token lists match what the generator printed above.
for doc in tokenized_corpus:
    print(doc)

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']
['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']


In [20]:
import gensim

In [22]:
# Dictionary scans the tokenized documents and gives every distinct token an
# integer id, i.e. one vector dimension per vocabulary entry.
lexicon = gensim.corpora.Dictionary(documents=tokenized_corpus)

In [23]:
# List every (id, token) pair held by the lexicon.
for token_id, token in lexicon.items():
    print((token_id, token))

(0, 'at')
(1, 'eleph')
(2, 'of')
(3, 'potato')
(4, 'sight')
(5, 'sneez')
(6, 'the')
(7, 'bat')
(8, 'can')
(9, 'echoloc')
(10, 'see')
(11, 'via')
(12, 'door')
(13, 'open')
(14, 'she')
(15, 'studio')
(16, 'to')
(17, 'wonder')


In [24]:
# Turn each token list into a sparse bag-of-words vector of (id, count) pairs.
for vec in map(lexicon.doc2bow, tokenized_corpus):
    print(vec)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)]
[(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)]
[(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


위 과정이 Feature Extraction이다 (문서를 벡터로 바꿔 주는 것).

sparse vector <=> dense vector

벡터를 표현하는 두가지 방법이 있다. 

sparse 표현은 0을 일일이 다 적는 dense 표현보다 효율적이다.

필요할때는 dense로 전환함

### (2) Scikit-Learn

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
vectorizer = CountVectorizer()  # instantiate the class: create the vectorizer object we will use

In [28]:
# Fit the vectorizer on the raw corpus and get the document-term count matrix.
results = vectorizer.fit_transform(corpus)

In [30]:
# Print the count matrix densely (one row per document, one column per term).
# `.toarray()` replaces the `.A` shorthand, which recent SciPy releases no
# longer provide on sparse matrices.
print(results.toarray())

[[1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 2 0 0 0]
 [0 1 1 1 0 1 0 0 0 0 2 0 1 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 2 1 0 1]]


단어를 확인하는 방법은?

In [31]:
# Mapping from each term to its column index in the count matrix.
vectorizer.vocabulary_

{'the': 16,
 'elephant': 6,
 'sneezed': 14,
 'at': 0,
 'sight': 12,
 'of': 7,
 'potatoes': 9,
 'bats': 2,
 'can': 3,
 'see': 10,
 'via': 18,
 'echolocation': 5,
 'bat': 1,
 'sneeze': 13,
 'wondering': 19,
 'she': 11,
 'opened': 8,
 'door': 4,
 'to': 17,
 'studio': 15}

In [32]:
import pandas as pd

In [33]:
# Wrap the dense counts in a DataFrame. `.toarray()` replaces the `.A`
# shorthand, which recent SciPy releases no longer provide on sparse matrices.
df = pd.DataFrame(data=results.toarray())

In [37]:
# Flip the vocabulary from (term -> column index) to (column index -> term).
inverse = {column: term for term, column in vectorizer.vocabulary_.items()}

In [38]:
# Inspect the column index -> term mapping.
inverse

{16: 'the',
 6: 'elephant',
 14: 'sneezed',
 0: 'at',
 12: 'sight',
 7: 'of',
 9: 'potatoes',
 2: 'bats',
 3: 'can',
 10: 'see',
 18: 'via',
 5: 'echolocation',
 1: 'bat',
 13: 'sneeze',
 19: 'wondering',
 11: 'she',
 8: 'opened',
 4: 'door',
 17: 'to',
 15: 'studio'}

In [39]:
# Replace the integer column labels with the terms they stand for.
df = df.rename(columns=inverse)

In [40]:
# Rows are documents, columns are vocabulary terms, cells are raw counts.
df

Unnamed: 0,at,bat,bats,can,door,echolocation,elephant,of,opened,potatoes,see,she,sight,sneeze,sneezed,studio,the,to,via,wondering
0,1,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,2,0,0,0
1,0,1,1,1,0,1,0,0,0,0,2,0,1,1,0,0,1,0,1,0
2,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,2,1,0,1


### Compare nltk & sklearn

sklearn 어휘 목록 (stemming을 하지 않은 토큰)

In [43]:
# Terms in the scikit-learn vocabulary (unstemmed surface forms).
vectorizer.vocabulary_.keys()

dict_keys(['the', 'elephant', 'sneezed', 'at', 'sight', 'of', 'potatoes', 'bats', 'can', 'see', 'via', 'echolocation', 'bat', 'sneeze', 'wondering', 'she', 'opened', 'door', 'to', 'studio'])

nltk 어근 목록

In [46]:
# Terms in the gensim lexicon, which was built from the stemmed NLTK tokens.
list(lexicon.values())

['at',
 'eleph',
 'of',
 'potato',
 'sight',
 'sneez',
 'the',
 'bat',
 'can',
 'echoloc',
 'see',
 'via',
 'door',
 'open',
 'she',
 'studio',
 'to',
 'wonder']

In [47]:
# Terms present in the scikit-learn vocabulary but missing from the NLTK/gensim lexicon.
set(vectorizer.vocabulary_) - set(lexicon.values())

{'bats',
 'echolocation',
 'elephant',
 'opened',
 'potatoes',
 'sneeze',
 'sneezed',
 'wondering'}

In [48]:
# Terms present in the NLTK/gensim lexicon but missing from the scikit-learn vocabulary.
set(lexicon.values()) - set(vectorizer.vocabulary_)

{'echoloc', 'eleph', 'open', 'potato', 'sneez', 'wonder'}

위 결과를 비교해보면 stemmer가 있고 없고의 차이를 알 수 있다.

sklearn의 CountVectorizer에는 stemmer가 없어서 sneeze와 sneezed, bat와 bats가 서로 다른 단어로 집계된다 (시제나 단·복수 같은 어형 변화를 하나의 어근으로 묶지 못한다).

## 2. One-Hot Encoding

cat이나 dog가 언급되는 횟수는 중요하지 않을 수도 있다. 그래서 등장 여부만 기록하는 One-Hot Encoding이 나왔다.

### (1) Gensim

튜플 (dim, freq) 에서 강제로 freq를 1로 할당 (즉 있냐 없냐만 보겠다)

In [51]:
# Keep each term id from the (id, count) pairs but pin the count to 1, so the
# vector records presence rather than frequency.
for doc in tokenized_corpus:
    vec = [(term_id, 1) for term_id, _count in lexicon.doc2bow(doc)]
    print(vec)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
[(6, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


### (2) Scikit-Learn

Binarizer를 활용 (0과 1로 바꿔주겠다)

In [52]:
from sklearn.preprocessing import Binarizer

In [54]:
# Start again from raw counts; these rebind `vectorizer` and create `vectors`.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

In [55]:
# Dense view of the raw counts before binarisation (note the 2s).
vectors.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 1]])

In [56]:
# Binarizer turns the counts into 0/1 indicators (see the output further down).
onehot = Binarizer()

In [57]:
# Overwrite the count matrix with its binarised version.
vectors = onehot.fit_transform(vectors)

In [60]:
# Dense view of the binarised matrix. `.toarray()` matches the call used on the
# raw counts and replaces the `.A` shorthand, which recent SciPy releases no
# longer provide on sparse matrices.
vectors.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]])

## 3. TF-IDF (Term Frequency–Inverse Document Frequency)

$$
\begin{aligned}
    \mathrm{tf}(t, d) &= 1 + \log f_{t,d} \\
    \mathrm{idf}(t, D) &= \log\left(1 + \frac{N}{n_t}\right) \\
    \text{tf-idf}(t, d, D) &= \mathrm{tf}(t, d) \cdot \mathrm{idf}(t, D)
\end{aligned}
$$

### (1) Gensim

In [61]:
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)  # normalize=True: normalise each document vector

In [65]:
for doc in tokenized_corpus:
    # tfidf[...] re-weights the bag-of-words pairs; each term id is then mapped
    # back to its token for readability. Note that "the" is missing from the
    # output: it occurs in every document, so it is dropped automatically.
    vec = [
        (tfidf.id2word[term_id], weight)
        for term_id, weight in tfidf[lexicon.doc2bow(doc)]
    ]

    print(vec)

[('at', 0.4837965208957426), ('eleph', 0.4837965208957426), ('of', 0.4837965208957426), ('potato', 0.4837965208957426), ('sight', 0.17855490118826325), ('sneez', 0.17855490118826325)]
[('sight', 0.10992597952954358), ('sneez', 0.10992597952954358), ('bat', 0.5956913654963344), ('can', 0.2978456827481672), ('echoloc', 0.2978456827481672), ('see', 0.5956913654963344), ('via', 0.2978456827481672)]
[('door', 0.408248290463863), ('open', 0.408248290463863), ('she', 0.408248290463863), ('studio', 0.408248290463863), ('to', 0.408248290463863), ('wonder', 0.408248290463863)]


### (2) Scikit-Learn

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
# NOTE: this rebinds `tfidf`, replacing the gensim TfidfModel created earlier;
# re-run that cell before re-running the gensim loop.
tfidf = TfidfVectorizer()

In [72]:
# Fit on the raw corpus and get the tf-idf weighted document-term matrix.
vectors = tfidf.fit_transform(corpus)

In [73]:
# Print the tf-idf matrix densely. `.toarray()` replaces the `.A` shorthand,
# which recent SciPy releases no longer provide on sparse matrices.
print(vectors.toarray())

[[0.37867627 0.         0.         0.         0.         0.
  0.37867627 0.37867627 0.         0.37867627 0.         0.
  0.28799306 0.         0.37867627 0.         0.44730461 0.
  0.         0.        ]
 [0.         0.30251368 0.30251368 0.30251368 0.         0.30251368
  0.         0.         0.         0.         0.60502736 0.
  0.23006945 0.30251368 0.         0.         0.17866945 0.
  0.30251368 0.        ]
 [0.         0.         0.         0.         0.36772387 0.
  0.         0.         0.36772387 0.         0.         0.36772387
  0.         0.         0.         0.36772387 0.43436728 0.36772387
  0.         0.36772387]]
