In [5]:
# Toy corpus kept as a single string. The run of spaces at the start of the
# second fragment is part of the text, but str.split() collapses it.
text = (
    "John likes to watch movies. Mary likes movies too. "
    "    Mary also likes to watch football games"
)

# Drop sentence-ending periods, then split on whitespace to get the tokens.
text_without_periods = text.replace('.', '')
words = text_without_periods.split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [None]:
import numpy as np

# With return_counts=True, np.unique returns a pair:
# (unique values, number of occurrences of each) -- a handy way to tally tokens.
unique_words, occurrences = np.unique(words, return_counts=True)
word_count = (unique_words, occurrences)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))


In [10]:
# Build a word -> count lookup from the (unique_words, counts) pair held in
# word_count.
# The comprehension variable is called `word` and does not leak out of the
# comprehension, so the module-level token list `words` is left intact and the
# cell that computes word_count can be re-run safely.
# str()/int() convert NumPy scalars to plain Python objects so the printed
# dict looks the same regardless of how NumPy formats its scalar reprs.
word_to_cnt = {str(word): int(cnt) for word, cnt in zip(*word_count)}
print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


In [11]:
# Look up how many times a single word occurred.
john_count = word_to_cnt['John']
print(john_count)

1


In [None]:
# sklearn
# - Prediction (classification, regression): fit(), predict(), predict_proba()
# - Transformation (preprocessing, etc.): fit(), transform(), fit_transform()

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Two short documents used as the toy corpus.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games",
]

vector = CountVectorizer()

# fit_transform learns the vocabulary from the corpus and converts each
# document into a row of word counts; toarray() turns the result into a
# dense array. The columns come out in alphabetical order of the terms.
doc_term_counts = vector.fit_transform(corpus)
tdm_array = doc_term_counts.toarray()

# vocabulary_ maps each term to the index of its column in tdm_array.
tf_dic = vector.vocabulary_

print(tdm_array)  # each value: how many times the term occurs in that document
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [22]:
# Fit the vectorizer on the corpus again and keep the dense count matrix as X.
refit_counts = vector.fit_transform(corpus)
X = refit_counts.toarray()
X

array([[0, 0, 0, 1, 2, 1, 2, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 0, 1, 0, 1]])

In [18]:
import pandas as pd

# Order the vocabulary by column index so the labels line up with the
# columns of tdm_array.
terms_in_column_order = sorted(tf_dic, key=tf_dic.get)
tf_dic_sorted = {term: tf_dic[term] for term in terms_in_column_order}

# Term-document matrix as a labelled DataFrame: one row per document.
tdm = pd.DataFrame(tdm_array, columns=list(tf_dic_sorted))
tdm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


### IRIS 데이터 분류를 위한 ML 모델

1. 모델을 만들고 (RandomForestClassifier)  
2. 훈련 데이터로 학습시키고 (fit)  
3. 새로운 데이터에 예측하고 (predict)  
4. 정확도를 평가하고 (accuracy_score)  
5. 예측 결과의 확신 정도까지 본다 (predict_proba)  

In [23]:
from sklearn import datasets

# Load the iris dataset and separate the feature matrix from the labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest: an algorithm that builds many decision trees and decides the
# result by vote -- every tree is asked, and the most common answer is adopted.
rf = RandomForestClassifier(random_state=1)

# Train on the training split, then predict labels for the held-out split.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(y_pred)

# Fraction of held-out samples whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

[0 1 1 0 2 1 2 0 0 2 1 0 2 1 1 0 1 1 0 0 1 1 2 0 2 1 0 0 1 2 1 2 1 2 2 0 1
 0 1 2 2 0 1 2 1]
0.9555555555555556


In [30]:
# predict_proba shows how confident the model is in each prediction: one row
# per test sample and one column per class (three columns here, not just 0/1).
y_pred_proba = rf.predict_proba(X_test)
print(y_pred_proba)

[[0.95 0.05 0.  ]
 [0.   0.96 0.04]
 [0.   0.98 0.02]
 [1.   0.   0.  ]
 [0.   0.   1.  ]
 [0.   0.85 0.15]
 [0.   0.01 0.99]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.   0.   1.  ]
 [0.   1.   0.  ]
 [1.   0.   0.  ]
 [0.   0.   1.  ]
 [0.   0.97 0.03]
 [0.   0.98 0.02]
 [1.   0.   0.  ]
 [0.   0.99 0.01]
 [0.01 0.99 0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.   0.98 0.02]
 [0.03 0.88 0.09]
 [0.   0.07 0.93]
 [1.   0.   0.  ]
 [0.   0.02 0.98]
 [0.   1.   0.  ]
 [0.99 0.01 0.  ]
 [1.   0.   0.  ]
 [0.   0.98 0.02]
 [0.   0.11 0.89]
 [0.   0.98 0.02]
 [0.   0.   1.  ]
 [0.   1.   0.  ]
 [0.   0.   1.  ]
 [0.   0.01 0.99]
 [1.   0.   0.  ]
 [0.   1.   0.  ]
 [1.   0.   0.  ]
 [0.   0.96 0.04]
 [0.   0.06 0.94]
 [0.   0.   1.  ]
 [1.   0.   0.  ]
 [0.   0.78 0.22]
 [0.   0.   1.  ]
 [0.   0.99 0.01]]


# TF-IDF : 단어 빈도(TF)에 역문서 빈도(IDF)를 곱해, 문서별 단어의 중요도를 가중치로 나타낸 행렬.
### TF(w) : 문서에서 w의 빈도 / 총 단어수
### IDF(w) : log(총 문서 수 / (w를 포함하는 문서 수 + 1))
### 점수가 클수록 문서 상 관련성이 더 높음

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()

# Learn the vocabulary from the corpus and get the TF-IDF weights as a dense
# array with one row per document.
tfidf_weights = tfidf_vec.fit_transform(corpus)
tfidf_array = tfidf_weights.toarray()

# vocabulary_ maps each term to the index of its column in tfidf_array.
tfidf_dic = tfidf_vec.vocabulary_

# Order the vocabulary by column index so the labels line up with the columns.
tfidf_terms = sorted(tfidf_dic, key=tfidf_dic.get)
tfidf_dic_sorted = {term: tfidf_dic[term] for term in tfidf_terms}

tfidf_dtm = pd.DataFrame(tfidf_array, columns=list(tfidf_dic_sorted))
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404
