# Term Frequency

In [1]:
text = 'John likes to watch movies. Mary likes movies too. \
Mary also likes to watch football games.'

In [2]:
words = text.replace('.', '').split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [3]:
import numpy as np
word_count = np.unique(words, return_counts=True)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2], dtype=int64))


In [4]:
word_to_cnt = {}
for word, cnt in zip(*word_count):
    word_to_cnt[word] = cnt
print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


# Term Document Matrix

In [5]:
corpus = [
    'John likes to watch movies. Mary likes movies too.',
    'Mary also likes to watch football games.'
]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
tdm_array = vector.fit_transform(corpus).toarray()
tf_dic = vector.vocabulary_
print(tdm_array)
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [7]:
import pandas as pd
tf_dic_sorted = dict(sorted(tf_dic.items(), key=lambda item: item[1]))
tdm = pd.DataFrame(tdm_array, columns=tf_dic_sorted.keys())
print(tdm)

   also  football  games  john  likes  mary  movies  to  too  watch
0     0         0      0     1      2     1       2   1    1      1
1     1         1      1     0      1     1       0   1    0      1


# TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(), key=lambda item: item[1]))

tfidf_tdm = pd.DataFrame(tfidf_array, columns=tfidf_dic_sorted.keys()) #24쪽 코드 수정
print(tfidf_tdm)

       also  football     games      john     likes      mary    movies  \
0  0.000000  0.000000  0.000000  0.323699  0.460629  0.230315  0.647398   
1  0.446101  0.446101  0.446101  0.000000  0.317404  0.317404  0.000000   

         to       too     watch  
0  0.230315  0.323699  0.230315  
1  0.317404  0.000000  0.317404  


# gensim

In [9]:
corpus = [
    'John likes to watch movies. Mary likes movies too.',
    'Mary also likes to watch football games.'
]

word_list= []
for word in corpus:
    word_list.append(word.replace('.', '').split())

# conda activate tf2.14
# pip install gensim

# ImportError: cannot import name 'triu' from 'scipy.linalg' 발생시
# pip install scipy==1.12 (원래 버전 1.14.0)

from gensim.models import Word2Vec
model = Word2Vec(word_list, sg=0, vector_size=100, window=3, min_count=1)

print(model.wv.most_similar('likes'))
print(model.wv.similarity('movies', 'games'))

[('John', 0.21617142856121063), ('also', 0.09291722625494003), ('too', 0.027057476341724396), ('football', 0.016134677454829216), ('Mary', -0.010840574279427528), ('to', -0.02775036357343197), ('movies', -0.05234673246741295), ('games', -0.059876296669244766), ('watch', -0.111670583486557)]
0.0640898


# DNN

### 1) 기본

In [10]:
# 참고자료
# https://github.com/hjk7902/nlp/blob/main/1.%20DNN%20%EC%8B%A4%EC%8A%B5%20-%20%EB%A1%9C%EC%9D%B4%ED%84%B0%20%EA%B8%B0%EC%82%AC%20%EB%B6%84%EB%A5%98.ipynb

from tensorflow.keras import Sequential
from tensorflow.keras import layers

model = Sequential([
    layers.Input(shape=(2500,)),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0, 5),
    layers.Dense(46, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               1280512   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 46)                23598     
                                                                 
Total params: 1304110 (4.97 MB)
Trainable params: 1304110 (4.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
from tensorflow.keras.datasets import reuters
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=2500)
print(X_train.shape, X_test.shape)

(8982,) (2246,)


In [12]:
print(X_train[0]) # 숫자로 매핑되어있음을 확인

[1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 2, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 2, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]


In [13]:
print(y_train[0])

3


In [14]:
labels = reuters.get_label_names()
print(labels)

('cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper', 'housing', 'money-supply', 'coffee', 'sugar', 'trade', 'reserves', 'ship', 'cotton', 'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx', 'interest', 'gnp', 'meal-feed', 'alum', 'oilseed', 'gold', 'tin', 'strategic-metal', 'livestock', 'retail', 'ipi', 'iron-steel', 'rubber', 'heat', 'jobs', 'lei', 'bop', 'zinc', 'orange', 'pet-chem', 'dlr', 'gas', 'silver', 'wpi', 'hog', 'lead')


In [15]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
len(X_train[0]), len(X_train[1])

(87, 56)

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=2500)
X_train_tok = tok.sequences_to_matrix(X_train, mode='count')
X_test_tok = tok.sequences_to_matrix(X_test, mode='count')

In [18]:
print(type(X_train_tok), X_train_tok.shape)

<class 'numpy.ndarray'> (8982, 2500)


In [19]:
print(X_train_tok[0])

[0. 1. 4. ... 0. 0. 0.]


In [20]:
model.fit(X_train_tok, y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x25a70e13b90>

In [21]:
model.evaluate(X_test_tok, y_test)



[1.1147968769073486, 0.799198567867279]

In [22]:
sample = X_train_tok[333].reshape(-1, 2500)
print(sample.shape)

(1, 2500)


In [23]:
pred = model.predict(sample)
print(pred)

[[6.29868955e-05 8.33811995e-04 2.01068015e-05 9.85312402e-01
  2.30382429e-03 1.93045344e-05 1.05868939e-05 1.86735542e-05
  3.47305904e-04 4.90160892e-04 4.26667102e-04 1.20029994e-03
  3.11418611e-04 1.16212483e-04 1.56585666e-05 7.86202872e-06
  4.21627919e-04 7.90592367e-06 5.37756059e-05 4.65283031e-03
  1.23642269e-03 1.00586760e-04 9.21211540e-06 7.21404940e-05
  7.26803701e-05 4.48649189e-05 4.36036862e-06 2.08681945e-06
  5.08273894e-04 1.12201988e-05 1.16126736e-04 8.62091747e-06
  9.86109444e-05 1.84067380e-06 7.66225567e-05 9.49348851e-06
  7.22183497e-04 1.89901002e-05 1.41779732e-04 9.02555894e-06
  5.53047030e-05 7.00668488e-06 5.55707675e-06 1.49424186e-05
  5.86144915e-06 1.26884352e-05]]


In [24]:
import numpy as np
print(np.argmax(pred, axis=1))

[3]


In [25]:
y_test[333]

3

### 2)
### 인공신경망의 기본 이론을 알고 있으면 어떤 모델을 사용하더라도
### 입력과 출력의 shape만 맞춰주면 학습은 된다는 것을 보여줌

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten

model = Sequential()
model.add(Conv2D(32, (3,3), input_shape=(50,50,1), activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D())
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(46,activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 48, 48, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 24, 24, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 22, 22, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 11, 11, 64)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 7744)              0         
                                                                 
 dense_2 (Dense)             (None, 512)              

In [27]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=2500)
X_train_tok = tok.sequences_to_matrix(X_train, mode='count')
X_test_tok = tok.sequences_to_matrix(X_test, mode='count')

In [29]:
X_train_tok = X_train_tok.reshape(-1, 50, 50, 1)

In [30]:
model.fit(X_train_tok, y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x25a6e7d0210>

In [31]:
sample = X_train_tok[333].reshape(-1, 50, 50, 1)
print(sample.shape)

(1, 50, 50, 1)


In [32]:
pred = model.predict(sample)

import numpy as np
print(np.argmax(pred, axis=1))

[3]


# RNN

In [33]:
from tensorflow.keras import Sequential, layers

In [35]:
# RNN의 입력으로
# 입력 데이터(텍스트를 숫자로 매핑한 데이터)를 DTM 행렬을 만들어 입력
# 입력 데이터를 같은 길이의 데이터로 자르거나 채워서 입력, 내부적으로 임베딩을 하도록 

In [34]:
model = Sequential([
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32),
    layers.SimpleRNN(64),
    layers.Dense(2, activation='softmax') # Loss = 'sparse_categorical_crossentropy'
    # layers.Dense(2, activation='sigmoid') # Loss = 'binary_crossentropy'
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 32)            320000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                6208      
                                                                 
 dense_4 (Dense)             (None, 2)                 130       
                                                                 
Total params: 326338 (1.24 MB)
Trainable params: 326338 (1.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
