In [64]:
from pymongo import MongoClient
import pandas as pd 
import re
from collections import Counter
from konlpy.tag import Okt
from mecab import MeCab
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer # word embedding
from sklearn.metrics.pairwise import cosine_similarity

# 네트워크 연결 규칙 : protocol://ip:port/path 
client = MongoClient('mongodb://192.168.0.50:27017/')

db_name = client["DB_SGMN"]
collection = db_name["COL_SCRAPPING_TOSS_COMMENT_HISTORY"]

In [65]:
find_data = collection.find()
data_list = list(find_data)

In [66]:
df_data = pd.DataFrame(data_list)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251836 entries, 0 to 251835
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   _id         251836 non-null  object        
 1   COMMENT     251836 non-null  object        
 2   CREATED_AT  251836 non-null  datetime64[ns]
 3   DATE        251836 non-null  object        
 4   DATETIME    251836 non-null  object        
 5   SYMBOL      251836 non-null  object        
 6   UPDATED_AT  251836 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 13.4+ MB


In [67]:
okt = Okt()
mecab_inst = MeCab()

In [68]:
# week = 49
# df_week = df_data[df_data['WEEK'] == week]['COMMENT']
start_index = 100000
end_index =110000
df_comment = df_data['COMMENT'][start_index:end_index]
string_nouns = []
for text in df_comment:
    # nouns_text = mecab_inst.nouns(text)
    # filtered_nouns = [noun for noun in nouns_text if len(noun) > 1]

    # modify_nouns = ['엔비디아' if noun in ['엔비', '비디아'] else noun for noun in filtered_nouns]
    # string_nouns.append(' '.join(modify_nouns))  # join() 메서드 사용
    nouns_text = mecab_inst.morphs(text)   
    # print(nouns_text)
    # for noun, _ in nouns_text:
    #     if noun in ['엔비', '비디아']:
    #         print(nouns_text)  # '엔비' 또는 '비디아'일 경우 nouns_text 전체 출력
    # modify_nouns = ['엔비디아' if noun in ['엔비', '비디아'] else noun for noun in nouns_text]
    # verb_text = okt.pos(text, stem=True)
    # verb_list = [ word for word, pos in verb_text if pos == 'Verb']
    #verb_list = [ word for word, pos in verb_text]
  
    string_nouns.append(' '.join(nouns_text)) 

    
tfidfvectorizer = TfidfVectorizer()
    # stop_words=stopwords, 
    # ngram_range=ngramrange)
tfidfvectorizer.fit(string_nouns) # 학습
result_vectors = tfidfvectorizer.transform(string_nouns) 


In [69]:
new_sentence = ['사이버 보안은 디지털 시대의 중요한 이슈로 부상하고 있으며, 지속적인 관심과 개선이 요구된다.']

new_tfidfvectorizer = tfidfvectorizer.transform(new_sentence) # 입력 text를 vocablary를 기준으로 embedding 함
new_tfidfvectorizer

<1x9759 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [70]:
similarity_array = cosine_similarity(new_tfidfvectorizer, result_vectors)
# 닮은것 찾기 어려움 그래서 많은 문장과 단어를 학습해 두어야 함.
similarity_array

array([[0., 0., 0., ..., 0., 0., 0.]])

In [71]:
idx = similarity_array[0].argmax()
df_comment.reset_index(drop=True, inplace=True)
df_comment[idx], similarity_array[0].max()

('사이버트럭이나 하이랜드 하자 이슈 하나 뜨면 바로 조정각임. 그만큼 지금 수익실현 직전이 많음 조심들하세요',
 0.2100355871544949)

In [73]:
import pickle

# 모든 모델 정보를 저장할 딕셔너리 생성
model_info = {
    'TfidfVectorizer': tfidfvectorizer,  # 카테고리형 변수들의 원핫인코더 모델들
    'start_index': start_index,
    'end_index': end_index,
    'vocabulary_vector' : result_vectors
}

# 파일 경로 설정
save_file_name = '../models/cosine_similarity_learnings_TFIDF.pkl'

# 모든 모델 정보를 함께 저장
with open(save_file_name, 'wb') as save_file:
    pickle.dump(model_info, save_file)

print("모든 모델 정보가 저장되었습니다.")

모든 모델 정보가 저장되었습니다.
