## BERT 임베딩 차원에서 유사도 비교

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gmleh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [3]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Initialize model and tokenizer
model_name = "bert-base-uncased" # BERT와 FinBERT 비교
# model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Get all tokens
all_tokens = list(tokenizer.get_vocab().keys())

# Dictionary to store all embeddings
all_embeddings = {}

# Generate embeddings
for token in tqdm(all_tokens, desc="Generating embeddings"):
    tokenized = tokenizer(token, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokenized)
    embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
    all_embeddings[token] = embeddings

# Function to find most similar words
def find_most_similar(word, num_similar=100):
    if word not in all_embeddings:
        print("Word not in vocabulary")
        return
    
    similarities = {}
    target_embedding = all_embeddings[word]

    for token, embedding in tqdm(all_embeddings.items(), desc=f"Calculating similarities for {word}"):
        similarity = cosine_similarity(target_embedding, embedding)
        similarities[token] = similarity

    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_similarities[:num_similar]


Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Generating embeddings: 100%|█████████████████████████████████████████████████████| 30522/30522 [15:11<00:00, 33.50it/s]


In [4]:
# operate function : increase / decrease
word1 = 'increase'
word2 = 'decrease'
variation_list = []

most_similar_words1 = find_most_similar(word1)
print(f"Top 100 words most similar to '{word1}':")

for similar_word, similarity in most_similar_words1:
    variation_list.append(similar_word)
    
most_similar_words2 = find_most_similar(word2)
print(f"Top 100 words most similar to '{word2}':")

for similar_word, similarity in most_similar_words2:
    variation_list.append(similar_word)

Calculating similarities for increase: 100%|██████████████████████████████████| 30522/30522 [00:00<00:00, 64793.04it/s]


Top 100 words most similar to 'increase':


Calculating similarities for decrease: 100%|██████████████████████████████████| 30522/30522 [00:00<00:00, 70308.42it/s]

Top 100 words most similar to 'decrease':





In [5]:
filtered_variation_list = [word for word in variation_list if word not in stop_words]

print(filtered_variation_list)
len(filtered_variation_list)

['increase', 'increases', 'promote', 'reduce', 'builds', 'modify', 'removes', 'extends', 'increasing', 'easing', 'improving', 'reduces', 'generate', 'suspend', 'lifts', 'drill', 'establish', 'ensured', 'removing', 'lowers', '明', 'repair', 'token', 'enhance', 'curses', 'promotes', 'accelerated', 'alleviate', 'neighborhood', 'levy', 'insignia', 'additions', 'organist', 'captains', 'enhancing', 'defining', 'bared', 'amendment', 'concealed', 'amplified', 'establishes', 'emblem', 'bomb', 'advertisement', 'opponent', 'created', 'exercises', 'worker', 'patch', 'appointing', 'boulevard', 'extend', '1625', 'revoked', 'quran', 'bearer', 'strikeouts', 'rescues', 'establishing', 'flaps', 'calculating', 'blouse', '33rd', 'aroused', '1886', 'concourse', 'operatives', '信', 'asserting', 'destroys', 'promoting', '秋', '1888', 'elector', 'inversion', 'exhibit', 'prelude', 'improve', 'noel', 'wept', 'decreasing', 'intimidation', 'develop', 'adjective', 'awakened', 'cooked', 'exodus', 'assassin', 'arrestin

200

In [6]:
variation_set = set(filtered_variation_list)
len(variation_set)
variation_list_dup = list(variation_set)
variation_list_dup

['informing',
 'builds',
 'convenience',
 'therese',
 'ვ',
 'φ',
 'promote',
 'defining',
 'virtues',
 'created',
 'enhance',
 'assassin',
 '秋',
 'extend',
 'boulevard',
 'easing',
 'adultery',
 'noel',
 'accomplishment',
 'delight',
 '♭',
 'exercises',
 'insulted',
 'あ',
 'reduces',
 'prosperity',
 'routines',
 'ₑ',
 '1774',
 'destroys',
 'cornice',
 'quran',
 'decreasing',
 'bearer',
 'extends',
 'believer',
 'preserving',
 'prelude',
 'wept',
 'rescues',
 '33rd',
 'え',
 'generate',
 'adviser',
 'sheppard',
 'modify',
 'stairwell',
 'implementations',
 '1761',
 'remarried',
 'removes',
 'reduce',
 'neighborhood',
 'ي',
 'flaps',
 'elector',
 'token',
 'repair',
 '1886',
 'せ',
 'insignia',
 'witty',
 'ʲ',
 'v6',
 'theorists',
 'suspend',
 'ambiguity',
 '州',
 '生',
 'amplified',
 'improving',
 'intimidation',
 'drill',
 'establishes',
 'arresting',
 '我',
 'exposition',
 'murderers',
 'indies',
 'satisfying',
 'nouns',
 'revoked',
 'captains',
 'ᵢ',
 'increasing',
 'rudolph',
 '1625',
 '

In [7]:
import pandas as pd

df11 = pd.DataFrame({'Name': variation_list_dup})
df11.to_csv('LSTWORD_finbert.csv', index = False)

In [8]:
len(variation_set)

196

In [17]:
# 중복된 단어 확인

from collections import Counter

word_counts = Counter(variation_list)

duplicate_words = [word for word, count in word_counts.items() if count > 1]

duplicate_words

['increasing',
 'increases',
 'decrease',
 'reduce',
 'adjust',
 'decreased',
 'expands']

## 단어가 포함된 문장만 살리기

In [19]:
import pandas as pd

df = pd.read_csv('textmining/concatenated_dataset_1003_v6.csv')
df

Unnamed: 0,Title,Original_Text,Sentence
0,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,bond yields hit record lows is a headline tha...
1,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,first the 30year bond yield fell past 2% then ...
2,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,a few market veterans also recall bondmarket r...
3,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,and even fewer stuck their necks out then to d...
4,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,today some of these early and prescient bond b...
...,...,...,...
2372395,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,the radio broadcasters restructuring plan eras...
2372396,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,it also transfers control of the company to ih...
2372397,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,franklin advisers inc. and its affiliates and ...
2372398,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,chief executive bob pittman and chief financia...


In [20]:
import re

def filter_rows_by_words_and_numbers(df, words_list):
    tqdm.pandas(desc="Filtering rows")
    return df[df['Sentence'].progress_apply(lambda x: any(word in x.split() for word in words_list) or bool(re.search(r'\d', x)))]

filtered_df = filter_rows_by_words_and_numbers(df, variation_list_dup)

Filtering rows: 100%|██████████████████████████████████████████████████████| 2372400/2372400 [06:17<00:00, 6276.79it/s]


In [21]:
filtered_df

Unnamed: 0,Title,Original_Text,Sentence
1,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,first the 30year bond yield fell past 2% then ...
2,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,a few market veterans also recall bondmarket r...
3,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,and even fewer stuck their necks out then to d...
4,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,today some of these early and prescient bond b...
7,Yes Bonds Could Still Rally Even With Yields B...,Bond Yields Hit Record Lows is a headline tha...,i never bought bonds for yield he said.
...,...,...,...
2372388,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,the nations largest radio broadcaster said wed...
2372389,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,the company with 848 radio stations intends to...
2372390,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,the company could increase the offering size d...
2372394,Poised to Leave Bankruptcy iHeartMedia Files f...,iHeartMedia Inc. the nations largest radio bro...,the company has said it expects to exit from b...


In [22]:
# csv 저장

filtered_df.to_csv('textmining/filtered_df_small_ver1106.csv', index=False)

In [None]:
# revised_news_and_EDA.ipynb 파일로 가서 추가 전처리