In [1]:
# https://huggingface.co/
import pandas as pd
from transformers import pipeline

In [2]:
# Read the news workbook and keep only the date ('일자') and headline ('제목') columns.
col = ['일자', '제목']
df = pd.read_excel('news_20230401-20230501.xlsx')[col]
df

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,일자,제목
0,20230501,무역적자 터널 끝 보이나 향후 '반도체 對中 수출' 관건
1,20230501,"반도체 불황 언제까지 삼성전자 SK하이닉스, 하반기에 사활"
2,20230501,"""돌아온 관광객"" 서울 주요 상권 공실률, 1분기 어떻게 변했나"
3,20230501,삼성 ‘엑시노스’ 부활 없으면 반도체도 스마트폰도 위험하다
4,20230501,백선엽 장군 장녀 “10대영웅 타임스퀘어 영상 기뻐하셨을 것”
...,...,...
1391,20230401,“당신의 부캐는 무엇인가요” 세계 명품업계가 환호하는 청년 일러스트레이터의 열정[정...
1392,20230401,드라마에서 가전제품까지 파고든 메타버스
1393,20230401,"60대 이상 주주가 47%...한전주 3분의 1 토막, 고령화도 한몫?"
1394,20230401,집값 최대 40% 떨어진 ‘광교’ 갭투자 다시 꿈틀?


## 1. Sentiment Analysis

In [8]:
# Score every headline with the NSMC sentiment classifier.
classifier = pipeline(
    "sentiment-analysis",
    model="sangrimlee/bert-base-multilingual-cased-nsmc",
)
sentences = df['제목'].tolist()
sent = pd.DataFrame(classifier(sentences))

# Put all rows on one scale: rows labelled 'negative' get 1 - score, the rest keep
# their score. NOTE(review): presumably `score` is the confidence of the predicted
# label, so the result approximates P(positive) -- confirm against the model card.
is_negative = sent['label'].eq('negative')
sent['score'] = sent['score'].where(~is_negative, 1 - sent['score'])

df['sent'] = sent['score']
df

## 2. Pretrained Embedding

In [16]:
from transformers import AutoTokenizer, AutoModel
import torch

# Headlines as a plain Python list, the input format used for the tokenizer below.
sentences = df['제목'].tolist()

def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sentence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token-level embeddings (assumed (batch, seq, hidden) -- TODO confirm).
        attention_mask: Mask tensor with one entry per token; it is broadcast
            over the last dimension of the token embeddings.

    Returns:
        Tensor of mask-weighted means taken over dimension 1.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_count
tokenizer = AutoTokenizer.from_pretrained('jhgan/ko-sroberta-multitask')
model = AutoModel.from_pretrained('jhgan/ko-sroberta-multitask')

# Encode the headlines in fixed-size batches rather than one giant forward pass,
# so peak memory depends on the batch size and not on the size of the corpus.
# Padding is applied per batch; padded positions are excluded by the attention
# mask inside mean_pooling, so the pooled vectors are unaffected by batch layout
# (up to floating-point rounding).
batch_size = 64
embedding_batches = []
with torch.no_grad():  # inference only, no autograd graph needed
    for start in range(0, len(sentences), batch_size):
        encoded_input = tokenizer(
            sentences[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        model_output = model(**encoded_input)
        embedding_batches.append(
            mean_pooling(model_output, encoded_input['attention_mask'])
        )
# One row per headline, in the same order as `sentences`.
sentence_embeddings = torch.cat(embedding_batches, dim=0)


print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[ 0.1454, -0.1580, -0.1695,  ..., -0.4415, -0.3096, -0.1395],
        [-0.3729, -0.0614,  0.0580,  ..., -0.5778,  0.2011,  0.3196],
        [-0.1640, -0.3545,  0.3887,  ..., -0.7750,  0.0411, -0.1825],
        ...,
        [-0.4709,  0.2851, -0.7905,  ..., -0.3372,  0.1963, -0.5992],
        [ 0.2174, -0.5213,  0.8721,  ..., -0.6822, -0.0059, -0.5523],
        [ 0.4777,  0.1305, -0.2526,  ..., -0.7375,  0.0703, -0.4759]])


In [18]:
from sklearn.decomposition import FactorAnalysis
# Compress the sentence embeddings down to `my_dim` latent factors.
my_dim = 100
trans = FactorAnalysis(n_components=my_dim)
factor_scores = trans.fit(sentence_embeddings).transform(sentence_embeddings)

# Show the dimensionality before and after the reduction.
print(sentence_embeddings.shape)
print(factor_scores.shape)

# Label the factors F_01, F_02, ... (1-based, zero-padded to at least two digits).
factor_columns = ['F_%02d' % i for i in range(1, my_dim + 1)]
embed_trans = pd.DataFrame(factor_scores, columns=factor_columns)
embed_trans

torch.Size([1396, 768])
(1396, 100)


Unnamed: 0,F_01,F_02,F_03,F_04,F_05,F_06,F_07,F_08,F_09,F_10,...,F_91,F_92,F_93,F_94,F_95,F_96,F_97,F_98,F_99,F_100
0,1.074810,-0.675250,1.527125,-0.608116,0.145455,-2.039154,-0.356377,0.169035,-0.908060,0.724947,...,0.071425,-1.062580,-0.234850,0.233350,1.137440,-1.181192,-0.541339,-0.253997,-0.725074,0.120757
1,1.860668,-0.107722,1.150962,-0.026327,0.435288,-0.431876,-0.955980,-0.653820,0.350755,0.887397,...,0.183401,-0.343881,-0.430444,0.402081,1.463855,-0.260589,0.959016,-0.609474,0.050491,-0.112216
2,-0.238496,-0.860263,-0.944048,-1.133423,0.011915,-0.880327,0.447691,-0.488874,0.013181,-0.240383,...,-1.818270,-0.848864,-1.856786,-1.118632,-0.677506,-0.533692,-0.661467,-1.100023,0.090603,1.923676
3,1.107354,0.887596,-0.002669,1.141106,0.413901,0.371563,0.117888,-0.987580,0.072582,0.314833,...,0.513452,-0.438574,0.283294,1.199079,-0.752889,-0.861740,0.168321,-1.279075,1.714448,-0.647411
4,-1.759321,0.114746,-1.629050,1.325312,-1.335847,-0.050511,-0.735866,-2.002965,0.186277,-0.921976,...,-0.981919,-1.331850,0.726990,0.086052,-0.539056,1.264236,-0.683049,-0.190046,-0.722764,-0.132306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1391,-1.321677,-0.005508,-0.805321,-0.318684,1.012847,-0.513893,-0.221452,-1.942079,-0.363143,-0.055778,...,-3.093338,0.701582,-0.842988,0.681198,-0.078363,-0.439164,-0.482518,1.265737,0.248980,-1.170680
1392,-0.801811,0.440646,0.094156,-0.571156,1.328228,-0.406265,0.247021,-0.620144,0.458174,0.730794,...,-0.425449,-0.939080,-1.984424,0.910352,1.236781,0.277103,0.081908,1.556707,-1.789887,-1.056365
1393,0.294544,-0.932224,-1.071457,-0.274989,0.031464,0.804085,0.193178,-0.546041,0.237760,-1.041942,...,0.579143,-1.067826,-1.087434,1.337471,0.281982,-0.831723,-0.079577,0.018935,0.903893,0.637011
1394,0.654386,-1.147173,-0.495196,-0.708989,0.742653,-1.084357,0.053336,-0.872428,2.082739,-0.735325,...,0.485766,-0.223262,0.779898,0.909231,-1.510355,0.132273,-0.838965,0.080785,0.107012,-0.076202


In [19]:
# Attach the factor columns to the news frame (rows are matched on the index).
# Any factor columns left over from a previous run of this cell are dropped first:
# a plain join would raise a column-overlap ValueError when the cell is re-run.
df = df.drop(columns=embed_trans.columns, errors='ignore').join(embed_trans)
df

Unnamed: 0,일자,제목,sent,F_01,F_02,F_03,F_04,F_05,F_06,F_07,...,F_91,F_92,F_93,F_94,F_95,F_96,F_97,F_98,F_99,F_100
0,20230501,무역적자 터널 끝 보이나 향후 '반도체 對中 수출' 관건,0.043718,1.074810,-0.675250,1.527125,-0.608116,0.145455,-2.039154,-0.356377,...,0.071425,-1.062580,-0.234850,0.233350,1.137440,-1.181192,-0.541339,-0.253997,-0.725074,0.120757
1,20230501,"반도체 불황 언제까지 삼성전자 SK하이닉스, 하반기에 사활",0.042600,1.860668,-0.107722,1.150962,-0.026327,0.435288,-0.431876,-0.955980,...,0.183401,-0.343881,-0.430444,0.402081,1.463855,-0.260589,0.959016,-0.609474,0.050491,-0.112216
2,20230501,"""돌아온 관광객"" 서울 주요 상권 공실률, 1분기 어떻게 변했나",0.043943,-0.238496,-0.860263,-0.944048,-1.133423,0.011915,-0.880327,0.447691,...,-1.818270,-0.848864,-1.856786,-1.118632,-0.677506,-0.533692,-0.661467,-1.100023,0.090603,1.923676
3,20230501,삼성 ‘엑시노스’ 부활 없으면 반도체도 스마트폰도 위험하다,0.249694,1.107354,0.887596,-0.002669,1.141106,0.413901,0.371563,0.117888,...,0.513452,-0.438574,0.283294,1.199079,-0.752889,-0.861740,0.168321,-1.279075,1.714448,-0.647411
4,20230501,백선엽 장군 장녀 “10대영웅 타임스퀘어 영상 기뻐하셨을 것”,0.753363,-1.759321,0.114746,-1.629050,1.325312,-1.335847,-0.050511,-0.735866,...,-0.981919,-1.331850,0.726990,0.086052,-0.539056,1.264236,-0.683049,-0.190046,-0.722764,-0.132306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1391,20230401,“당신의 부캐는 무엇인가요” 세계 명품업계가 환호하는 청년 일러스트레이터의 열정[정...,0.914584,-1.321677,-0.005508,-0.805321,-0.318684,1.012847,-0.513893,-0.221452,...,-3.093338,0.701582,-0.842988,0.681198,-0.078363,-0.439164,-0.482518,1.265737,0.248980,-1.170680
1392,20230401,드라마에서 가전제품까지 파고든 메타버스,0.313122,-0.801811,0.440646,0.094156,-0.571156,1.328228,-0.406265,0.247021,...,-0.425449,-0.939080,-1.984424,0.910352,1.236781,0.277103,0.081908,1.556707,-1.789887,-1.056365
1393,20230401,"60대 이상 주주가 47%...한전주 3분의 1 토막, 고령화도 한몫?",0.039421,0.294544,-0.932224,-1.071457,-0.274989,0.031464,0.804085,0.193178,...,0.579143,-1.067826,-1.087434,1.337471,0.281982,-0.831723,-0.079577,0.018935,0.903893,0.637011
1394,20230401,집값 최대 40% 떨어진 ‘광교’ 갭투자 다시 꿈틀?,0.062059,0.654386,-1.147173,-0.495196,-0.708989,0.742653,-1.084357,0.053336,...,0.485766,-0.223262,0.779898,0.909231,-1.510355,0.132273,-0.838965,0.080785,0.107012,-0.076202


In [20]:
# Drop the raw headline text before exporting the remaining columns.
# errors='ignore' keeps the cell re-runnable: on a second run '제목' is already
# gone and a strict drop would raise KeyError.
df = df.drop(columns='제목', errors='ignore')
# utf-8-sig writes a BOM; presumably so spreadsheet tools detect the encoding of
# the Korean header -- confirm with the consumer of this file.
df.to_csv('news_vector.csv', encoding='utf-8-sig', index=False)