In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import FastText, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## load data

In [2]:
# Load the pickled post table into `df`.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('zebal_version2.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [3]:
# Remove posts from the sign-up greeting board, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept in a new 'index' column.
keep_mask = df['board'] != '가입 인사 (등업 필수)'
df = df[keep_mask].reset_index()

In [4]:
# One space-joined string per post, in the same order as the rows of df.
contents = [' '.join(tokens) for tokens in df['morphs']]

## TF-IDF vector

In [5]:
# TfidfVectorizer lowercases by default and its default token pattern does not
# treat symbols as tokens, so both are overridden here: keep the case as-is and
# treat every whitespace-separated chunk as a token.
tfidf = TfidfVectorizer(
    lowercase=False,
    token_pattern=r'\S+',
    min_df=20,
)

In [6]:
# Fit the vectorizer and materialise the document-term matrix as a plain
# ndarray. toarray() is used instead of todense(), which returns the
# discouraged np.matrix type; the values are identical.
# NOTE: this holds a dense (documents x terms) float matrix in memory.
vect = tfidf.fit_transform(contents).toarray()

In [7]:
# Column labels for the document-term matrix. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer its replacement
# when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

dtm = pd.DataFrame(vect, columns=feature_names)
print(dtm.shape) # documents x terms
dtm.head()

(28569, 9843)


Unnamed: 0,!,!!,!!!,!!!!,!!!!!,!!!!!!,!!!!!!!,!!!(,!!!),!!!<,...,힘내다,힘드다,힘들다,힘쓰다,힘없다,힘차다,힙합,힝,﻿,＃
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 지금까지 완성된 data

In [8]:
# Load the previously saved embedding table into `embed`.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('all_embbeding_df_v2.pickle', 'rb') as pickle_file:
    embed = pickle.load(pickle_file)

In [9]:
# Attach each post's token list to the TF-IDF frame (aligned on the row index).
# NOTE(review): 'morphs' now lives in the same column namespace as the
# vocabulary terms, so dtm['morphs'] is no longer a numeric weight column.
dtm['morphs'] = df['morphs']

In [10]:
# Preview the first rows of dtm, including the 'morphs' column.
dtm.head()

Unnamed: 0,!,!!,!!!,!!!!,!!!!!,!!!!!!,!!!!!!!,!!!(,!!!),!!!<,...,힘드다,힘들다,힘쓰다,힘없다,힘차다,힙합,힝,﻿,＃,morphs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[오프라인, 톡방, 개시, ', 한, 강의, 온도, 시즌, 1, ', 지역, 제한,..."
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[, 오픈, 톡방, ], TEA, &, MBTI, STORY, 주제, :, 차, ..."
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[우리, 카페, 통계, 몇, 가지, <, 2월, >, 우리, 카페, 의, 통계, 정..."
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[몇, 달동, 안이, 나, 미루다, 오다, 당부, 의, 말씀, 을, 회원, 님, 들..."
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Estp, 여자, 이미지, 어떤, 가요, ?, 참고, 로, 에니어그램, 은, 8,..."


In [13]:
# Load the saved Word2Vec model from disk.
w2v = Word2Vec.load("word2vec_cbow.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [14]:
# Token list of the post labelled 4.
df.loc[4, 'morphs']

['Estp',
 '여자',
 '이미지',
 '어떤',
 '가요',
 '?',
 '참고',
 '로',
 '에니어그램',
 '은',
 '8',
 'w',
 '7',
 '이다',
 '~~']

In [15]:
# Word2Vec embedding of the token '여자'.
w2v.wv['여자']

array([-1.30976513e-01, -1.66101411e-01, -5.20755537e-02, -4.41010594e-01,
        6.58510029e-01, -8.80238533e-01,  1.29653560e-03, -4.40452946e-03,
       -3.32174599e-01,  2.04511248e-02,  1.42542839e-01, -4.21399504e-01,
       -1.30315542e-01,  1.83752790e-01, -1.46023571e+00, -3.37984562e-02,
       -1.20631540e+00, -6.61046505e-01, -6.73100293e-01, -8.79625440e-01,
       -3.87979388e-01,  1.28995970e-01,  4.92022812e-01, -1.27542242e-01,
       -1.09600508e+00,  5.59089519e-03, -8.24525118e-01, -1.61638349e-01,
        1.24012291e-01,  1.17009175e+00,  2.97523886e-01, -8.85237232e-02,
       -3.81901592e-01,  3.45201641e-01,  2.22476602e-01,  6.66045189e-01,
       -1.30145156e+00, -8.83954585e-01,  3.81915383e-02, -6.44417822e-01,
        1.08689344e+00,  1.22526502e-02,  1.15788198e+00, -2.22774632e-02,
       -6.02097869e-01, -1.31189322e+00, -2.63060570e-01,  4.74034071e-01,
       -4.97671038e-01, -5.25662959e-01, -1.38611913e-01,  2.91912168e-01,
        4.85915869e-01,  

In [16]:
# TF-IDF weight of the token '여자' in the post labelled 4.
dtm.at[4, '여자']

0.23501880082506796

In [17]:
# Embedding of '여자' scaled by its TF-IDF weight in the post labelled 4.
dtm['여자'][4] * w2v.wv['여자']

array([-3.07819434e-02, -3.90369557e-02, -1.22387344e-02, -1.03645779e-01,
        1.54762238e-01, -2.06872612e-01,  3.04710236e-04, -1.03514723e-03,
       -7.80672804e-02,  4.80639888e-03,  3.35002467e-02, -9.90368053e-02,
       -3.06266025e-02,  4.31853607e-02, -3.43182862e-01, -7.94327259e-03,
       -2.83506811e-01, -1.55358359e-01, -1.58191219e-01, -2.06728518e-01,
       -9.11824554e-02,  3.03164795e-02,  1.15634613e-01, -2.99748257e-02,
       -2.57581800e-01,  1.31396553e-03, -1.93778902e-01, -3.79880518e-02,
        2.91452203e-02,  2.74993569e-01,  6.99237064e-02, -2.08047405e-02,
       -8.97540525e-02,  8.11288804e-02,  5.22861853e-02,  1.56533137e-01,
       -3.05865586e-01, -2.07745954e-01,  8.97572935e-03, -1.51450306e-01,
        2.55440384e-01,  2.87960330e-03,  2.72124052e-01, -5.23562264e-03,
       -1.41504318e-01, -3.08319569e-01, -6.18241802e-02,  1.11406922e-01,
       -1.16962053e-01, -1.23540677e-01, -3.25764045e-02,  6.86048493e-02,
        1.14199370e-01,  

In [19]:
# TF-IDF-weighted sum of Word2Vec vectors: one row per post.
#
# The weight is read from the numeric matrix `vect` through the vectorizer's
# vocabulary rather than from dtm, so the non-numeric 'morphs' column added to
# dtm cannot be picked up by accident, and each lookup is a plain array index.
#
# Every *occurrence* of a token adds its weighted vector, so a token repeated
# k times in a post contributes k times (on top of the term frequency that is
# already part of its TF-IDF weight).
w2v_vectors = w2v.wv
temp = np.zeros((len(df), w2v_vectors.vector_size))
cnt = 0  # token occurrences skipped: no TF-IDF column or no embedding
for i, tokens in enumerate(df['morphs']):
    for token in tokens:
        column = tfidf.vocabulary_.get(token)
        if column is None:
            # Term was pruned by the vectorizer (e.g. min_df), so it has no weight.
            cnt += 1
            continue
        try:
            temp[i] += w2v_vectors[token] * vect[i, column]
        except KeyError:
            # Token has no vector in the embedding model.
            cnt += 1

In [21]:
# Show only the shape (posts x embedding size) instead of dumping the array;
# this matches the recorded output of the cell.
temp.shape

(28569, 300)

In [23]:
# Persist the TF-IDF-weighted Word2Vec document vectors.
with open('tfidf_weighted_w2v_cbow2.pickle', 'wb') as out_file:
    pickle.dump(temp, out_file)

In [22]:
# Load the saved FastText model from disk.
ft = FastText.load("FastText_cbow.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [24]:
# TF-IDF-weighted sum of FastText vectors: one row per post.
#
# The weight is read from the numeric matrix `vect` through the vectorizer's
# vocabulary rather than from dtm, so the non-numeric 'morphs' column added to
# dtm cannot be picked up by accident, and each lookup is a plain array index.
#
# Every *occurrence* of a token adds its weighted vector, so a token repeated
# k times in a post contributes k times (on top of the term frequency that is
# already part of its TF-IDF weight).
ft_vectors = ft.wv
temp2 = np.zeros((len(df), ft_vectors.vector_size))
cnt = 0  # token occurrences skipped: no TF-IDF column or no embedding
for i, tokens in enumerate(df['morphs']):
    for token in tokens:
        column = tfidf.vocabulary_.get(token)
        if column is None:
            # Term was pruned by the vectorizer (e.g. min_df), so it has no weight.
            cnt += 1
            continue
        try:
            temp2[i] += ft_vectors[token] * vect[i, column]
        except KeyError:
            # The embedding model could not produce a vector for this token.
            cnt += 1

In [27]:
# Number of token occurrences that were skipped by the most recently run weighting loop.
cnt

271797

In [26]:
# Shape of the FastText document vectors: (posts, embedding size).
temp2.shape

(28569, 300)

In [28]:
# Persist the TF-IDF-weighted FastText document vectors.
with open('tfidf_weighted_ft_cbow.pickle', 'wb') as out_file:
    pickle.dump(temp2, out_file)

In [21]:
def tfidf_weighted(a):
    """Accumulate the ``dtm`` entries of the tokens in ``a`` into one vector.

    Parameters
    ----------
    a : iterable of str
        Tokens of a single post.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(300,)``. Tokens whose contribution cannot be added
        (for example because ``dtm`` has no such column) are skipped.

    Notes
    -----
    NOTE(review): this looks unfinished. ``dtm[token]`` is a whole column (one
    value per post), not the weight of the token in one specific post, and the
    embedding factor is still commented out. Adding such a column to a
    length-300 vector presumably fails, in which case every token is skipped
    and the result is all zeros -- confirm the intended formula before using.
    """
    temp = np.zeros(300)
    cnt = 0  # skipped tokens; only needed by the commented-out normalisation below
    for token in a:
        try:
            #temp += w2v.wv[token] *
            temp += dtm[token]
        except Exception:
            # Best-effort skip, but no longer swallows KeyboardInterrupt/SystemExit.
            cnt += 1
    return temp #/(len(a)-cnt)

In [44]:
# Persist the 'tfidf_weighted_w2v_cbow' column of dtm.
# NOTE(review): no cell shown in this notebook creates that column; it
# presumably came from a cell that has since been removed -- confirm before re-running.
with open('tfidf_weighted_w2v_cbow.pickle', 'wb') as out_file:
    pickle.dump(dtm['tfidf_weighted_w2v_cbow'], out_file)

## 다른 사람 코드 참고

In [45]:
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

In [46]:
# Build the gensim vocabulary from the token lists, then prune it with the
# no_below=20 / no_above=0.2 document-frequency bounds and close the id gaps.
docs_dict = Dictionary(documents=df['morphs'])
docs_dict.filter_extremes(
    no_below=20,
    no_above=0.2,
)
docs_dict.compactify()

In [50]:
# Bag-of-words for every post, a TF-IDF model fitted on it, and the
# TF-IDF-transformed corpus.
docs_corpus = list(map(docs_dict.doc2bow, df['morphs']))
model_tfidf = TfidfModel(corpus=docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]

In [53]:
# Dense (posts x dictionary terms) matrix of gensim TF-IDF weights.
n_terms = len(docs_dict)
dense_rows = [sparse2full(doc_weights, n_terms) for doc_weights in docs_tfidf]
docs_vecs = np.vstack(dense_rows)

In [64]:
# One Word2Vec vector per dictionary term, stacked in term-id order so the rows
# line up with the columns of docs_vecs.
term_vectors = [w2v.wv[docs_dict[term_id]] for term_id in range(len(docs_dict))]
tfidf_emb_vecs = np.vstack(term_vectors)

In [76]:
# Shape of the dense TF-IDF matrix: (posts, dictionary terms).
docs_vecs.shape

(28569, 9798)

In [77]:
# Shape of the term embedding matrix: (dictionary terms, embedding size).
tfidf_emb_vecs.shape

(9798, 300)

In [65]:
# Document embeddings: TF-IDF weights (posts x terms) times term vectors (terms x dims).
docs_emb = docs_vecs @ tfidf_emb_vecs

In [66]:
# Shape of the document embeddings: (posts, embedding size).
docs_emb.shape

(28569, 300)

In [74]:
# Persist the matrix-product version of the TF-IDF-weighted document embeddings.
with open('tfidf_weighted_w2v_cbow_v2.pickle', 'wb') as out_file:
    pickle.dump(docs_emb, out_file)