In [1]:
from konlpy.tag import Okt
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import os

# 명사 추출

In [2]:
# Each row pairs a synopsis file name with the label used for it in the rest
# of the notebook; the two parallel lists below are derived from this table so
# they stay index-aligned.
_genre_table = [
    ('synopsis_art.txt', '예술'),
    ('synopsis_gen.txt', '상업'),
    ('synopsis_SF.txt', 'SF'),
    ('synopsis_family.txt', '가족'),
    ('synopsis_show.txt', '공연'),
    ('synopsis_horror.txt', '공포(호러)'),
    ('synopsis_etc.txt', '기타'),
    ('synopsis_documentary.txt', '다큐멘터리'),
    ('synopsis_drama.txt', '드라마'),
    ('synopsis_romance.txt', '멜로로맨스'),
    ('synopsis_musical.txt', '뮤지컬'),
    ('synopsis_mystery.txt', '미스터리'),
    ('synopsis_crime.txt', '범죄'),
    ('synopsis_historical.txt', '사극'),
    ('synopsis_western.txt', '서부극(웨스턴)'),
    ('synopsis_adult.txt', '성인물(에로)'),
    ('synopsis_thriller.txt', '스릴러'),
    ('synopsis_animation.txt', '애니메이션'),
    ('synopsis_action.txt', '액션'),
    ('synopsis_adventure.txt', '어드벤처'),
    ('synopsis_war.txt', '전쟁'),
    ('synopsis_comedy.txt', '코미디'),
    ('synopsis_fantasy.txt', '판타지'),
]
genre_txt = [file_name for file_name, _ in _genre_table]
genre_name = [label for _, label in _genre_table]

# Sanity check: both lists must have the same length.
print(len(genre_txt))
print(len(genre_name))

23
23


In [None]:
okt = Okt()

# Directory holding the synopsis files. expanduser('~') resolves the home
# directory even when the HOME environment variable is not set, whereas
# os.getenv('HOME') would return None and break the path concatenation.
synopsis_dir = os.path.join(os.path.expanduser('~'), 'aiffel', 'weat')

# Part-of-speech tags to keep; "Adjective", "Verb", etc. could be added here.
keep_tags = {"Noun"}

# Maps each genre label to a list of documents, one per input line, where a
# document is the list of kept tokens from that line.
tokenized_dict = dict()
for file_name, label in zip(genre_txt, genre_name):
    docs = []
    # The synopses are Korean text: read them as UTF-8 explicitly rather than
    # relying on the platform's default encoding.
    with open(os.path.join(synopsis_dir, file_name), 'r', encoding='utf-8') as file:
        for line in file:
            tagged = okt.pos(line, stem=True, norm=True)
            # Only tokens whose tag is in keep_tags are stored.
            docs.append([token for token, tag in tagged if tag in keep_tags])
    tokenized_dict[label] = docs

In [None]:
# Flatten the per-genre document lists into one corpus: a single list of
# token lists, in the insertion order of tokenized_dict.
tokenized = [doc for docs in tokenized_dict.values() for doc in docs]
tokenized[0][:10]

# 임베딩 모델

In [None]:
# Train the Word2Vec embedding on the flattened corpus.
# sg=0 selects CBOW; words seen fewer than min_count times are dropped.
model = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=3,
    sg=0,
)

In [None]:
# Spot-check the embedding: nearest neighbours of '상업' in the trained vectors.
model.wv.most_similar(positive=['상업'])

In [None]:
# Spot-check the embedding: nearest neighbours of '공포' in the trained vectors.
model.wv.most_similar(positive=['공포'])

In [None]:
# Spot-check the embedding: nearest neighbours of '예술' in the trained vectors.
model.wv.most_similar(positive=['예술'])

# 타겟과 속성

In [None]:
# One string per genre: every kept token of every document of that genre,
# joined by single spaces. The tokens are streamed through a generator instead
# of sum(v, []), which re-copies the growing list for every document and is
# quadratic in the number of documents; the resulting strings are identical.
genre = [' '.join(word for doc in v for word in doc) for v in tokenized_dict.values()]
genre[0]

In [None]:
# Number of per-genre strings (one entry per key of tokenized_dict).
len(genre)

In [None]:
# Fit TF-IDF over the per-genre strings: each entry of `genre` is one document.
# NOTE(review): TfidfVectorizer's default token pattern is documented to skip
# single-character tokens - confirm that dropping one-syllable Korean nouns is intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(genre)

In [None]:
# Shape of the TF-IDF matrix returned by fit_transform.
X.shape

In [None]:
# Round-trip check of the vocabulary: look up the column index assigned to
# '영화', then map that same index back to its term. The index is taken from the
# fitted vocabulary instead of being hard-coded, so the check stays valid when
# the corpus or tokenisation changes.
movie_idx = vectorizer.vocabulary_['영화']
print(movie_idx)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on releases that
# predate the new one.
_feature_name_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
print(_feature_name_getter()[movie_idx])

In [None]:
# Number of representative words to collect for each genre.
N_ATTR_WORDS = 15

# Resolve the index -> term table once instead of rebuilding it on every loop
# iteration. get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on older releases.
_feature_name_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = _feature_name_getter()

# One COO-format row of the TF-IDF matrix per genre.
m = [X[i].tocoo() for i in range(X.shape[0])]

# Per genre: [feature index, TF-IDF weight] pairs, highest weight first.
w = [
    sorted(([col, weight] for col, weight in zip(mm.col, mm.data)),
           key=lambda pair: pair[1], reverse=True)
    for mm in m
]

# attributes[i] holds up to N_ATTR_WORDS highest-weighted terms of genre i that
# also have a vector in the Word2Vec model.
attributes = []
for i, ranked in enumerate(w):
    print(genre_name[i], end=': ')
    attr = []
    # Iterating over the ranked pairs (rather than indexing with an unbounded
    # counter) stops cleanly when a genre has fewer than N_ATTR_WORDS
    # in-vocabulary terms instead of raising IndexError.
    for col, _weight in ranked:
        if len(attr) >= N_ATTR_WORDS:
            break
        term = feature_names[col]
        if term in model.wv:
            attr.append(term)
            print(term, end=', ')
    attributes.append(attr)
    print()

# WEAT score 계산 및 시각화

In [None]:
# Square score table with one row and one column per genre_name entry,
# initialised to 0; each row is its own list.
matrix = [[0] * len(genre_name) for _ in genre_name]

In [None]:
def cos_sim(i, j):
    """Return the cosine similarity between the vectors in ``i`` and ``j``.

    Each argument may be a single vector (1-D) or a stack of row vectors
    (2-D). Every vector is scaled to unit length on its own last axis before
    the dot product, so for 2-D inputs entry ``[a, b]`` is the cosine between
    row ``a`` of ``i`` and row ``b`` of ``j``. Dividing by the norm of a whole
    matrix instead would yield one Frobenius scalar and not a cosine per row.

    Returns a scalar for two 1-D inputs, a 1-D array when one input is 1-D,
    and a 2-D array of shape ``(len(i), len(j))`` for two 2-D inputs.
    A zero vector produces NaN entries (division by zero).
    """
    i = np.asarray(i)
    j = np.asarray(j)
    # keepdims keeps the norm broadcastable against both 1-D and 2-D inputs.
    i_unit = i / np.linalg.norm(i, axis=-1, keepdims=True)
    j_unit = j / np.linalg.norm(j, axis=-1, keepdims=True)
    return np.dot(i_unit, j_unit.T)

def s(w, A, B):
    """Return the association of ``w`` with ``A`` relative to ``B``.

    Computes ``cos_sim(w, A)`` and ``cos_sim(w, B)``, averages each over its
    last axis, and returns the difference (A-mean minus B-mean).
    """
    assoc_a = np.mean(cos_sim(w, A), axis=-1)
    assoc_b = np.mean(cos_sim(w, B), axis=-1)
    return assoc_a - assoc_b

def weat_score(X, Y, A, B):
    """Return the WEAT score of target sets ``X``/``Y`` against ``A``/``B``.

    The score is the difference between the mean of ``s(X, A, B)`` and the mean
    of ``s(Y, A, B)``, divided by the standard deviation (``np.std``) of both
    sets of association values concatenated together.
    """
    assoc_x, assoc_y = s(X, A, B), s(Y, A, B)
    pooled = np.concatenate([assoc_x, assoc_y], axis=0)
    return (np.mean(assoc_x) - np.mean(assoc_y)) / np.std(pooled)

In [None]:
# Embed every genre's representative words once, up front, instead of
# rebuilding the same arrays inside the nested loop.
attr_vecs = [np.array([model.wv[word] for word in attr]) for attr in attributes]

# Target sets are the word vectors of the first two entries of `attributes`.
# They get their own names so the TF-IDF matrix `X` is not overwritten and the
# earlier TF-IDF cells remain re-runnable.
target_X, target_Y = attr_vecs[0], attr_vecs[1]

n_genres = len(genre_name)
for i in range(n_genres - 1):
    # Start at i + 1 so each unordered pair is scored once; the diagonal and
    # the lower-left triangle of `matrix` keep their initial 0.
    for j in range(i + 1, n_genres):
        matrix[i][j] = weat_score(target_X, target_Y, attr_vecs[i], attr_vecs[j])

In [None]:
# List the score of every genre pair from the upper triangle of `matrix`.
genre_count = len(genre_name)
for i in range(genre_count - 1):
    j = i + 1
    while j < genre_count:
        print(genre_name[i], genre_name[j], matrix[i][j])
        j += 1

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(0)

# Use a font with Hangul support so the Korean tick labels render.
sns.set(font='NanumGothic')

fig, ax = plt.subplots(figsize=(15, 15))

# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Annotated heatmap of the score table, labelled by genre on both axes.
ax = sns.heatmap(
    matrix,
    xticklabels=genre_name,
    yticklabels=genre_name,
    annot=True,
    cmap='RdYlGn_r',
    ax=ax,
)
ax

공포, 예술, SF 장르가 대부분의 다른 장르와 극명하게 갈리는 형태를 보인다.