In [1]:
from gensim.models import Word2Vec
from konlpy.tag import Okt
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the question table; the first CSV column becomes the DataFrame index.
questions_csv = "./datasets/questions.csv"
df = pd.read_csv(questions_csv, index_col=0)
print(df.shape)

(13683, 12)


In [6]:
# Strip special characters, English letters, etc.; the cleaned text is then tokenized with Okt

def preprocess_txt(text):
    """Replace every run of disallowed characters in ``text`` with one space.

    Allowed characters are the space, Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ), Hangul
    syllables (가-힣) and the plus sign. Note that the ``+`` inside the
    character class is a literal, so plus signs survive the cleaning.

    Args:
        text: The raw string to clean. ``None`` and float NaN (e.g. an empty
            CSV cell loaded by pandas) are treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is missing.

    Raises:
        TypeError: If ``text`` is any other non-string value (raised by
            ``re.sub``).
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing an extra import just for the missing-value guard.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return re.sub(r'[^ ㄱ-ㅎㅏ-ㅣ가-힣+]+', ' ', text)

# Single Okt instance reused for every row.
okt = Okt()

# Cleaned copy of the passage text, stored alongside the original column.
df['cleaned'] = df['지문텍스트'].apply(preprocess_txt)

# One token list per row (via okt.morphs), in the same order as df.
corpus = [okt.morphs(cleaned_text) for cleaned_text in df['cleaned']]

In [7]:
# Train word embeddings on the tokenized corpus.
embedding_model = Word2Vec(
    corpus,
    vector_size=20,  # dimensionality of each word vector
    window=5,        # maximum distance between a word and its context words
    min_count=1,     # keep every token, even those that occur only once
    workers=4,       # number of training threads
    sg=1,            # 1 selects skip-gram (0 would be CBOW)
)

In [8]:
# Compute the mean word vector for each question
def get_document_vectors(corpus, model=None):
    """Return one averaged word vector per document in ``corpus``.

    Each document vector is the arithmetic mean of the vectors of its tokens
    that are present in the model vocabulary. A document with no such token
    (for example an empty token list) gets an all-zero vector instead of being
    skipped, so the result always has exactly one entry per input document and
    stays aligned with whatever labels the caller attaches to it.

    Args:
        corpus: Iterable of documents, each an iterable of token strings.
        model: Object exposing a ``wv`` attribute that supports
            ``token in wv``, ``wv.get_vector(token)`` and ``wv.vector_size``
            (e.g. a trained gensim ``Word2Vec``). Defaults to the
            module-level ``embedding_model``.

    Returns:
        A list with one list of floats per document.
    """
    if model is None:
        model = embedding_model
    keyed_vectors = model.wv

    document_embedding_list = []
    for line in corpus:
        # Membership test instead of probing get_vector(): get_vector raises
        # KeyError for unknown tokens, and testing the vector's truthiness
        # would wrongly skip a known token whose vector happens to be zero.
        vectors = [keyed_vectors.get_vector(word)
                   for word in line if word in keyed_vectors]

        if not vectors:
            # Placeholder keeps row i of the output tied to document i.
            document_embedding_list.append([0.0] * keyed_vectors.vector_size)
            continue

        total = vectors[0]
        for vec in vectors[1:]:
            # Out-of-place add: never mutate the model's stored vectors.
            total = total + vec
        document_embedding_list.append(list(total / len(vectors)))

    return document_embedding_list

In [9]:
# Embed every tokenized question, then preview the first three vectors.
document_embedding_list = get_document_vectors(corpus)
first_three_vectors = document_embedding_list[:3]
print(first_three_vectors)

[[0.15250134, 0.48501867, 0.53118765, -0.63673425, 0.12304843, 0.39727548, 0.29402885, 0.5896947, -0.31393582, 0.2641875, 0.098143496, 0.29152814, 0.11789917, -0.23578873, 0.78615296, -0.44704774, 1.6770614, -0.04557995, -1.2998639, -0.10553828], [-0.22006644, 0.18160604, 0.62898237, -0.21163268, -0.10210105, 0.10890714, 0.27807748, 0.5858529, -0.29765993, -0.03212421, 0.3561439, 0.068477884, 0.1401399, -0.33014894, 0.49948144, -0.20789479, 1.0032285, -0.14481983, -0.8614896, -0.15676883], [-0.19697903, 0.23798417, 0.66747224, -0.2528465, 0.029599281, 0.18028478, 0.1386769, 0.72134256, -0.29425624, 0.03037547, 0.13611607, 0.076841414, -0.060993753, -0.46446177, 0.5588821, -0.27269918, 1.2814449, -0.086522445, -0.72309405, -0.0032268497]]


In [14]:
# Pairwise cosine similarity between all document vectors.
cos_sim = cosine_similarity(document_embedding_list, document_embedding_list)

# Label both axes with the question codes so similarities can be looked up by code.
item_codes = df['문항코드'].values
cos_sim_df = pd.DataFrame(cos_sim, index=item_codes, columns=item_codes)
cos_sim_df

Unnamed: 0,2085,2086,2087,2088,7197,7198,7199,7200,7201,7202,...,30075571,30075572,30075574,30075575,30075576,30075577,30075578,30075579,30075580,30075581
2085,1.000000,0.913504,0.929142,0.870158,0.914609,0.991015,0.954176,0.914218,0.986844,0.958590,...,0.843022,0.894328,0.868989,0.871299,0.908550,0.840785,0.832825,0.878632,0.871586,0.927071
2086,0.913504,1.000000,0.962442,0.970976,0.959935,0.921112,0.907926,0.934703,0.917545,0.941876,...,0.930249,0.948177,0.951385,0.938057,0.934299,0.941892,0.945172,0.929335,0.969380,0.974224
2087,0.929142,0.962442,1.000000,0.962623,0.964978,0.935456,0.937908,0.964364,0.929966,0.970215,...,0.912012,0.911173,0.927303,0.905671,0.915486,0.908123,0.925146,0.915421,0.925155,0.956674
2088,0.870158,0.970976,0.962623,1.000000,0.948774,0.885975,0.900680,0.945135,0.888577,0.936149,...,0.940405,0.912500,0.953359,0.936661,0.926062,0.949708,0.959100,0.948245,0.964377,0.963467
7197,0.914609,0.959935,0.964978,0.948774,1.000000,0.911856,0.900019,0.967791,0.905964,0.956114,...,0.908246,0.919654,0.927762,0.933515,0.907672,0.910280,0.906845,0.910563,0.932264,0.946358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30075577,0.840785,0.941892,0.908123,0.949708,0.910280,0.856993,0.846213,0.904361,0.846195,0.863312,...,0.981920,0.926434,0.982680,0.980931,0.949978,1.000000,0.977997,0.951794,0.975179,0.947893
30075578,0.832825,0.945172,0.925146,0.959100,0.906845,0.854987,0.866684,0.907431,0.839026,0.860926,...,0.973210,0.938321,0.977093,0.948383,0.938286,0.977997,1.000000,0.956259,0.980139,0.956985
30075579,0.878632,0.929335,0.915421,0.948245,0.910563,0.878390,0.904534,0.914052,0.870566,0.896722,...,0.955691,0.914174,0.956238,0.954302,0.967491,0.951794,0.956259,1.000000,0.961923,0.956262
30075580,0.871586,0.969380,0.925155,0.964377,0.932264,0.887597,0.891362,0.926920,0.875848,0.894796,...,0.968221,0.959375,0.983750,0.967641,0.952219,0.975179,0.980139,0.961923,1.000000,0.971276


In [15]:
# Distinct (question code, cleaned text) pairs, used to print question text by code.
prob_skill = df[['문항코드', 'cleaned']].drop_duplicates()


def _print_ranked_problems(item_id, ascending, top_n=10):
  """Print the base question and its ``top_n`` neighbours by cosine similarity.

  Shared implementation behind ``get_similar_problem`` and
  ``get_different_problem``; the two differ only in sort direction.

  Args:
    item_id: Question code; must be a column label of ``cos_sim_df``.
    ascending: ``False`` lists the most similar questions first, ``True``
      lists the least similar first.
    top_n: Maximum number of other questions to print.
  """
  ranked = cos_sim_df[item_id].sort_values(ascending=ascending).reset_index()
  ranked.columns = ['문항코드', 'similarity']
  # Drop the query question itself before taking the top rows.
  ranked = ranked[ranked['문항코드'] != item_id][:top_n]

  base_rows = prob_skill[prob_skill['문항코드'] == item_id]
  for base_text in base_rows['cleaned']:
    print("Base Question: ", base_text)

  for rank in range(len(ranked)):
    prob_id = ranked['문항코드'].iloc[rank]
    # .iloc returns the NumPy scalar, which provides .round().
    score = ranked['similarity'].iloc[rank].round(3)
    print("TOP ", rank + 1, "(", score, ") : ", prob_id, "-", end=' ')
    matches = prob_skill[prob_skill['문항코드'] == prob_id]
    for match_text in matches['cleaned']:
      print(match_text, end=' | ')
    print()
  print()


def get_similar_problem(item_id, top_n=10):
  """Print the ``top_n`` questions most similar to ``item_id``.

  Args:
    item_id: Question code; must be a column label of ``cos_sim_df``.
    top_n: Maximum number of questions to list (default 10).
  """
  _print_ranked_problems(item_id, ascending=False, top_n=top_n)


def get_different_problem(item_id, top_n=10):
  """Print the ``top_n`` questions least similar to ``item_id``.

  Args:
    item_id: Question code; must be a column label of ``cos_sim_df``.
    top_n: Maximum number of questions to list (default 10).
  """
  _print_ranked_problems(item_id, ascending=True, top_n=top_n)