In [28]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from sqlalchemy import create_engine


In [21]:
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook. The fallbacks reproduce the original
# local-development values, so behaviour is unchanged when nothing is set.
# NOTE(review): a password containing URL-reserved characters (e.g. '@', '/')
# would need percent-encoding before being placed in the URL.
DB_USER = os.environ.get("ALAITP_DB_USER", "root")
DB_PASSWORD = os.environ.get("ALAITP_DB_PASSWORD", "rootroot")
DB_HOST = os.environ.get("ALAITP_DB_HOST", "127.0.0.1")
DB_PORT = os.environ.get("ALAITP_DB_PORT", "3306")
DB_NAME = os.environ.get("ALAITP_DB_NAME", "alaitp")

# Despite the name, `conn` is the SQLAlchemy Engine returned by create_engine;
# pandas accepts it directly as the `con` argument.
conn = create_engine(
    f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}?charset=utf8",
    pool_recycle=3600,
)

In [22]:
def select_raw_words() -> pd.DataFrame:
    """Fetch every row of the ``keywords_en_core_web_lg`` table.

    The query is executed through the notebook-level ``conn`` engine.

    Returns:
        A DataFrame holding the full result set of the query.
    """
    query = """
                    SELECT * FROM keywords_en_core_web_lg
                """
    raw_words = pd.read_sql_query(sql=query, con=conn)
    return raw_words

def select_job_model_words() -> pd.DataFrame:
    """Fetch every row of the ``keywords_job_model`` table.

    The query is executed through the notebook-level ``conn`` engine.

    Returns:
        A DataFrame holding the full result set of the query.
    """
    query = """
                    SELECT * FROM keywords_job_model
                """
    job_model_words = pd.read_sql_query(sql=query, con=conn)
    return job_model_words

In [23]:
# Load both keyword tables into memory once; the cells below work on these
# DataFrames and never query the database again.
df_raw = select_raw_words()
df_job = select_job_model_words()

In [24]:
# Preview the job-model keyword table (the notebook renders a truncated view).
df_job

Unnamed: 0,keyword_name,job_id,created_time,count,source,keyword_type
0,testing,1c629bae5c6dc3f48e10196b19c151186502b30775b199...,2019-12-22 19:33:05,1,all,SOFTWARE_ENGINEERING
1,3-4+ years of development experience,1c629bae5c6dc3f48e10196b19c151186502b30775b199...,2019-12-22 19:33:05,1,all,WORK_EXPERIENCE
2,2+ years of experience,1c629bae5c6dc3f48e10196b19c151186502b30775b199...,2019-12-22 19:33:05,1,all,WORK_EXPERIENCE
3,communication skills,1c629bae5c6dc3f48e10196b19c151186502b30775b199...,2019-12-22 19:33:05,1,all,SOFT_SKILL
4,Agile,1c629bae5c6dc3f48e10196b19c151186502b30775b199...,2019-12-22 19:33:05,1,all,APPROACH
...,...,...,...,...,...,...
28764,problem-solving skills,2923a7d6b1cf9df5993253768bb05ada370ce5e55d1772...,2019-12-22 19:41:58,1,all,GENERAL
28765,product managers,2923a7d6b1cf9df5993253768bb05ada370ce5e55d1772...,2019-12-22 19:41:58,1,all,POSITION
28766,constructive feedback,2923a7d6b1cf9df5993253768bb05ada370ce5e55d1772...,2019-12-22 19:41:58,1,all,SOFT_SKILL
28767,high-quality code,2923a7d6b1cf9df5993253768bb05ada370ce5e55d1772...,2019-12-22 19:41:58,1,all,QUALITY


In [25]:
# Build the keyword vocabulary and the two lookup tables used to address the
# co-occurrence matrix: keyword -> position and position -> keyword.
unique_tag = list(df_raw.keyword_name.unique())
# Every entry of unique_tag is distinct, so an entry's index is simply its
# position; range() avoids the quadratic cost of calling list.index() per entry.
tag_idx = list(range(len(unique_tag)))
tag_dict = dict(zip(unique_tag, tag_idx))
idx_to_tag = dict(zip(tag_idx, unique_tag))


In [32]:
# Square keyword-by-keyword co-occurrence matrix, indexed through tag_dict.
entity_entity_matrix = np.zeros((len(unique_tag), len(unique_tag)), np.float64)

# Group the keyword rows by the job they belong to:
# job_id -> [(keyword_name, count, keyword_type), ...]
keyword_dict = defaultdict(list)
for record in df_raw.itertuples():
    keyword_dict[record.job_id].append(
        (record.keyword_name, record.count, record.keyword_type)
    )

# For every job, add 1 to the cell of each ordered pair of its keyword rows.
# Each row is also paired with itself, so the diagonal is incremented as well.
for job_keywords in keyword_dict.values():
    positions = [tag_dict[name] for name, _, _ in job_keywords]
    for r in positions:
        for c in positions:
            entity_entity_matrix[r, c] += 1

In [35]:
def get_most_related_words(word: str, n: int, include_self: bool = True) -> dict:
    """Return the keywords that co-occur most often with ``word``.

    Args:
        word: the query keyword; it must be a key of ``tag_dict``.
        n: maximum number of entries to return.
        include_self: when True (the default, matching the original
            behaviour) the query word itself may appear in the result with
            its own diagonal count; when False it is left out.

    Returns:
        A dict mapping keyword -> co-occurrence count taken from the word's
        row of ``entity_entity_matrix``, inserted from the highest count to
        the lowest. The dict is empty when ``word`` is unknown or ``n`` is
        not positive.
    """
    word_index = tag_dict.get(word)
    # A negative n would otherwise slice as [0:-k] and return almost the whole
    # vocabulary, so treat any non-positive n as "nothing requested".
    if word_index is None or n <= 0:
        return {}

    counts = entity_entity_matrix[word_index]
    # argsort is ascending; reverse it to rank from the highest count down.
    ranked = np.argsort(counts)[::-1]
    if not include_self:
        # Drop the query word before truncating so up to n others are returned.
        ranked = ranked[ranked != word_index]
    return {idx_to_tag[idx]: counts[idx] for idx in ranked[:n]}

In [81]:
get_most_related_words('GraphQL', 20)

{'GraphQL': 65.0,
 'first': 26.0,
 'Apollo': 18.0,
 'Node.js': 18.0,
 'UI': 18.0,
 'every day': 17.0,
 'CSS': 17.0,
 'Amazon': 17.0,
 'Shopify': 17.0,
 'JavaScript': 17.0,
 'Argentina': 17.0,
 'hundreds': 17.0,
 'UX': 17.0,
 'New York': 17.0,
 '401k': 15.0,
 'daily': 15.0,
 'US': 15.0,
 'Android': 15.0,
 'U.S.': 14.0,
 'QA': 14.0}