In [None]:
import pandas as pd
import csv
import json
import re
from opencc import OpenCC
from ckiptagger import WS, POS, NER, construct_dictionary
import gensim
from gensim.models.word2vec import Word2Vec

In [None]:
# Load the review table. Its 'review_tokenization' column is read in a later
# cell to build the Word2Vec training corpus.
review_tokenization_df = pd.read_csv('review_tokenization.csv')
# Bare expression so the notebook renders the frame as this cell's output.
review_tokenization_df

In [None]:
# Load the store table that query() ranks. query() reads the columns
# link, id, name, time, address, phone, img, website, star, review_num and
# labels (a stringified dict of label -> score).
store_labels_df = pd.read_csv('store_labels.csv')
# Bare expression so the notebook renders the frame as this cell's output.
store_labels_df

In [None]:
# Read the keyword table. After the header row, every CSV row is one group of
# keywords; empty cells (padding for shorter rows) are dropped.
labels_list = []  # one list of keywords per CSV row, in file order
labels_dict = {}  # keyword -> weight 1; fed to construct_dictionary in a later cell
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for files passed to csv.reader.
with open("cafe_keyword_v2.csv", 'r', encoding='utf-8-sig', newline='') as file:
    csvreader = csv.reader(file)
    # Skip the header row; the default avoids a bare StopIteration on an empty file.
    headings = next(csvreader, None)
    for row in csvreader:
        row_filter = [t for t in row if t != '']
        labels_list.append(row_filter)
        labels_dict.update(dict.fromkeys(row_filter, 1))
print(labels_list)

In [None]:
# Load synonym groups: one group per line, words separated by single spaces.
# Each line is converted with OpenCC's 's2tw' configuration before splitting.
synonyms_list = []
cc = OpenCC('s2tw')
# The context manager closes the file (the original handle was never closed).
# NOTE(review): the explicit encoding assumes sim_words.txt is UTF-8; without it
# decoding depends on the platform locale. Confirm against the actual file.
with open("sim_words.txt", "r", encoding="utf-8") as f:
    for x in f:
        x_con = cc.convert(x)
        synonyms_list.append(x_con.replace('\n', '').split(' '))

In [None]:
# CKIP word segmenter, constructed from the 'data' path.
ws = WS(r'data')
# Segmenter dictionary built from the keyword -> weight mapping; query() passes
# it to ws() as coerce_dictionary.
ckip_dict = construct_dictionary(labels_dict)

# Each cell of 'review_tokenization' holds a token list stored as text. Trim the
# two leading and two trailing characters, drop every single quote, and split on
# ", " to recover the tokens.
# NOTE(review): this parsing breaks for tokens that themselves contain a quote
# or ", " -- confirm the data never does.
ckip_results = review_tokenization_df['review_tokenization']
corpus = [
    tokens_repr[2:-2].replace("'", "").split(", ")
    for tokens_repr in ckip_results
]
# Train Word2Vec on the tokenized reviews with gensim's default settings.
model = Word2Vec(corpus)

In [None]:
def query(q):
    """Rank stores against a free-text query and return the best matches.

    Steps:
      1. Remove every character that is not a CJK ideograph (U+4E00-U+9FA5),
         an ASCII letter or a digit, then segment the remainder with the
         module-level CKIP segmenter ``ws`` (using ``ckip_dict``).
      2. Weight the tokens. When the query mixes known keywords (tokens found
         in ``labels_list``) with other tokens, the keywords share 0.8 of the
         weight and the others share 0.2; otherwise all tokens weigh the same.
      3. Score every keyword group by the weighted Word2Vec similarity between
         the in-vocabulary query tokens and the group's first in-vocabulary
         keyword. The three best groups, named by their first keyword, become
         the query labels.
      4. Score every store as ``star / 5 * 0.2`` plus 0.6 times its score for
         the best query label plus 0.1 times its score for each other label.

    Reads the module-level names ``ws``, ``ckip_dict``, ``labels_list``,
    ``model`` and ``store_labels_df``. Prints 'Your query is not in model!'
    when no label could be scored; the ranking then rests on ``star`` alone.
    An empty or all-punctuation query is handled the same way instead of
    raising ZeroDivisionError.

    Args:
        q: The query string.

    Returns:
        dict: ``{"query_label": [...], "stores": [...]}``. ``query_label``
        holds up to three labels, best first. ``stores`` holds up to ten
        dicts, best first, with the keys link, id, name, time, address, phone,
        img, website, star, review_num (copied from ``store_labels_df``) and
        labels (the store's three highest-scoring labels).
    """
    # Share of the token weight given to known keywords vs. everything else.
    label_share, other_share = 0.8, 0.2
    # Contributions to a store's score.
    rating_weight, best_label_weight, other_label_weight = 0.2, 0.6, 0.1

    # The original pattern had '^' between the ranges, which made carets part
    # of the allowed set; only the leading '^' (negation) is intended.
    cleaned = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9]", "", q)
    segmented = ws([cleaned], coerce_dictionary=ckip_dict)
    tokens = list(segmented[0]) if len(segmented) > 0 else []

    # --- token weights -----------------------------------------------------
    known_keywords = {label for labels in labels_list for label in labels}
    is_keyword = [token in known_keywords for token in tokens]
    n_tokens = len(tokens)
    n_keyword = sum(is_keyword)
    n_other = n_tokens - n_keyword
    if n_keyword and n_other:
        weights = [
            label_share / n_keyword if flag else other_share / n_other
            for flag in is_keyword
        ]
    else:
        # Uniform weights; an empty token list yields no weights at all.
        weights = [1 / n_tokens] * n_tokens if n_tokens else []

    # --- keyword-group similarity -------------------------------------------
    # key_to_index is a dict, so membership tests are O(1) rather than a scan
    # of the index_to_key list.
    vocab = model.wv.key_to_index
    sims = {}
    for labels in labels_list:
        # Only the first keyword of the group that the model knows is compared.
        anchor = next((label for label in labels if label in vocab), None)
        if anchor is None:
            continue
        for token, weight in zip(tokens, weights):
            if token in vocab:
                contribution = model.wv.similarity(token, anchor) * weight
                sims[labels[0]] = sims.get(labels[0], 0) + contribution
    if len(sims) == 0:
        print('Your query is not in model!')

    top_labels = sorted(sims, key=sims.get, reverse=True)[0:3]
    results_dict = {"query_label": top_labels, "stores": []}

    # --- store scores ---------------------------------------------------------
    star_col = store_labels_df['star']
    labels_col = store_labels_df['labels']
    parsed_labels = {}

    def labels_for(pos):
        """Return the label -> score dict of the store at row position pos."""
        if pos not in parsed_labels:
            # The column stores a dict with single quotes; swap them for double
            # quotes so it parses as JSON. Parsed once per store and cached.
            parsed_labels[pos] = json.loads(labels_col.iloc[pos].replace("'", '"'))
        return parsed_labels[pos]

    scores = {}
    for i in range(len(store_labels_df)):
        total = star_col.iloc[i] / 5 * rating_weight
        if top_labels:
            label_scores = labels_for(i)
            for rank, label in enumerate(top_labels):
                if label in label_scores:
                    factor = best_label_weight if rank == 0 else other_label_weight
                    total += label_scores[label] * factor
        scores[i] = total

    # --- assemble the ten best stores ----------------------------------------
    fields = ("link", "id", "name", "time", "address", "phone", "img",
              "website", "star", "review_num")
    for i in sorted(scores, key=scores.get, reverse=True)[0:10]:
        row = store_labels_df.iloc[i]
        result = {field: row[field] for field in fields}
        label_scores = labels_for(i)
        result['labels'] = sorted(label_scores, key=label_scores.get, reverse=True)[0:3]
        results_dict["stores"].append(result)

    return results_dict

In [None]:
# Try the pipeline on a sample Chinese query (a word for "cat").
results = query('貓咪')
# Walk the stores that were actually returned: query() yields at most ten and
# fewer when the store table is small, so indexing 0..9 could raise IndexError.
for store in results['stores']:
    print(store['name'])
print(results)