In [1]:
# Create the directory that holds cached search results. Unlike a bare
# `mkdir`, this is safe to re-run: an existing directory is not an error.
import os
os.makedirs("relationGraph", exist_ok=True)

In [15]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_trans

Reference: Sentence-Transformers models on the Hugging Face Hub — https://huggingface.co/sentence-transformers

In [20]:
import json

from sentence_transformers import SentenceTransformer, util
# Load the 'multi-qa-MiniLM-L6-cos-v1' model once; the cells and functions
# below all call `model.encode(...)` on this single global instance.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [30]:
# Sanity check: embed a one-word query and a browsing-history-style title,
# then print the dot-product score between the two embeddings.
query_embedding = model.encode('job')
passage_embedding = model.encode('Financial Analyst Job Opportunities - Google Search')

print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Similarity: tensor([[0.3890]])


In [54]:
# Score cutoff: the functions in this notebook treat a dot-score strictly
# greater than this value as a match.
THRESHOLD = 0.3

In [101]:
# Score a second query/passage pair ('course' vs. "financial analyst").
# NOTE(review): the saved output of this cell is a boolean tensor, which
# suggests it was last run with `> THRESHOLD` appended to the final
# expression — confirm and re-run so code and output agree.
query_embedding = model.encode('course')
passage_embedding = model.encode("financial analyst")
util.dot_score(query_embedding, passage_embedding)

tensor([[True]])

In [115]:
def _ids_above_threshold(query_embedding, records, text_field):
  """Return the ``id`` of every record whose text matches the query.

  A record matches when the dot-score between ``query_embedding`` and the
  embedding of ``record[text_field]`` is strictly greater than ``THRESHOLD``.
  """
  matched = []
  for record in records:
    passage_embedding = model.encode(record[text_field])
    # For a single query and a single passage dot_score holds one value;
    # .item() turns it into a plain Python float for the comparison.
    if util.dot_score(query_embedding, passage_embedding).item() > THRESHOLD:
      matched.append(record['id'])
  return matched


def SentenceTransformer_match(person_id, key_word):
  """Find the entries of one persona that are semantically close to a keyword.

  Reads ``./persona{person_id}.json``, scores the keyword against browsing
  history titles, Facebook post contents, schedule addresses and every
  top-level string field, and saves the matches to
  ``./relationGraph/{person_id}_{key_word}.json``.

  Args:
    person_id: Identifier used to build the persona and output file names.
    key_word: Text to search for; it is also embedded in the output file name.

  Returns:
    A dict with the keys ``browsingHistoryList``, ``facebookPostsList`` and
    ``schedule`` (lists of matching record ids) and ``info`` (names of the
    matching top-level string fields).

  Raises:
    FileNotFoundError: If the persona file or the ``relationGraph`` directory
      does not exist.
    KeyError: If the persona file lacks one of the expected keys.
  """
  source_path = f"./persona{person_id}.json"
  # Read as UTF-8 explicitly so non-ASCII post text does not depend on the
  # platform's default encoding.
  with open(source_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)['data']

  query_embedding = model.encode(key_word)
  result = {
      'browsingHistoryList': _ids_above_threshold(
          query_embedding, data['browsingHistoryList'], 'title'),
      'facebookPostsList': _ids_above_threshold(
          query_embedding, data['facebookPostsList'], 'content'),
      'schedule': _ids_above_threshold(
          query_embedding, data['schedule'], 'address'),
      'info': [],
  }

  # Top-level string fields are matched by value and reported by field name.
  for info, value in data.items():
    if isinstance(value, str):
      passage_embedding = model.encode(value)
      if util.dot_score(query_embedding, passage_embedding).item() > THRESHOLD:
        result['info'].append(info)

  file_path = f"./relationGraph/{person_id}_{key_word}.json"
  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(result, json_file)
    print(f"---------------Saved as {person_id}_{key_word}.json------------------")
  return result


In [63]:
import os

def extract_pid_and_keyword(filename):
  """Split a cache file name of the form ``<pid>_<keyword>.json`` into parts.

  The pid is everything before the first underscore; the keyword is the rest
  of the name with only the final extension removed. Keywords that themselves
  contain underscores or dots (``2_ice_cream.json``, ``7_v1.5.json``) are
  therefore returned intact.

  Args:
    filename: Bare file name (no directory part).

  Returns:
    ``(pid, keyword)`` as strings, or ``(None, None)`` when the name has no
    underscore or when either part would be empty.
  """
  stem, _extension = os.path.splitext(filename)
  pid, separator, keyword = stem.partition('_')
  if not separator or not pid or not keyword:
    return None, None
  return pid, keyword

def search_files(directory):
  """List the (pid, keyword) pairs encoded in the ``.json`` names of a folder.

  Every entry of ``directory`` ending in ``.json`` is handed to
  ``extract_pid_and_keyword``; names it cannot parse are left out.

  Returns:
    A list of ``{"pid": ..., "keyword": ...}`` dicts, one per parsed file.
  """
  parsed_names = (
      extract_pid_and_keyword(entry)
      for entry in os.listdir(directory)
      if entry.endswith(".json")
  )
  return [
      {"pid": pid, "keyword": keyword}
      for pid, keyword in parsed_names
      if not (pid is None or keyword is None)
  ]


In [116]:
def search_for_existed_search(person_id, key_word):
  """Find cached searches of one persona whose keyword resembles ``key_word``.

  Scans ``./relationGraph`` for cache files, keeps those whose pid equals
  ``person_id`` and whose keyword has a dot-score strictly greater than
  ``THRESHOLD`` against ``key_word``.

  Args:
    person_id: Integer persona id to filter on.
    key_word: Keyword of the search about to be run.

  Returns:
    A list of ``[pid, keyword]`` pairs (pid as ``int``), possibly empty.
  """
  query_embedding = model.encode(key_word)
  res = []
  for file_info in search_files("./relationGraph"):
    try:
      pid = int(file_info["pid"])
    except ValueError:
      # A .json file whose prefix is not a number is not one of our cache
      # files; skip it instead of aborting the whole lookup.
      continue
    # Check the cheap pid filter first so keywords cached for other personas
    # are never sent through the model.
    if pid != person_id:
      continue
    keyword = file_info["keyword"]
    passage_embedding = model.encode(keyword)
    if util.dot_score(query_embedding, passage_embedding).item() > THRESHOLD:
      res.append([pid, keyword])
  return res


In [117]:
def read_existed_search(person_id, key_word):
  """Load a previously saved search result from ``./relationGraph``.

  Args:
    person_id: Persona id used in the cache file name.
    key_word: Keyword used in the cache file name.

  Returns:
    The parsed JSON content of ``{person_id}_{key_word}.json``.
  """
  cached_path = f"./relationGraph/{person_id}_{key_word}.json"
  with open(cached_path, 'r') as handle:
    return json.load(handle)

In [118]:
def relevant_search(person_id, key_word):
  """Run a keyword search for a persona, offering similar cached results first.

  If similar searches were saved before, they are listed and the user is asked
  to pick one by its 1-based index, or to enter 0 to run a fresh search. The
  prompt repeats until the answer is a whole number in the offered range, so a
  typo no longer raises ``ValueError`` or ``IndexError``.

  Args:
    person_id: Integer persona id.
    key_word: Keyword to search for.

  Returns:
    The cached result the user picked, or the result of a fresh
    ``SentenceTransformer_match`` run.
  """
  existed_search = search_for_existed_search(person_id, key_word)
  print(existed_search)
  if len(existed_search) > 0:
    print('Similar searches found:')
    for i, s in enumerate(existed_search):
      print(f" {i+1}: pid: {s[0]}, keyword: {s[1]}")
    print('Do you want to view an existed search instead?')
    while True:
      raw_answer = input("If no, type 0; else, type index")
      try:
        user_input = int(raw_answer)
      except ValueError:
        user_input = -1  # forces the range check below to fail
      if 0 <= user_input <= len(existed_search):
        break
      print(f"Please enter a whole number between 0 and {len(existed_search)}.")
    if user_input > 0:
      return read_existed_search(person_id, existed_search[user_input-1][1])
  return SentenceTransformer_match(person_id, key_word)


In [107]:
# Example: search 'dinner' for persona 2. When similar cached searches exist
# the function prompts for a choice before deciding whether to recompute.
relevant_search(2, 'dinner')

[[2, 'lunch'], [2, 'eat'], [2, 'dinner']]
Similar searches found:
 1: pid: 2, keyword: lunch
 2: pid: 2, keyword: eat
 3: pid: 2, keyword: dinner
Do you want to view an existed search instead?
If no, type 0; else, type index2
{'browsingHistoryList': [3, 6, 13, 14, 56, 125, 126, 127, 248, 249, 304, 306, 307, 317, 346, 354, 359, 397, 407, 417], 'facebookPostsList': ['2', '3', '6', '7', '8', '10', '11', '14', '15', '16', '19', '21', '22', '25', '26', '28', '29'], 'schedule': [1063], 'info': []}


In [121]:
# Example: the same keyword for persona 1; answering 0 at the prompt runs a
# fresh search and saves its result.
relevant_search(1, 'dinner')

[[1, 'dinner'], [1, 'eat']]
Similar searches found:
 1: pid: 1, keyword: dinner
 2: pid: 1, keyword: eat
Do you want to view an existed search instead?
If no, type 0; else, type index0
---------------Saved as 1_dinner.json------------------


{'browsingHistoryList': [16,
  63,
  100,
  105,
  107,
  116,
  166,
  171,
  183,
  186,
  196,
  197,
  205,
  209,
  256,
  260,
  265,
  266,
  305,
  306,
  311,
  314,
  333,
  338,
  360],
 'facebookPostsList': ['2',
  '3',
  '4',
  '6',
  '8',
  '10',
  '11',
  '16',
  '17',
  '19',
  '22'],
 'schedule': [1001,
  1004,
  1006,
  1012,
  1015,
  1017,
  1023,
  1026,
  1029,
  1033,
  1036,
  1039,
  1044,
  1047,
  1050,
  1054,
  1058,
  1063,
  1067],
 'info': ['industry']}

In [120]:
def decode_dic(dic, person_id):
  """Print the readable text behind every id stored in a search result.

  Reads ``./persona{person_id}.json`` and, for each browsing-history entry,
  Facebook post and schedule item whose ``id`` is listed in ``dic``, prints
  ``[section, id]`` followed by the entry's title, content or address. Matching
  top-level string fields are printed last as ``[field_name] value``.

  Args:
    dic: Search result with the keys ``browsingHistoryList``,
      ``facebookPostsList``, ``schedule`` and ``info``.
    person_id: Persona id used to locate the persona file.

  Returns:
    None. The output goes to stdout.
  """
  file_path = f"./persona{person_id}.json"
  # Read as UTF-8 explicitly so non-ASCII text does not depend on the
  # platform's default encoding.
  with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)['data']

  # (section name, field that holds the text to display)
  sections = (
      ('browsingHistoryList', 'title'),
      ('facebookPostsList', 'content'),
      ('schedule', 'address'),
  )
  for section, text_field in sections:
    # Set lookup avoids rescanning the id list for every persona entry;
    # assumes ids are hashable (ints/strings in the saved results).
    wanted_ids = set(dic[section])
    for entry in data[section]:
      if entry['id'] in wanted_ids:
        print([section, entry['id']], entry[text_field])

  wanted_info = set(dic['info'])
  for info, value in data.items():
    if isinstance(value, str) and info in wanted_info:
      print([info], value)

In [119]:
# Load the cached 'lunch' search for persona 2 and print the text of every
# matched entry so the ids can be checked by eye.
dic = read_existed_search(2, 'lunch')
decode_dic(dic, 2)

['browsingHistoryList', 3] Healthy Breakfast Ideas - EatingWell
['browsingHistoryList', 4] Morning Commute Tips - The Balance Careers
['browsingHistoryList', 6] Healthy Lunch Ideas - BBC Good Food
['browsingHistoryList', 13] Healthy Breakfast Ideas - Google Search
['browsingHistoryList', 14] 30+ Healthy Breakfast Ideas for a Balanced Morning
['browsingHistoryList', 56] Healthy Breakfast Recipes - 432 Birch St, Los Angeles, CA 90005
['browsingHistoryList', 125] Breakfast Recipes - Google Search
['browsingHistoryList', 126] Healthy Breakfast Ideas - Google Search
['browsingHistoryList', 127] Breakfast Restaurants Near Me - Google Search
['browsingHistoryList', 201] Breakfast and News Reading - 1200 S Central Ave, Los Angeles, CA 90021
['browsingHistoryList', 206] Breakfast and News Reading - 1200 S Central Ave, Los Angeles, CA 90021
['browsingHistoryList', 248] Quick and Healthy Breakfast Ideas - EatingWell
['browsingHistoryList', 249] Easy Breakfast Recipes - Food Network
['browsingHist