In [None]:
!pip install https://github.com/amazon-science/ReFinED/archive/refs/tags/V1.zip
!pip install text-preprocessing
!pip install transformers==4.33.2

Collecting https://github.com/amazon-science/ReFinED/archive/refs/tags/V1.zip
  Downloading https://github.com/amazon-science/ReFinED/archive/refs/tags/V1.zip
[2K     [32m|[0m [32m202.4 kB[0m [31m1.1 MB/s[0m [33m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting Unidecode>=1.1.1 (from ReFinED==1.0)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting ujson (from ReFinED==1.0)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting lmdb>=1.0 (from ReFinED==1.0)
  Downloading lmdb-1.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting boto3 (from ReFinED==1.0)
  Downloading boto3-1.38.45-py3-none-any.whl.metadata (6.6 kB)
Collecting prettyprint (from ReFinED==1.0)
  Downloading prettyprint-0.1.5.tar.gz (2.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting botocore (from ReFinED==1.0)
  Downloading botocore-1.38.4

In [None]:
# Standard library
import json
import math
import os
import re

# Third-party
import nltk
from refined.inference.processor import Refined

# Tokenizer data for NLTK (the recorded output shows tokenizers/punkt_tab.zip being unpacked).
nltk.download('punkt_tab')

# Shared ReFinED model used by the NER helpers further down; the recorded
# output shows it downloading its model files when this cell runs.
refined = Refined.from_pretrained(
    model_name='wikipedia_model_with_numbers',
    entity_set="wikipedia",
)

  torch.utils._pytree._register_pytree_node(
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Downloading /root/.cache/refined/wikipedia_model_with_numbers/model.pt: 100%|██████████| 724M/724M [00:04<00:00, 145MB/s]
Downloading /root/.cache/refined/wikipedia_model_with_numbers/config.json: 100%|██████████| 658/658 [00:00<00:00, 3.87kB/s]
Downloading /root/.cache/refined/wikipedia_model_with_numbers/precomputed_entity_descriptions_emb_wikipedia_6269457-300.np: 100%|██████████| 3.76G/3.76G [00:25<00:00, 146MB/s]
Downloading /root/.cache/refined/roberta-base/pytorch_model.bin: 100%|██████████| 501M/501M [00:04<00:00, 106MB/s]
Downloading /root/.cache/refined/roberta-base/config.json: 100%|██████████| 481/481 [00:00<00:00, 3.31kB/s]
Downloading /root/.cache/refined/roberta-base/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 3.93MB/s]
Downloading /root/.cache/refined/roberta-base/merges.txt: 100%|██████████| 456k/456k [0

In [None]:
#Helper functions

def retrieve_all_verticals(query='mcdonalds', api_key='default'):
    """Fetch the raw API response of every vertical for one query.

    Returns a dict keyed by 'web', 'images', 'news' and 'videos', each holding
    whatever ``fetch_results_json`` returned for that vertical. When
    ``api_key`` is left at the placeholder ``'default'`` nothing is fetched
    and an empty dict is returned.
    """
    if api_key == 'default':
        return {}

    # Requests are issued in this order: web, images, news, videos.
    return {
        vertical: fetch_results_json(query=query, type=vertical, api_key=api_key)
        for vertical in ('web', 'images', 'news', 'videos')
    }


def fetch_results_json(query='mcdonalds', type='web', api_key='default'):
    """Main entry point for calling the search API with parameters.

    Args:
        query: Search string, sent as the ``q`` parameter.
        type: Vertical to query: 'web' (default), 'images', 'news' or 'videos'.
        api_key: Key forwarded to the client as ``api_key``.

    Returns:
        The dict produced by the client's ``get_dict()``.

    Raises:
        KeyError: If ``type`` is neither 'web' nor one of the known verticals.
    """
    vertical_codes = {'images': ('tbm', 'isch'),
                      'news': ('tbm', 'nws'),
                      'videos': ('tbm', 'vid')}

    # Reject an unknown vertical before importing the client or building the
    # request, so the mistake surfaces immediately and with a readable message.
    if type != 'web' and type not in vertical_codes:
        raise KeyError(
            f"unknown vertical {type!r}; expected 'web' or one of {sorted(vertical_codes)}"
        )

    # Imported lazily so the rest of the notebook works without the client installed.
    from serpapi import GoogleSearch as GoogleSearchResults

    params = {
        "engine": "google",
        "q": query,
        'num': 100,
        "api_key": api_key,
        "google_domain": "google.com",
        "hl": "en",
    }

    if type != 'web':  # non-web verticals need their extra request parameter
        param_name, param_value = vertical_codes[type]
        params[param_name] = param_value

    return GoogleSearchResults(params).get_dict()


def process_web(api_response, index=0):
    """Number the organic web results and reduce each one to a small record.

    Results without a 'title' are dropped and do not consume a position, so
    the returned keys run consecutively starting at ``index``. A missing
    'snippet' becomes an empty string.
    """
    records = {}
    position = index

    for result in api_response['organic_results']:
        if 'title' not in result:
            continue

        records[position] = {'title': result['title'],
                             'snippet': result.get('snippet', ""),
                             'url': result['link'],
                             'type': 'web'}
        position += 1

    return records


def process_images(api_response, index=1):
    """Number the image results and reduce each one to a small record.

    Only results carrying both a 'thumbnail' and a 'title' are kept; skipped
    results do not consume a position, so keys run consecutively from ``index``.
    """
    records = {}
    position = index

    for result in api_response['images_results']:
        if not ('thumbnail' in result and 'title' in result):
            continue

        records[position] = {'title': result['title'],
                             'url': result['link'],
                             'thumbnail': result['thumbnail'],
                             'type': 'image'}
        position += 1

    return records


def process_news(api_response, index=1):
    """Number the news results from ``index`` and reduce each one to a small record.

    'title', 'snippet', 'date' and 'link' are required on every result; a
    missing 'thumbnail' becomes an empty string.
    """
    return {
        number: {'title': result['title'],
                 'snippet': result['snippet'],
                 'uploaded': result['date'],
                 'url': result['link'],
                 'thumbnail': result.get('thumbnail', ""),
                 'type': 'news'}
        for number, result in enumerate(api_response['news_results'], start=index)
    }


def process_videos(api_response, index=1):
    """Number the video results from ``index`` and reduce each one to a small record.

    Args:
        api_response: Dict with a 'video_results' list; each result needs
            'title' and 'link'.
        index: Key assigned to the first result.

    Returns:
        dict of position -> record. 'thumbnail' defaults to an empty string,
        'snippet' is always empty, and 'uploaded' is the " - "-joined list at
        ``rich_snippet['top']['extensions']`` (empty when absent).
    """
    records = {}

    for number, result in enumerate(api_response['video_results'], start=index):
        # The extensions are read from rich_snippet['top'], so that is where
        # their presence has to be checked. Every level is optional.
        rich_snippet = result.get('rich_snippet') or {}
        top = rich_snippet.get('top') or {}
        uploaded = " - ".join(top.get('extensions') or [])

        records[number] = {'title': result['title'],
                           'snippet': '',
                           'uploaded': uploaded,
                           'thumbnail': result.get('thumbnail', ""),
                           'url': result['link'],
                           'type': 'video'}
    return records


def get_all_verticals(query='mcdonalds', api_key="default"):
    """Return all verticals of a query merged into one consecutively numbered dict.

    With a real ``api_key`` the results are fetched live through
    ``retrieve_all_verticals``; with the placeholder ``'default'`` they are
    read from the local 'examplePickle' file instead.

    Returns:
        dict of position -> record, in the order web, images, news, videos.
        Each vertical's numbering starts at the number of records already
        collected.
    """
    if api_key != 'default':
        # Developers need to supply their own API key to retrieve search results in real time.
        api = retrieve_all_verticals(query=query, api_key=api_key)
    else:
        import pickle

        # SECURITY: unpickling can execute arbitrary code, so only load an
        # 'examplePickle' file that comes from a trusted source.
        with open('examplePickle', 'rb') as dbfile:
            api = pickle.load(dbfile)

    web = process_web(api['web'])
    images = process_images(api['images'], len(web))
    news = process_news(api['news'], len(images) + len(web))
    videos = process_videos(api['videos'], len(news) + len(images) + len(web))

    return {**web, **images, **news, **videos}

def extract_corpus(snippets):
    """Build a normalised text corpus from result records.

    For each record the title and (if present) snippet are joined with a
    space, anything matching ``<.*?>`` is replaced by a space, and the text is
    passed through the text_preprocessing pipeline listed below.

    Args:
        snippets: Mapping of key -> record with a 'title' and optional 'snippet'.

    Returns:
        dict of the same keys -> preprocessed text.
    """
    import re

    from text_preprocessing import (expand_contraction, normalize_unicode, preprocess_text,
                                    remove_punctuation, remove_special_character, to_lower)

    preprocess_functions = [remove_special_character, normalize_unicode, to_lower,
                            expand_contraction, remove_punctuation]

    # Compiled once and reused for every record.
    html_tag = re.compile('<.*?>')

    corpus = {}
    for key, record in snippets.items():
        # A missing or None snippet contributes nothing.
        text = record['title'] + ' ' + (record.get('snippet') or '')
        text = html_tag.sub(' ', text)
        corpus[key] = preprocess_text(text, preprocess_functions)
    return corpus

def read_json(file_name):
    """Open ``file_name`` for reading and return its parsed JSON content."""
    with open(file_name, 'r') as handle:
        parsed = json.load(handle)
    return parsed
def get_single_list_from_streams(dictionary, filter_option):
    """Concatenate the values of ``dictionary`` whose keys appear in ``filter_option``.

    Values are appended in the dictionary's own iteration order.
    """
    flattened = []
    for key, values in dictionary.items():
        if key in filter_option:
            flattened.extend(values)
    return flattened

def word_count(words):
    """Return a dict mapping each distinct item of ``words`` to its number of occurrences."""
    tally = {}
    for word in words:
        tally[word] = tally.get(word, 0) + 1
    return tally

In [None]:
def query_processor(q):
    """Return the result of ``retrieve_all_verticals(q)``, or None when ``q`` is None."""
    # NOTE(review): no api_key is passed here, and retrieve_all_verticals returns
    # an empty dict for its default key; confirm whether get_all_verticals
    # (which has a local-file fallback) was the intended call.
    return None if q is None else retrieve_all_verticals(q)

In [None]:
def get_ner_query(q, fetch=False, iquery=False):
    """Return the coarse mention type of every span the ``refined`` model finds in ``q``.

    ``fetch`` and ``iquery`` are accepted but not used. Returns None when
    ``q`` is None.
    """
    if q is None:
        return None
    return [span.coarse_mention_type for span in refined.process_text(q)]

In [None]:
def BM25F(query, streams, corpus, selected_facets):
    """Rank the documents of ``streams`` against ``query`` with a BM25F-style score.

    Every stream acts as a weighted field. For a query term t and document d:

        tf(t, d)  = sum over streams s containing d of
                    w_s * count(t in d) / ((1 - b) + b * len(d) / avg_len(s))
        score(d) += tf / (k + tf) * ln(1 + (N - df + 0.5) / (df + 0.5))

    Lengths are measured in whitespace tokens, N is the total number of stream
    entries, and df is the number of documents in the stream being scored that
    contain t.

    Args:
        query: Whitespace-separated query string.
        streams: Mapping of stream name -> list of document ids.
        corpus: Mapping of document id -> text; must cover every id in ``streams``.
        selected_facets: Stream names to boost, or None for no boost.

    Returns:
        dict of document id -> score, ordered by descending score. A document
        listed in several streams keeps the score from the last stream scored.
    """
    from statistics import mean

    alpha = 0.05  # weight of a stream that was not selected
    b_s = 0.75    # strength of the length normalisation
    k = 1.2       # term-frequency saturation

    query_terms = set(query.split())

    # None means "nothing selected": every stream then gets the base weight.
    selected = () if selected_facets is None else selected_facets
    total_categories_selected = len(selected)
    number_of_Cats = len(streams)

    w_s_dict = dict()
    for item in streams:
        if item in selected:
            w_s_dict[item] = (1 - alpha * (number_of_Cats - total_categories_selected)) / total_categories_selected
        else:
            w_s_dict[item] = alpha

    print(w_s_dict)

    # Tokenise each document once instead of once per term and stream.
    doc_tokens = {doc: corpus[doc].split() for docs in streams.values() for doc in docs}
    total_docs = sum(len(docs) for docs in streams.values())

    # Average document length per stream, in tokens, i.e. the same unit as the
    # document length it is compared with. Empty streams have no average.
    avg_stream_len = {name: mean([len(doc_tokens[doc]) for doc in docs])
                      for name, docs in streams.items() if docs}

    scores = dict()
    for stream_name, stream_docs in streams.items():
        doc_freq = {}  # term -> number of docs in this stream containing it
        for doc in stream_docs:
            tokens = doc_tokens[doc]
            document_score = 0
            for term in query_terms.intersection(tokens):
                tf_t = tokens.count(term)

                # Weighted, length-normalised term frequency over every stream
                # that lists this document.
                tf_dt = 0
                for key, inner_stream_docs in streams.items():
                    memberships = sum(1 for inner_doc in inner_stream_docs if inner_doc == doc)
                    if not memberships:
                        continue
                    length_norm = (1 - b_s) + (b_s * len(tokens) / avg_stream_len[key])
                    tf_dt += memberships * w_s_dict[key] * (tf_t / length_norm)

                first_part = tf_dt / (k + tf_dt)

                if term not in doc_freq:
                    doc_freq[term] = sum(1 for other in stream_docs if term in doc_tokens[other])
                dft = doc_freq[term]

                # The logarithm covers the whole ratio; the "+ 1" keeps the IDF
                # positive so a matching document never ranks below a
                # non-matching one.
                second_part = math.log((total_docs - dft + 0.5) / (dft + 0.5) + 1)

                document_score += first_part * second_part

            scores[doc] = document_score
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))


In [None]:
def get_ner_streams(corpus):
    """Run NER over every record and group the findings by entity label.

    Each record's title and snippet are sent together to the ``refined``
    model. Spans whose ``coarse_mention_type`` is one of the labels below are
    recorded, and their text is wrapped in
    ``<span class='<LABEL>1'>...</span>`` in a copy of the record. The input
    ``corpus`` is left untouched, so repeated calls see the same text.

    Args:
        corpus: Mapping of item id -> record with a 'title' and optional 'snippet'.

    Returns:
        dict with:
            'entities': label -> item ids mentioning it (duplicates removed),
            'corpus':   label -> every mention text found (duplicates kept),
            'tags':     label -> number of distinct items,
            'snippets': item id -> copy of the record with tagged title/snippet.
        Labels without any mention are omitted.
    """
    label_names = ('PERSON', 'NORP', 'FAC', 'ORG', 'GPE',
                   'LOC', 'PRODUCT', 'EVENT',
                   'WORK_OF_ART', 'LAW', 'LANGUAGE',
                   'DATE', 'TIME', 'PERCENT', 'MONEY',
                   'QUANTITY', 'ORDINAL', 'CARDINAL')
    labels = {name: [] for name in label_names}
    tokens = {name: [] for name in label_names}
    annotated = {}

    for item, value in corpus.items():
        title, snippet = value['title'], value.get('snippet')
        # Separate title and snippet so their adjoining words are not fused.
        content = ' '.join(part for part in (title, snippet) if part)

        tagged_mentions = set()
        for span in refined.process_text(content):
            predicted_entity = span.coarse_mention_type
            if not predicted_entity or predicted_entity not in labels:
                continue

            labels[predicted_entity].append(item)
            tokens[predicted_entity].append(span.text)

            # str.replace already tags every occurrence, so tagging the same
            # text again would nest one <span> inside another.
            if span.text in tagged_mentions:
                continue
            tagged_mentions.add(span.text)

            # NOTE(review): plain text replacement also hits this mention
            # wherever it is a substring of other text, including inside tags
            # inserted earlier; span offsets would be needed to avoid that.
            span_tag = f"<span class='{predicted_entity}1'>{span.text}</span>"
            title = title.replace(span.text, span_tag)
            if snippet:
                snippet = snippet.replace(span.text, span_tag)

        tagged_record = dict(value)
        tagged_record['title'] = title
        if snippet:
            tagged_record['snippet'] = snippet
        annotated[item] = tagged_record

    # Drop empty labels; remove duplicate item ids while keeping first-seen order.
    labels = {k: list(dict.fromkeys(v)) for k, v in labels.items() if v}
    tokens = {k: v for k, v in tokens.items() if v}
    tags = {k: len(v) for k, v in labels.items()}

    return {'entities': labels, 'corpus': tokens, 'tags': tags, 'snippets': annotated}

In [None]:
# Example run; the names assigned here are reused by the cells below.
query = "new york"
# Search results for the query, as returned by query_processor.
verticals = query_processor(query)
# Preprocessed title + snippet text per result; passed to BM25F as its corpus.
corpus = extract_corpus(verticals)
# Entity label -> ids of the results in which that label was detected.
streams = get_ner_streams(verticals)['entities']
# Entity types detected in the query itself; BM25F gives these streams a higher weight.
selected_ner = get_ner_query(query)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
  with autocast():


In [None]:
# Rank with the same `query` the streams and corpus were built from, rather
# than a repeated literal that can drift out of sync with it.
BM25F(query, streams, corpus, selected_ner)

{'PERSON': 0.05, 'FAC': 0.05, 'ORG': 0.05, 'GPE': 0.05, 'EVENT': 0.05, 'WORK_OF_ART': 0.05, 'DATE': 0.05, 'TIME': 0.05, 'PERCENT': 0.05, 'MONEY': 0.05, 'QUANTITY': 0.05, 'ORDINAL': 0.05, 'CARDINAL': 0.05}


{'248': 0.9017927226664287,
 '37': 0.8221407202631397,
 '34': 0.7424498540574638,
 '267': 0.7110738386341283,
 '213': 0.6352907726694395,
 '291': 0.60618002703658,
 '252': 0.5638652109536743,
 '308': 0.5390909348393513,
 '264': 0.5332244176014695,
 '77': 0.5017786038388805,
 '218': 0.49629724982181783,
 '289': 0.48516175665687933,
 '223': 0.47619820552095415,
 '271': 0.47542966752594606,
 '210': 0.4601364152006929,
 '286': 0.4378837616380324,
 '246': 0.4275568282992011,
 '66': 0.3815677655414986,
 '260': 0.3679349219332009,
 '285': 0.3648446471542303,
 '243': 0.36469581601161294,
 '361': 0.3617135438868837,
 '372': 0.35488711878186296,
 '95': 0.35334785893408105,
 '347': 0.3468596540836479,
 '226': 0.3302342034769607,
 '209': 0.3252036682837171,
 '96': 0.3117481455234107,
 '272': 0.2945804272723488,
 '217': 0.2938133442304879,
 '251': 0.28890808469357415,
 '219': 0.2802499299753867,
 '230': 0.2573586125406725,
 '269': 0.2497155422254924,
 '25': 0.24478579866066263,
 '75': 0.23162099075

In [None]:
def get_micro_facets(ner_labels, ner_corpus):
    """Count, per entity label, how often each mention text occurs.

    Args:
        ner_labels: Kept for interface compatibility; it does not influence
            the result.
        ner_corpus: Mapping of entity label -> list of mention texts (the
            'corpus' entry returned by ``get_ner_streams``).

    Returns:
        dict of entity label -> {mention text: number of occurrences}.
    """
    return {label: word_count(mentions) for label, mentions in ner_corpus.items()}

In [None]:
# The 'corpus' entry maps each entity label to the mention texts found for it.
# NOTE(review): an earlier cell already called get_ner_streams(verticals);
# consider keeping that result and reusing it rather than running the model again.
streams_corpus = get_ner_streams(verticals)['corpus']

In [None]:
# Per entity label, count how often each mention text occurs.
get_micro_facets(streams, streams_corpus)

{'PERSON': {'Andrew Cuomo': 2,
  'Jerome L': 1,
  'Liz Eswein': 2,
  'Cuomo': 8,
  'Cu': 1,
  'Trump': 2,
  'Eric': 1,
  'Adams': 1,
  "Andrew Cuomo's": 1,
  'Victoria': 1,
  'Lulu Garcia-Navarro': 1,
  'Jane Diina': 1,
  'Vazgryna': 1,
  'Jim Dwyer': 2,
  'Bill de \nBlasio': 1,
  'Bill de Blasio': 3,
  'Andy King': 3,
  'Callens': 1,
  'Eboni': 2,
  'K. Williams': 1,
  'Andrew Gounardes': 1,
  'Kristina Sgueglia': 1,
  'Adam Harding': 1,
  'Robert Lee': 1,
  'Stacey Lastoe': 1,
  'Diana Diroy': 1,
  'Channon Hodge': 1,
  'de Blasio': 4,
  'Lehrer': 1,
  'Kent Swig': 1,
  'Maeve Higgins': 1,
  'Douglas Elliman': 1,
  'William Rudin': 1},
 'FAC': {'Guggenheim': 1,
  'the Metropolitan Museum of Art': 1,
  'the Museum of Modern Art': 1,
  'New York City Center': 2,
  'Museum of the City of New York': 4,
  'the One World Observatory': 1,
  'New York Public Library': 2,
  'NYPL': 1,
  'The Metropolitan Museum of Art': 1,
  'Fifth Avenue': 1,
  'Fort Tryon Park': 1,
  'The Roosevelt Hotel': 