In [10]:
import numpy as np
import pandas as pd
import bigframes.pandas as bpd
import bigframes.ml.llm as llm
import bigframes.bigquery as bbq

In [2]:
# Name of the whole-filing embedding column in the EDGAR query table.
# NOTE(review): the search cells below pass the literal 'overall_embedding'
# instead of reading this variable.
col_name = 'overall_embedding'

In [6]:
%%bigquery
-- Search view: each SEC enforcement action joined to its embedding row by id.
-- The embedding vector (ml_generate_embedding_result) is exposed under the
-- column name sec_enforcement_embeddings.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW raises an
-- "already exists" error on every run after the first.
CREATE OR REPLACE VIEW `financial-fraud-detection-1.sec_enforcement_actions.view_sec_search` AS
SELECT
  original.id as id,
  original.title as title,
  original.summary as summary,
  original.entities as entities,
  embeddings.ml_generate_embedding_result as sec_enforcement_embeddings
FROM `financial-fraud-detection-1.sec_enforcement_actions.sec_actions` original
INNER JOIN `financial-fraud-detection-1.sec_enforcement_actions.sec_enforcement_embeddings` embeddings
ON original.id = embeddings.id

Query is running:   0%|          |

In [8]:
%%bigquery results
-- Smoke test of VECTOR_SEARCH: take one EDGAR filing and return the 5 SEC
-- enforcement actions whose embeddings are nearest by cosine distance.
-- NOTE(review): LIMIT 1 has no ORDER BY, so which filing is used is arbitrary.
-- NOTE(review): "mathing" in the aliases looks like a typo for "matching";
-- it is kept as-is so the result column names do not change.
SELECT
  query.cik AS edgar_cik,
  query.year AS edgar_year,
  base.id AS overall_embedding_mathing_sec_id,
  base.title AS overall_embedding_mathing_sec_title
FROM
  VECTOR_SEARCH(
    TABLE `financial-fraud-detection-1.sec_enforcement_actions.sec_enforcement_embeddings`,
    'ml_generate_embedding_result',
    (
      SELECT *
      FROM `financial-fraud-detection-1.EDGAR.edgar_corpus_small_embeddings`
      LIMIT 1
    ),
    'overall_embedding',
    top_k => 5,
    distance_type => 'COSINE'
  )

Query is running:   0%|          |

Downloading:   0%|          |

In [9]:
# Display the rows returned by the VECTOR_SEARCH query in the previous cell.
results

Unnamed: 0,edgar_cik,edgar_year,overall_embedding_mathing_sec_id,overall_embedding_mathing_sec_title
0,315189,1993,652c355b8848e75fdf85651bb0e31432,SEC Charges Delphi Corporation and Nine Indivi...
1,315189,1993,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...
2,315189,1993,b7e65a1605c5eeefc1e9b98e00dc28e1,SEC Charges Diebold and Former Executives With...
3,315189,1993,c09ee5721683dbd0c691e961585f7f74,"SEC Charges ConAgra Foods, Inc. in Financial F..."
4,315189,1993,a145fee70d224f197ae414be27f09beb,SEC Charges Four Public Companies With Longsta...


In [12]:
# Load the EDGAR embeddings table as a BigQuery DataFrame; its rows are the
# query vectors for every vector search below.
search_query = bpd.read_gbq(
    "financial-fraud-detection-1.EDGAR.edgar_corpus_small_embeddings"
)

In [16]:
# For every EDGAR filing, find the 5 SEC enforcement actions nearest to its
# overall embedding by cosine distance. use_brute_force is left at its default.
result_vector = bbq.vector_search(
    base_table="financial-fraud-detection-1.sec_enforcement_actions.sec_enforcement_embeddings",
    column_to_search="ml_generate_embedding_result",
    query=search_query,
    query_column_to_search="overall_embedding",
    top_k=5,
    distance_type="cosine",
)

`db_dtypes` is a preview feature and subject to change.


In [20]:
# SEC action ids matched to filer CIK 315189, for comparison with the ids
# returned by the SQL VECTOR_SEARCH cell.
result_vector.loc[result_vector["cik"] == 315189, "id"]

1    c09ee5721683dbd0c691e961585f7f74
1    652c355b8848e75fdf85651bb0e31432
1    69ef263fb0ce90e8aedc9a456ed0c02d
1    a145fee70d224f197ae414be27f09beb
1    b7e65a1605c5eeefc1e9b98e00dc28e1
Name: id, dtype: string

In [23]:
# Same search as result_vector, but with use_brute_force=True, then ordered by
# SEC action id.
result_vector_2 = bbq.vector_search(
    base_table="financial-fraud-detection-1.sec_enforcement_actions.sec_enforcement_embeddings",
    column_to_search="ml_generate_embedding_result",
    query=search_query,
    query_column_to_search="overall_embedding",
    top_k=5,
    distance_type="cosine",
    use_brute_force=True,
).sort_values("id")

`db_dtypes` is a preview feature and subject to change.


In [25]:
# Inspect the brute-force search result (sorted by SEC action id).
result_vector_2

Unnamed: 0,filename,cik,year,section_7embedding,section_9embedding,section_12embedding,section_10embedding,section_4embedding,section_8embedding,section_14embedding,...,section_9Aembedding,section_1embedding,overall_embedding,ml_generate_embedding_result,ml_generate_embedding_statistics,ml_generate_embedding_status,id,title,content,distance
4,1800_1993.txt,1800,1993,[-0.01756296 -0.01733708 -0.01346771 -0.013578...,[-1.56782791e-02 -6.42858818e-03 -3.48928086e-...,[-4.16508093e-02 -2.62772869e-02 -3.11164446e-...,[-6.22933321e-02 -4.06846032e-02 -2.88069844e-...,[-0.01235421 -0.05187324 -0.01839188 0.048709...,[-0.01758242 -0.04470409 -0.02377128 -0.002298...,[ 3.80702619e-03 -6.57393262e-02 -4.26834710e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-8.46420446e-03 -1.74391275e-02 -3.04021587e-...,[-2.02630794e-02 -2.28220087e-02 -2.03435779e-...,[-2.47707199e-02 -2.66260467e-02 1.42063461e-...,"{""token_count"":426,""truncated"":false}",,5220a1d4d9c02d813c8536464a5be416,Pharmaceutical Company and Former Executives C...,Title: Pharmaceutical Company and Former Execu...,0.306597
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-3.66573147e-02 -1.60664450e-02 1.47968018e-...,"{""token_count"":1539,""truncated"":false}",,55f78bf10042846ac936f1564295e733,Twenty-Six Firms to Pay More Than $390 Million...,Title: Twenty-Six Firms to Pay More Than $390 ...,0.281621
4,1800_1993.txt,1800,1993,[-0.01756296 -0.01733708 -0.01346771 -0.013578...,[-1.56782791e-02 -6.42858818e-03 -3.48928086e-...,[-4.16508093e-02 -2.62772869e-02 -3.11164446e-...,[-6.22933321e-02 -4.06846032e-02 -2.88069844e-...,[-0.01235421 -0.05187324 -0.01839188 0.048709...,[-0.01758242 -0.04470409 -0.02377128 -0.002298...,[ 3.80702619e-03 -6.57393262e-02 -4.26834710e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-8.46420446e-03 -1.74391275e-02 -3.04021587e-...,[-2.02630794e-02 -2.28220087e-02 -2.03435779e-...,[-0.03424182 -0.02437511 -0.02993707 -0.051027...,"{""token_count"":500,""truncated"":false}",,59c38537b26fd1c6ea110bd3b190e588,SEC Charges General Electric and Two Subsidiar...,Title: SEC Charges General Electric and Two Su...,0.317306
1,315189_1993.txt,315189,1993,[-1.35571547e-02 -1.56442206e-02 -2.92028040e-...,[-1.83812398e-02 1.81908242e-03 -2.49630474e-...,[-3.85654159e-02 -1.78559888e-02 -4.98842299e-...,[-5.09253442e-02 -3.28539982e-02 -1.21700037e-...,[-4.47579399e-02 -1.16266981e-02 -4.29858454e-...,[-9.26065768e-05 -5.11534102e-02 -4.29588519e-...,[ 2.28945124e-02 5.21549085e-03 -1.74354800e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[ 2.29252045e-02 2.62432406e-03 -4.36920724e-...,[-1.49562186e-02 -1.20997473e-02 -1.77254046e-...,[-9.61120520e-03 -2.12113801e-02 -1.15588652e-...,"{""token_count"":779,""truncated"":false}",,652c355b8848e75fdf85651bb0e31432,SEC Charges Delphi Corporation and Nine Indivi...,Title: SEC Charges Delphi Corporation and Nine...,0.310323
4,1800_1993.txt,1800,1993,[-0.01756296 -0.01733708 -0.01346771 -0.013578...,[-1.56782791e-02 -6.42858818e-03 -3.48928086e-...,[-4.16508093e-02 -2.62772869e-02 -3.11164446e-...,[-6.22933321e-02 -4.06846032e-02 -2.88069844e-...,[-0.01235421 -0.05187324 -0.01839188 0.048709...,[-0.01758242 -0.04470409 -0.02377128 -0.002298...,[ 3.80702619e-03 -6.57393262e-02 -4.26834710e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-8.46420446e-03 -1.74391275e-02 -3.04021587e-...,[-2.02630794e-02 -2.28220087e-02 -2.03435779e-...,[-9.61120520e-03 -2.12113801e-02 -1.15588652e-...,"{""token_count"":779,""truncated"":false}",,652c355b8848e75fdf85651bb0e31432,SEC Charges Delphi Corporation and Nine Indivi...,Title: SEC Charges Delphi Corporation and Nine...,0.303072
2,50104_1993.txt,50104,1993,[-3.47243063e-02 1.94188505e-02 -8.92389864e-...,[-1.55314188e-02 -2.45964527e-03 -3.88711654e-...,[-3.79250757e-02 -4.47099060e-02 -3.67634818e-...,[-4.57181484e-02 -5.33069260e-02 -1.22106224e-...,[-0.05072239 -0.01314292 -0.03440231 -0.023048...,[-3.04003223e-02 4.79081871e-03 -5.41995295e-...,[ 3.38502717e-03 -5.04764915e-03 -5.59636764e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.03292404 0.00087296 -0.04182 -0.004596...,[-0.02347898 -0.01731962 -0.02790547 0.000921...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.303516
1,315189_1993.txt,315189,1993,[-1.35571547e-02 -1.56442206e-02 -2.92028040e-...,[-1.83812398e-02 1.81908242e-03 -2.49630474e-...,[-3.85654159e-02 -1.78559888e-02 -4.98842299e-...,[-5.09253442e-02 -3.28539982e-02 -1.21700037e-...,[-4.47579399e-02 -1.16266981e-02 -4.29858454e-...,[-9.26065768e-05 -5.11534102e-02 -4.29588519e-...,[ 2.28945124e-02 5.21549085e-03 -1.74354800e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[ 2.29252045e-02 2.62432406e-03 -4.36920724e-...,[-1.49562186e-02 -1.20997473e-02 -1.77254046e-...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.315145
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.255627
4,1800_1993.txt,1800,1993,[-0.01756296 -0.01733708 -0.01346771 -0.013578...,[-1.56782791e-02 -6.42858818e-03 -3.48928086e-...,[-4.16508093e-02 -2.62772869e-02 -3.11164446e-...,[-6.22933321e-02 -4.06846032e-02 -2.88069844e-...,[-0.01235421 -0.05187324 -0.01839188 0.048709...,[-0.01758242 -0.04470409 -0.02377128 -0.002298...,[ 3.80702619e-03 -6.57393262e-02 -4.26834710e-...,...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-8.46420446e-03 -1.74391275e-02 -3.04021587e-...,[-2.02630794e-02 -2.28220087e-02 -2.03435779e-...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.309091
3,1137091_2020.htm,1137091,2020,[-2.10153785e-02 2.59747695e-03 -3.92209039e-...,[-4.25587595e-02 -2.09636036e-02 -6.31605536e-...,[-6.73391223e-02 -1.49382157e-02 -3.27744223e-...,[-3.24487165e-02 -1.60200917e-03 -5.89513499e-...,[-3.31896767e-02 1.45743862e-02 -3.72849405e-...,[-1.28673098e-02 -8.76872710e-03 -2.86414517e-...,[-7.22742230e-02 -2.71525867e-02 -2.11981349e-...,...,[-3.16515304e-02 -1.75493170e-02 -9.58232504e-...,[-5.22894418e-03 -6.85504824e-03 -1.68921131e-...,[-3.72444206e-02 -1.18594729e-02 -2.65914233e-...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.287852


In [28]:
# Move the existing index into a regular column named "index" and give the
# frame a fresh 0..n-1 index.
# NOTE(review): this name says "value" where the source frame says "vector";
# both names are used by later cells.
result_value_2 = result_vector_2.reset_index()

In [31]:
# SEC action ids matched to filer CIK 315189 in the brute-force result.
result_vector_2.loc[result_vector_2["cik"] == 315189, "id"]

1    652c355b8848e75fdf85651bb0e31432
1    69ef263fb0ce90e8aedc9a456ed0c02d
1    a145fee70d224f197ae414be27f09beb
1    b7e65a1605c5eeefc1e9b98e00dc28e1
1    c09ee5721683dbd0c691e961585f7f74
Name: id, dtype: string

In [32]:
# Check row count, column names and dtypes after reset_index().
result_value_2.info()

<class 'bigframes.dataframe.DataFrame'>
Index: 25 entries, 0 to 24
Data columns (total 32 columns):
  #  Column                            Non-Null Count    Dtype
---  --------------------------------  ----------------  -----------------------------------------
  0  index                             25 non-null       Int64
  1  filename                          25 non-null       string
  2  cik                               25 non-null       Int64
  3  year                              25 non-null       Int64
  4  section_7embedding                25 non-null       list<item: double>[pyarrow]
  5  section_9embedding                25 non-null       list<item: double>[pyarrow]
  6  section_12embedding               25 non-null       list<item: double>[pyarrow]
  7  section_10embedding               25 non-null       list<item: double>[pyarrow]
  8  section_4embedding                25 non-null       list<item: double>[pyarrow]
  9  section_8embedding                25 non-null       lis

In [34]:
# View matches grouped by filer; both keys are sorted descending, so within a
# filer the largest cosine distance comes first.
result_vector_2.sort_values(
    by=["cik", "distance"],
    ascending=False,
)

Unnamed: 0,filename,cik,year,section_7embedding,section_9embedding,section_12embedding,section_10embedding,section_4embedding,section_8embedding,section_14embedding,...,section_9Aembedding,section_1embedding,overall_embedding,ml_generate_embedding_result,ml_generate_embedding_statistics,ml_generate_embedding_status,id,title,content,distance
3,1137091_2020.htm,1137091,2020,[-2.10153785e-02 2.59747695e-03 -3.92209039e-...,[-4.25587595e-02 -2.09636036e-02 -6.31605536e-...,[-6.73391223e-02 -1.49382157e-02 -3.27744223e-...,[-3.24487165e-02 -1.60200917e-03 -5.89513499e-...,[-3.31896767e-02 1.45743862e-02 -3.72849405e-...,[-1.28673098e-02 -8.76872710e-03 -2.86414517e-...,[-7.22742230e-02 -2.71525867e-02 -2.11981349e-...,...,[-3.16515304e-02 -1.75493170e-02 -9.58232504e-...,[-5.22894418e-03 -6.85504824e-03 -1.68921131e-...,[-3.72444206e-02 -1.18594729e-02 -2.65914233e-...,[-4.30587344e-02 -6.94525093e-02 -3.08916345e-...,"{""token_count"":551,""truncated"":false}",,80210595c2e576cad4ef1b38b1c76f42,SEC Charges New Jersey Software Company and Se...,Title: SEC Charges New Jersey Software Company...,0.29471
3,1137091_2020.htm,1137091,2020,[-2.10153785e-02 2.59747695e-03 -3.92209039e-...,[-4.25587595e-02 -2.09636036e-02 -6.31605536e-...,[-6.73391223e-02 -1.49382157e-02 -3.27744223e-...,[-3.24487165e-02 -1.60200917e-03 -5.89513499e-...,[-3.31896767e-02 1.45743862e-02 -3.72849405e-...,[-1.28673098e-02 -8.76872710e-03 -2.86414517e-...,[-7.22742230e-02 -2.71525867e-02 -2.11981349e-...,...,[-3.16515304e-02 -1.75493170e-02 -9.58232504e-...,[-5.22894418e-03 -6.85504824e-03 -1.68921131e-...,[-3.72444206e-02 -1.18594729e-02 -2.65914233e-...,[-4.66634147e-02 -1.80950365e-03 -1.84153859e-...,"{""token_count"":441,""truncated"":false}",,7fe4e612554df988fb6d4c4bf949b5bd,"SEC Charges Companies, Former Executives as Pa...","Title: SEC Charges Companies, Former Executive...",0.294016
3,1137091_2020.htm,1137091,2020,[-2.10153785e-02 2.59747695e-03 -3.92209039e-...,[-4.25587595e-02 -2.09636036e-02 -6.31605536e-...,[-6.73391223e-02 -1.49382157e-02 -3.27744223e-...,[-3.24487165e-02 -1.60200917e-03 -5.89513499e-...,[-3.31896767e-02 1.45743862e-02 -3.72849405e-...,[-1.28673098e-02 -8.76872710e-03 -2.86414517e-...,[-7.22742230e-02 -2.71525867e-02 -2.11981349e-...,...,[-3.16515304e-02 -1.75493170e-02 -9.58232504e-...,[-5.22894418e-03 -6.85504824e-03 -1.68921131e-...,[-3.72444206e-02 -1.18594729e-02 -2.65914233e-...,[-0.05589295 -0.01434082 0.00880999 -0.026128...,"{""token_count"":776,""truncated"":false}",,d7a2d99d60d8f1c5678866b973670c23,SEC Charges Corporate Insiders for Failing to ...,Title: SEC Charges Corporate Insiders for Fail...,0.29265
3,1137091_2020.htm,1137091,2020,[-2.10153785e-02 2.59747695e-03 -3.92209039e-...,[-4.25587595e-02 -2.09636036e-02 -6.31605536e-...,[-6.73391223e-02 -1.49382157e-02 -3.27744223e-...,[-3.24487165e-02 -1.60200917e-03 -5.89513499e-...,[-3.31896767e-02 1.45743862e-02 -3.72849405e-...,[-1.28673098e-02 -8.76872710e-03 -2.86414517e-...,[-7.22742230e-02 -2.71525867e-02 -2.11981349e-...,...,[-3.16515304e-02 -1.75493170e-02 -9.58232504e-...,[-5.22894418e-03 -6.85504824e-03 -1.68921131e-...,[-3.72444206e-02 -1.18594729e-02 -2.65914233e-...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.287852
3,1137091_2020.htm,1137091,2020,[-2.10153785e-02 2.59747695e-03 -3.92209039e-...,[-4.25587595e-02 -2.09636036e-02 -6.31605536e-...,[-6.73391223e-02 -1.49382157e-02 -3.27744223e-...,[-3.24487165e-02 -1.60200917e-03 -5.89513499e-...,[-3.31896767e-02 1.45743862e-02 -3.72849405e-...,[-1.28673098e-02 -8.76872710e-03 -2.86414517e-...,[-7.22742230e-02 -2.71525867e-02 -2.11981349e-...,...,[-3.16515304e-02 -1.75493170e-02 -9.58232504e-...,[-5.22894418e-03 -6.85504824e-03 -1.68921131e-...,[-3.72444206e-02 -1.18594729e-02 -2.65914233e-...,[-4.65844162e-02 -2.42230538e-02 -3.48388292e-...,"{""token_count"":509,""truncated"":false}",,a145fee70d224f197ae414be27f09beb,SEC Charges Four Public Companies With Longsta...,Title: SEC Charges Four Public Companies With ...,0.282714
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-3.66573147e-02 -1.60664450e-02 1.47968018e-...,"{""token_count"":1539,""truncated"":false}",,55f78bf10042846ac936f1564295e733,Twenty-Six Firms to Pay More Than $390 Million...,Title: Twenty-Six Firms to Pay More Than $390 ...,0.281621
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-1.16345128e-02 -4.30568047e-02 -1.48458006e-...,"{""token_count"":1393,""truncated"":false}",,7af28a48e0a7aa4f22df07c1c42283bd,SEC Charges 69 Audit Firms and Partners for Is...,Title: SEC Charges 69 Audit Firms and Partners...,0.280855
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-4.66634147e-02 -1.80950365e-03 -1.84153859e-...,"{""token_count"":441,""truncated"":false}",,7fe4e612554df988fb6d4c4bf949b5bd,"SEC Charges Companies, Former Executives as Pa...","Title: SEC Charges Companies, Former Executive...",0.276745
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-3.26581709e-02 -2.29606368e-02 -2.05970611e-...,"{""token_count"":1117,""truncated"":false}",,bffd0ae5fdfb5d316a2dc778d415b446,SEC Levies More Than $3.8 Million in Penalties...,Title: SEC Levies More Than $3.8 Million in Pe...,0.265691
0,1067983_2020.htm,1067983,2020,[-3.98591966e-04 -6.93545857e-03 -3.90583078e-...,[-3.78713012e-02 -2.25618221e-02 -6.61137030e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,[-0.0344297 -0.01074299 -0.04601453 0.000932...,[-1.76449773e-02 -9.34779196e-03 -3.27466425e-...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,...,[ 1.63675216e-03 -2.70394441e-02 -3.73632014e-...,[ 7.57767455e-03 -5.32740511e-03 -1.38319638e-...,[-1.39428654e-02 -7.94268673e-03 -2.75115496e-...,[-4.25696969e-02 -6.16127253e-02 -6.23959641e-...,"{""token_count"":1269,""truncated"":false}",,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,Title: SEC Announces Charges Against Corporate...,0.255627


In [37]:
# Keep only the identifying columns and the distance, ordered by filer and
# then distance (both descending).
rv2 = (
    result_value_2[["cik", "year", "id", "title", "distance"]]
    .sort_values(by=["cik", "distance"], ascending=False)
)

In [39]:
# Prefix the columns with their source: edgar_* for the query filing, sec_*
# for the matched enforcement action.
rv2.columns = [
    "edgar_cik",
    "edgar_year",
    "sec_id",
    "sec_title",
    "distance",
]

In [40]:
# Preview the trimmed, renamed result set.
rv2

Unnamed: 0,edgar_cik,edgar_year,sec_id,sec_title,distance
13,1137091,2020,80210595c2e576cad4ef1b38b1c76f42,SEC Charges New Jersey Software Company and Se...,0.29471
12,1137091,2020,7fe4e612554df988fb6d4c4bf949b5bd,"SEC Charges Companies, Former Executives as Pa...",0.294016
22,1137091,2020,d7a2d99d60d8f1c5678866b973670c23,SEC Charges Corporate Insiders for Failing to ...,0.29265
9,1137091,2020,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,0.287852
14,1137091,2020,a145fee70d224f197ae414be27f09beb,SEC Charges Four Public Companies With Longsta...,0.282714
1,1067983,2020,55f78bf10042846ac936f1564295e733,Twenty-Six Firms to Pay More Than $390 Million...,0.281621
10,1067983,2020,7af28a48e0a7aa4f22df07c1c42283bd,SEC Charges 69 Audit Firms and Partners for Is...,0.280855
11,1067983,2020,7fe4e612554df988fb6d4c4bf949b5bd,"SEC Charges Companies, Former Executives as Pa...",0.276745
19,1067983,2020,bffd0ae5fdfb5d316a2dc778d415b446,SEC Levies More Than $3.8 Million in Penalties...,0.265691
7,1067983,2020,69ef263fb0ce90e8aedc9a456ed0c02d,SEC Announces Charges Against Corporate Inside...,0.255627


In [41]:
# Check column dtypes and non-null counts of the trimmed result set.
rv2.info()

<class 'bigframes.dataframe.DataFrame'>
Index: 25 entries, 13 to 4
Data columns (total 5 columns):
  #  Column      Non-Null Count    Dtype
---  ----------  ----------------  -------
  0  edgar_cik   25 non-null       Int64
  1  edgar_year  25 non-null       Int64
  2  sec_id      25 non-null       string
  3  sec_title   25 non-null       string
  4  distance    25 non-null       Float64
dtypes: Float64(1), Int64(2), string(2)
memory usage: 1200 bytes


In [42]:
# Write the overall-embedding matches to BigQuery, overwriting any earlier run.
# NOTE(review): confirm whether the sort order applied to rv2 survives the
# write; readers of the table may need their own ORDER BY.
rv2.to_gbq(
    "financial-fraud-detection-1.SEC_EDGAR_Vector_Search.overall",
    if_exists="replace",
)

'financial-fraud-detection-1.SEC_EDGAR_Vector_Search.overall'

In [43]:
# List the query table's columns to see which embedding columns are available.
search_query.columns

Index(['filename', 'cik', 'year', 'section_7embedding', 'section_9embedding',
       'section_12embedding', 'section_10embedding', 'section_4embedding',
       'section_8embedding', 'section_14embedding', 'section_1Aembedding',
       'section_11embedding', 'section_15embedding', 'section_5embedding',
       'section_13embedding', 'section_7Aembedding', 'section_1Bembedding',
       'section_9Bembedding', 'section_2embedding', 'section_6embedding',
       'section_3embedding', 'section_9Aembedding', 'section_1embedding',
       'overall_embedding'],
      dtype='object')

In [None]:
# Exclude 'overall': its search results were already written to the `overall` table above.

In [46]:
# Strip the text "embedding" from every embedding column name to get a set of
# prefixes, then drop the 'overall_' prefix.
separate_embeddings = set(
    column.replace('embedding', '')
    for column in search_query.columns
    if 'embedding' in column
).difference({'overall_'})

In [47]:
# Show the section prefixes left after removing 'overall_'.
separate_embeddings

{'section_1',
 'section_10',
 'section_11',
 'section_12',
 'section_13',
 'section_14',
 'section_15',
 'section_1A',
 'section_1B',
 'section_2',
 'section_3',
 'section_4',
 'section_5',
 'section_6',
 'section_7',
 'section_7A',
 'section_8',
 'section_9',
 'section_9A',
 'section_9B'}

In [48]:
# Count the section prefixes.
len(separate_embeddings)

20

In [66]:
import pyarrow as pa


In [86]:
# Run one brute-force vector search per EDGAR section embedding and write each
# result set to its own table, SEC_EDGAR_Vector_Search.<section prefix>.

# Download the query table once, outside the loop. The zero-vector mask and the
# rows it filters are then taken from the same local frame, so they cannot
# disagree on row order.
search_query_pd = search_query.to_pandas()

# sorted() gives a reproducible processing order (set order is arbitrary).
# The loop variable is deliberately not named col_name, so the notebook-level
# variable of that name keeps its value.
for section_prefix in sorted(separate_embeddings):

    query_column_to_search_value = section_prefix + "embedding"
    table_name = "financial-fraud-detection-1.SEC_EDGAR_Vector_Search." + section_prefix

    # Keep only filings that have a real embedding for this section. A vector
    # is treated as missing only when every component is 0; a sum-based test
    # would also discard valid vectors whose components cancel out.
    is_not_zero_vector = [
        bool(np.any(np.asarray(vec) != 0))
        for vec in search_query_pd[query_column_to_search_value]
    ]
    search_query_filtered = search_query_pd.loc[is_not_zero_vector]

    result_for_col = bbq.vector_search(
        base_table="financial-fraud-detection-1.sec_enforcement_actions.sec_enforcement_embeddings",
        column_to_search="ml_generate_embedding_result",
        query=search_query_filtered,
        query_column_to_search=query_column_to_search_value,
        top_k=5,
        distance_type="cosine",
        use_brute_force=True)

    # Same shaping as the overall result: reset the index, keep the identifying
    # columns plus distance, sort by filer then distance (both descending),
    # and rename the columns with edgar_/sec_ prefixes.
    result_for_col = result_for_col.reset_index()
    result_for_col = result_for_col[['cik', 'year', 'id', 'title', 'distance']].sort_values(by=['cik', 'distance'], ascending=False)
    result_for_col.columns = ['edgar_cik', 'edgar_year', 'sec_id', 'sec_title', 'distance']

    # Overwrite the section's table so re-running the cell is safe.
    result_for_col.to_gbq(table_name, if_exists="replace")

`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


`db_dtypes` is a preview feature and subject to change.


In [None]:
# Done: the loop above has written one result table per section to the SEC_EDGAR_Vector_Search dataset.