In [1]:
pip install --upgrade polars[plot]==0.20.31

Collecting polars[plot]==0.20.31
  Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hvplot>=0.9.1 (from polars[plot]==0.20.31)
  Downloading hvplot-0.10.0-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.4/155.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars, hvplot
  Attempting uninstall: polars
    Found existing installation: polars 0.20.2
    Uninstalling polars-0.20.2:
      Successfully uninstalled polars-0.20.2
Successfully installed hvplot-0.10.0 polars-0.20.31


In [2]:
# --- BigQuery configuration ---
# Source table of embedded customer reviews, queried as `DATASET_ID.TABLE_NAME`.
DATASET_ID = "data_beans_curated_local"
TABLE_NAME = "customer_review_embedded_laurenz"

# Project and location handed to the BigQuery client and to BigQuery DataFrames.
PROJECT_ID = "bt-int-wod-masterclass-c3f3"
REGION = "EU"

# NOTE(review): not referenced by any visible cell; confirm it is still needed.
CONNECTION_NAME = "vertex-ai"

In [28]:
import bigframes.pandas as bpd
from google.cloud import bigquery

import polars as pl

from bokeh.io import show, save
import hvplot
import hvplot.polars

In [4]:
# BigQuery client bound to the configured project and location.
client = bigquery.Client(
    project=PROJECT_ID,
    location=REGION,
)

In [5]:
# Read every row of the embedded customer-review table.
query = f"SELECT * FROM `{DATASET_ID}.{TABLE_NAME}` r"
query_job = client.query(query)  # sends the query to BigQuery

# Fetch the job result as an Arrow table and hand it to Polars.
customer_review_embedded_raw = pl.from_arrow(
    query_job.result().to_arrow()
)


In [6]:
# Preview the raw query result (the rendered table is truncated by Polars).
customer_review_embedded_raw

rowid,text_embedding,statistics,ml_embed_text_status,content
i64,list[f64],str,str,str
7461,"[0.048845, -0.020206, … 0.054143]","""{""token_count"":52,""truncated"":…","""""","""I'm a coffee fanatic and I don…"
7549,"[0.047678, -0.034145, … 0.035929]","""{""token_count"":55,""truncated"":…","""""","""I always get my coffee from he…"
7559,"[0.004317, -0.034291, … 0.067955]","""{""token_count"":55,""truncated"":…","""""","""I had never been to a location…"
7560,"[0.04532, -0.018647, … 0.070964]","""{""token_count"":55,""truncated"":…","""""","""This place has a great selecti…"
7603,"[0.053228, -0.023868, … 0.047716]","""{""token_count"":56,""truncated"":…","""""","""I stopped by this coffee truck…"
…,…,…,…,…
7492,"[0.041095, -0.065384, … 0.047765]","""{""token_count"":53,""truncated"":…","""""","""I think it is overpriced, but …"
7512,"[0.029933, -0.011228, … 0.050344]","""{""token_count"":53,""truncated"":…","""""","""The staff was friendly and hel…"
7500,"[0.025881, -0.038813, … 0.057246]","""{""token_count"":53,""truncated"":…","""""","""The staff were very friendly a…"
7475,"[0.04194, -0.012867, … 0.056525]","""{""token_count"":53,""truncated"":…","""""","""I had a great experience at th…"


In [7]:
# Expand the `text_embedding` list column into one column per vector element
# (`embedding_0` ... `embedding_767`), built with a generator expression.
# The list is first cast to a fixed-size Array, as in the original code. The size is
# passed positionally because the `width=` keyword is deprecated as of Polars 0.20.31
# (renamed to `shape`) and emits a DeprecationWarning on every use.
EMBEDDING_DIM = 768  # number of elements extracted from each `text_embedding`
embedding_as_array = pl.col("text_embedding").cast(pl.Array(pl.Float64, EMBEDDING_DIM))
customer_review_embedded = customer_review_embedded_raw.with_columns(
    embedding_as_array.arr.get(i).alias(f"embedding_{i}") for i in range(EMBEDDING_DIM)
)

  (pl.col("text_embedding").cast(pl.Array(pl.Float64, width=768)).arr.get(i).alias(f"embedding_{i}") for i in range(768))


In [8]:
# Preview the frame with the added `embedding_*` columns.
customer_review_embedded

rowid,text_embedding,statistics,ml_embed_text_status,content,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,…,embedding_731,embedding_732,embedding_733,embedding_734,embedding_735,embedding_736,embedding_737,embedding_738,embedding_739,embedding_740,embedding_741,embedding_742,embedding_743,embedding_744,embedding_745,embedding_746,embedding_747,embedding_748,embedding_749,embedding_750,embedding_751,embedding_752,embedding_753,embedding_754,embedding_755,embedding_756,embedding_757,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
i64,list[f64],str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
7461,"[0.048845, -0.020206, … 0.054143]","""{""token_count"":52,""truncated"":…","""""","""I'm a coffee fanatic and I don…",0.048845,-0.020206,-0.018818,-0.047474,0.038713,0.045697,-0.017239,-0.042988,-0.006626,0.029441,0.000612,0.004637,0.010675,-0.004681,-0.012631,0.020612,-0.011457,0.000392,-0.009434,-0.058769,0.000176,-0.002464,0.04422,-0.021724,0.006449,-0.054143,0.037962,-0.005147,-0.06251,0.011067,-0.086287,0.066934,…,0.003121,-0.048528,0.014404,-0.048726,0.091999,-0.019359,0.009935,-0.01678,-0.070496,-0.016119,-0.049654,0.074149,0.081228,0.067097,0.009673,0.023677,-0.016921,-0.02818,-0.040767,-0.016931,-0.015579,-0.045161,0.084795,0.023857,0.018486,0.009689,0.009537,0.024732,0.032967,-0.041192,0.04709,-0.039201,0.049214,0.04284,0.027583,0.008554,0.054143
7549,"[0.047678, -0.034145, … 0.035929]","""{""token_count"":55,""truncated"":…","""""","""I always get my coffee from he…",0.047678,-0.034145,-0.025936,-0.054205,0.044886,0.051056,-0.0296,-0.052977,0.005275,-0.011844,-0.015338,0.001637,-0.004763,-0.023152,-0.041484,-0.006018,-0.027571,-0.007105,-0.031696,-0.053676,-0.021392,0.01215,0.073769,-0.019532,0.008011,-0.047853,-0.007672,-0.013809,-0.040439,-0.012788,-0.068001,0.040584,…,0.001261,-0.025588,0.03546,-0.080545,0.087602,-0.046749,0.018271,-0.045936,-0.074644,-0.045385,-0.02517,0.062332,0.052306,0.074276,-0.024718,0.03709,-0.001617,-0.02995,-0.000615,0.008106,0.0111,-0.020192,0.023027,0.006353,0.022236,-0.00609,0.028793,0.027707,0.027476,-0.026624,0.065381,-0.041634,0.018789,0.019633,0.002239,-0.02994,0.035929
7559,"[0.004317, -0.034291, … 0.067955]","""{""token_count"":55,""truncated"":…","""""","""I had never been to a location…",0.004317,-0.034291,-0.020192,-0.006772,0.033597,0.090561,-0.002594,-0.035049,0.007243,-0.023208,-0.039742,0.003135,0.028551,-0.036963,-0.05121,0.002579,-0.033508,0.010342,0.006456,-0.047412,-0.007196,0.005565,0.069751,-0.036811,0.005051,-0.062901,0.004829,-0.038101,-0.029347,0.030128,-0.046597,0.07112,…,0.003918,-0.000431,0.031944,-0.04806,0.063712,-0.04396,-0.014613,-0.023835,-0.086876,-0.025749,-0.008571,0.063344,0.035737,0.086655,-0.035771,0.036079,0.009569,-0.005663,0.01644,0.018999,-0.033631,-0.008642,0.022383,0.013164,0.02605,0.002983,0.054093,0.011348,0.047733,-0.024122,0.052828,-0.022036,0.024124,0.024854,0.007793,-0.020556,0.067955
7560,"[0.04532, -0.018647, … 0.070964]","""{""token_count"":55,""truncated"":…","""""","""This place has a great selecti…",0.04532,-0.018647,-0.033229,-0.054886,0.066516,0.051332,-0.009663,-0.049191,0.01406,-0.004939,0.015234,-0.022976,0.015583,-0.030902,-0.022509,-0.011805,-0.020288,0.010996,-0.013753,-0.046997,-0.011644,0.001977,0.054376,-0.013521,-0.004046,-0.060478,0.015322,-0.003354,-0.046507,-0.010268,-0.06348,0.03687,…,0.02378,-0.022326,-0.00358,-0.080423,0.038952,-0.013766,-0.009324,-0.028457,-0.065103,-0.05633,-0.005077,0.079641,0.072097,0.061348,0.003218,0.035126,0.02121,-0.034902,-0.024324,0.014971,0.007251,-0.000504,0.031855,0.007573,0.033037,-0.005193,0.039702,0.008287,0.037472,-0.026117,0.055417,-0.043377,0.01465,0.032689,-0.014966,-0.011814,0.070964
7603,"[0.053228, -0.023868, … 0.047716]","""{""token_count"":56,""truncated"":…","""""","""I stopped by this coffee truck…",0.053228,-0.023868,-0.049221,-0.061731,0.04267,0.062132,0.000897,-0.023412,-0.008498,0.014876,-0.009825,-0.021335,-0.008295,-0.015367,-0.016351,0.011868,-0.025651,0.028539,-0.004644,-0.054805,-0.002182,-0.003409,0.060985,0.004805,0.027617,-0.057099,0.012354,-0.022245,-0.040149,-0.003689,-0.071424,0.051083,…,0.011466,-0.006525,0.011717,-0.017505,0.042835,-0.037959,0.022307,-0.026614,-0.042421,-0.030379,-0.008988,0.053528,0.045693,0.074565,-0.002939,0.075779,0.02373,-0.033506,-0.002729,-0.015574,0.005053,-0.008491,0.02853,0.007738,0.004493,-0.014993,0.034256,0.002963,0.034895,-0.037654,0.034019,-0.048664,0.018646,0.013108,-0.010859,-0.00461,0.047716
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
7492,"[0.041095, -0.065384, … 0.047765]","""{""token_count"":53,""truncated"":…","""""","""I think it is overpriced, but …",0.041095,-0.065384,-0.039071,-0.042883,0.037476,0.04672,0.027848,-0.027376,0.003505,0.013463,-0.007072,-0.012866,-0.000765,-0.019068,-0.0274,0.001958,-0.008349,0.000292,-0.011755,-0.060063,-0.022024,-0.007475,0.032606,-0.002897,-0.019965,-0.053312,0.019836,-0.040925,-0.03494,-0.007605,-0.09483,0.088128,…,-0.006452,-0.027454,0.015479,-0.035378,0.053098,0.021587,0.034371,-0.03116,-0.063145,-0.019188,-0.016995,0.019805,0.061166,0.079661,-0.007536,0.052948,-0.021936,-0.016107,-0.026523,-0.001981,0.020978,-0.035168,0.019463,-0.013948,0.009038,-0.020249,0.012847,0.04348,0.070893,-0.021213,0.053228,-0.045022,0.008603,0.017673,0.005926,-0.036479,0.047765
7512,"[0.029933, -0.011228, … 0.050344]","""{""token_count"":53,""truncated"":…","""""","""The staff was friendly and hel…",0.029933,-0.011228,-0.031262,-0.042669,0.046636,0.081698,0.006006,-0.039688,0.036753,-0.008103,-0.011902,0.000941,0.043024,-0.028415,-0.01264,0.004969,-0.02329,0.017803,-0.029007,0.006745,-0.012863,-0.012817,0.067244,-0.026956,0.006576,-0.067503,0.014674,-0.034705,-0.054241,0.005111,-0.093253,0.043078,…,-0.001062,0.021202,-0.001295,-0.048903,0.047226,-0.025895,-0.004884,-0.045128,-0.041412,-0.034434,-0.005425,0.049472,0.046416,0.053883,-0.0054,0.062568,-0.00088,0.002776,-0.014142,0.004604,0.009659,-0.000261,0.003206,-0.016514,0.004032,0.004014,0.031565,0.01879,0.050792,-0.053596,0.047211,-0.016292,0.021825,0.015827,0.005518,0.004924,0.050344
7500,"[0.025881, -0.038813, … 0.057246]","""{""token_count"":53,""truncated"":…","""""","""The staff were very friendly a…",0.025881,-0.038813,-0.027333,-0.040803,0.050177,0.070295,-0.015433,-0.06679,0.008473,-0.000148,-0.044416,-0.021023,0.008832,-0.038241,-0.015371,0.013667,-0.029876,0.002515,-0.029593,-0.017262,0.020768,-0.003453,0.081202,-0.014221,0.004567,-0.092428,0.009197,-0.0311,-0.056761,-0.015991,-0.084181,0.038098,…,0.017393,-0.006807,-0.008525,-0.040217,0.062816,-0.027295,0.002162,-0.032328,-0.059853,-0.015015,-0.020098,0.079672,0.053231,0.085956,-0.019692,0.037182,-0.009942,-0.00244,0.001725,-0.014985,-0.015543,0.012244,0.037066,-0.020349,0.003222,-0.012606,0.046018,0.008451,0.031353,-0.041986,0.078318,-0.042985,0.02759,0.016827,-0.001142,-0.019651,0.057246
7475,"[0.04194, -0.012867, … 0.056525]","""{""token_count"":53,""truncated"":…","""""","""I had a great experience at th…",0.04194,-0.012867,0.004069,-0.035836,0.008495,0.061997,0.005026,-0.044069,0.018485,0.001413,-0.01677,-0.031542,0.024678,-0.043411,-0.011541,0.035251,-0.009785,-0.007921,-0.02439,-0.018398,-0.000712,-0.013827,0.073629,-0.006923,0.000153,-0.080357,0.024611,-0.0313,-0.042498,0.025652,-0.077314,0.047701,…,0.0207,0.020671,0.015392,-0.044429,0.050896,-0.043562,0.017871,-0.032345,-0.035889,-0.014749,-0.043011,0.041719,0.025136,0.074836,-0.021727,0.040063,0.003118,-0.004306,0.001001,-0.001822,0.00801,-0.001092,0.028449,-0.007515,0.00682,-0.012976,0.050668,0.013968,0.04758,-0.039321,0.063847,-0.026938,0.024267,0.012895,0.00469,-0.021364,0.056525


In [9]:
# Point BigQuery DataFrames at the same project and location as the BigQuery client.
bigquery_options = bpd.options.bigquery
bigquery_options.project = PROJECT_ID
bigquery_options.location = REGION

In [10]:
# Build a BigQuery DataFrame from the `embedding_*` columns only, going through pandas.
customer_review_embedded_bf = bpd.read_pandas(
    customer_review_embedded
    .select(pl.selectors.matches(r"^embedding_"))
    .to_pandas()
)

In [11]:
# Preview the BigQuery DataFrame that holds only the embedding columns.
customer_review_embedded_bf

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,0.048845,-0.020206,-0.018818,-0.047474,0.038713,0.045697,-0.017239,-0.042988,-0.006626,0.029441,...,0.024732,0.032967,-0.041192,0.04709,-0.039201,0.049214,0.04284,0.027583,0.008554,0.054143
1,0.047678,-0.034145,-0.025936,-0.054205,0.044886,0.051056,-0.0296,-0.052977,0.005275,-0.011844,...,0.027707,0.027476,-0.026624,0.065381,-0.041634,0.018789,0.019633,0.002239,-0.02994,0.035929
2,0.004317,-0.034291,-0.020192,-0.006772,0.033597,0.090561,-0.002594,-0.035049,0.007243,-0.023208,...,0.011348,0.047733,-0.024122,0.052828,-0.022036,0.024124,0.024854,0.007793,-0.020556,0.067955
3,0.04532,-0.018647,-0.033229,-0.054886,0.066516,0.051332,-0.009663,-0.049191,0.01406,-0.004939,...,0.008287,0.037472,-0.026117,0.055417,-0.043377,0.01465,0.032689,-0.014966,-0.011814,0.070964
4,0.053228,-0.023868,-0.049221,-0.061731,0.04267,0.062132,0.000897,-0.023412,-0.008498,0.014876,...,0.002963,0.034895,-0.037654,0.034019,-0.048664,0.018646,0.013108,-0.010859,-0.00461,0.047716
5,0.026698,-0.013233,-0.033684,-0.033508,0.038089,0.048833,-0.009421,-0.033227,0.030092,-0.003076,...,0.02108,0.066222,-0.020456,0.034009,-0.021611,0.037692,0.014901,0.016053,-0.020357,0.051574
6,0.040598,-0.013675,0.005653,-0.024912,0.033753,0.058193,-0.000557,-0.043817,0.009429,-0.005622,...,0.007196,0.037509,-0.01499,0.048047,-0.038213,0.025821,0.030311,0.004337,-0.009171,0.058013
7,0.024971,-0.032426,-0.04778,-0.04042,0.033208,0.085341,-0.008933,-0.017382,0.003807,-0.001902,...,0.00323,0.043043,-0.012381,0.023846,-0.027226,0.041813,0.030772,-0.005052,-0.018343,0.035575
8,0.028715,-0.004995,-0.058901,-0.040589,0.027722,0.056442,0.010439,-0.062266,-0.010051,-0.009421,...,0.005806,0.032287,-0.034392,0.039063,-0.04518,0.031915,0.024755,-0.008787,-0.026162,0.049303
9,0.025832,-0.019892,-0.022126,-0.019593,0.033527,0.046461,-0.004793,-0.041055,0.011118,-0.01999,...,-0.012599,0.049826,-0.052484,0.046917,-0.025565,0.035416,0.008906,0.024642,-0.027964,0.071153


In [12]:
import bigframes.ml.decomposition as dc

In [13]:
# PCA estimator configured to keep 10 components.
pca = dc.PCA(n_components=10)

In [14]:
# Fit the PCA on the embedding columns.
pca.fit(X=customer_review_embedded_bf)

PCA(n_components=10)

In [18]:
# Share of variance explained by each principal component, largest first.
(
    pca.explained_variance_ratio_
    .sort_values(by="explained_variance_ratio", ascending=False)
)

Unnamed: 0,principal_component_id,explained_variance_ratio
2,0,0.134559
4,1,0.066765
6,2,0.059778
7,3,0.051538
9,4,0.04622
3,5,0.036796
1,6,0.034411
0,7,0.031167
8,8,0.021263
5,9,0.020868


In [22]:
# Apply the fitted PCA to the embeddings, bring the result back as a pandas
# DataFrame, and convert that to Polars.
coords_after_pca = pl.from_pandas(
    pca.predict(X=customer_review_embedded_bf).to_pandas()
)

In [23]:
# Preview the principal-component coordinates.
coords_after_pca

principal_component_1,principal_component_2,principal_component_3,principal_component_4,principal_component_5,principal_component_6,principal_component_7,principal_component_8,principal_component_9,principal_component_10
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-3.925665,-1.911317,-1.724364,10.259554,2.721579,9.261119,3.667532,-3.16673,3.634138,2.770968
-6.222728,-1.066516,-3.318703,0.487521,1.067717,8.312714,9.98865,-7.468767,-0.268469,5.280275
-7.03107,-1.128108,-5.63462,-3.592317,-1.068499,1.129989,4.203582,3.246923,1.312858,8.254532
-11.235043,3.857524,-2.780472,6.032825,3.378198,5.796707,6.46881,-0.389664,4.151447,2.776398
-11.979665,-7.670064,9.700599,3.773867,0.386152,-2.512794,2.322554,2.551778,2.14362,-3.621871
…,…,…,…,…,…,…,…,…,…
7.786922,-0.19147,-4.769679,9.767386,1.240049,-3.50724,2.795626,-1.30248,-2.05993,4.223208
-10.975461,1.634239,-0.615791,-0.237953,-2.129387,-0.11829,4.958987,6.613494,1.03222,-3.973353
-11.921846,-0.881251,-8.341742,-1.169255,3.026805,3.215227,3.083298,7.348188,1.074097,1.010134
-8.557177,-0.09872,-2.282274,-5.702808,5.010712,-4.043657,2.342278,5.436452,4.192566,5.507476


In [24]:
# Attach the review text to the PCA coordinates so the plot can show it on hover.
# Rows are matched purely by position, so check first that both frames have the same height.
# NOTE(review): this also assumes the PCA round-trip preserved the original row order; confirm.
assert coords_after_pca.height == customer_review_embedded.height, (
    "PCA output and review frame differ in row count; cannot attach `content` by position"
)
coords_after_pca = coords_after_pca.with_columns(
    content=customer_review_embedded.get_column("content")
)

In [29]:
# Scatter plot of the first two principal components; hovering a point shows the
# review text. Title and axis labels are set so the figure is readable on its own.
scatterplot = coords_after_pca.plot(
    kind="scatter",
    x="principal_component_1",
    y="principal_component_2",
    hover_cols=["content"],
    title="Customer review embeddings: first two principal components",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
    width=1000,
    height=600,
)

# Render the hvPlot object to a Bokeh figure and display it.
scatterplot_bokeh = hvplot.render(scatterplot, backend="bokeh")
show(scatterplot_bokeh)