# Imports

In [1]:
# from sklearnex import patch_sklearn
# patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [1]:
import numpy as np
import polars as pl
import re

from embedding_model import EmbeddingModel

# Functions Definition

In [2]:
def textPreprocessing(text: str):
    if text is not None:
        text = text.lower()
        text = text.strip()
        text = re.sub(r'\d+', r' ', text)
        text = re.sub(r'[!@#$%^&*()_+\-=\[\]{};\'\\:"|,.<>\/?]', r'', text)
        text = re.sub(r'\b[a-zÀ-ÖØ-öø-ÿ]{1,2}\b', r' ', text)
        text = re.sub(r'\s+', r' ', text)
    return text

def averagePooling(embeddings: np.ndarray) -> np.ndarray:
    return sum(embeddings) / len(embeddings)

# Process Definition

## Query database (deprecated)

In [6]:
con_string = 'postgresql://postgres:postgres@localhost/ituassu_2022'
query = "select id, content_text from x_content xc"

In [7]:
df = pl.read_database_uri(query, con_string, partition_num=5120)
df

id,content_text
str,str
"""15773553417166…","""RT @alteddy_: …"
"""15773550488832…","""BRASIL GANHAND…"
"""15773553416078…","""RT @siteptbr: …"
"""15773176621236…","""🚨 Em pleno Out…"
"""15773553414062…","""RT @onlybluest…"
…,…
"""15865535594952…","""RT @joelspunk:…"
"""15865535228075…","""RT @vinicoment…"
"""15865535168390…","""Lula eh hoje"""
"""15865535140163…","""RT @choquei: 🚨…"


## Prepare embedding model

In [3]:
embedding_model = EmbeddingModel(256)

  return self.fget.__get__(instance, owner)()


## Importing data

In [5]:
# q1 = (
#     pl.scan_csv('tweets.csv', separator=';', low_memory=True).lazy().with_columns(
#         pl.col("content_text").map_elements(textPreprocessing).alias('processed_text')
#     ).with_columns(
#         pl.col("processed_text").map_batches(lambda series: embedding_model.get_embeddings(series, 32768)).alias('embeddings')
#     )
# )
# df = q1.collect(streaming=True)

Creating data loaders
Generating embeddings
Process Time (sec): 1550.351292499996


In [4]:
lazy_df = pl.scan_csv('tweets.csv', separator=';', low_memory=True)

In [5]:
lazy_df.lazy().count().collect()

id,content_text,created_at,fk_author_id
u32,u32,u32,u32
57925235,57925235,57925235,57925235


## Preprocessing the text

In [6]:
query1 = (
    lazy_df.lazy().with_columns(
        pl.col("content_text").map_elements(textPreprocessing).alias('processed_text')
    ).select(pl.col("id"), pl.col("processed_text"))
)
lazy_df = query1.collect()

In [7]:
lazy_df.lazy().count().collect()

id,processed_text
u32,u32
57925235,57925235


## Converting to embeddings

In [10]:
query2 = (
    lazy_df.lazy().with_columns(
        pl.col("processed_text").map_batches(lambda series: embedding_model.get_embeddings(series, 24576)).alias('embeddings')
    )
)
lazy_df = query2.collect()

Creating data loaders
Generating embeddings
Embeddings for 0 of 2357 data loaders
x_train_bt conversion to list: 0.0002051999999821419
move input to gpu: 4.864922000000661
calculate embeddings: 0.03483760000017355
add embeddings to list: 4.0582211000000825
Embeddings for 1 of 2357 data loaders
x_train_bt conversion to list: 0.0001940000001923181
move input to gpu: 3.2938433999988774
calculate embeddings: 5.421931900000345
add embeddings to list: 2.5157604000014544
Embeddings for 2 of 2357 data loaders
x_train_bt conversion to list: 0.00017809999917517416
move input to gpu: 2.426744800000961
calculate embeddings: 0.0003395000003365567
add embeddings to list: 3.5858381999987614
Embeddings for 3 of 2357 data loaders
x_train_bt conversion to list: 0.0001818999990064185
move input to gpu: 2.438289399999121
calculate embeddings: 0.000380000001314329
add embeddings to list: 1.598999300000287
Embeddings for 4 of 2357 data loaders
x_train_bt conversion to list: 0.00023330000112764537
move input

ComputeError: RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
lazy_df.lazy().count().collect()

## Save embeddings result to file

In [None]:
lazy_df.lazy().select(pl.col("id"), pl.col("embeddings")).sink_parquet('embeddings.parquet')

## Calculating dataset general embedding

In [33]:
query3 = (
    df.lazy().select(pl.col('id'), pl.col('embeddings'))
)
df2 = query3.collect()

In [32]:

len(df2["embeddings"].to_numpy())

675630

In [13]:
dataset_general_embedding = averagePooling(df2["embeddings"].to_numpy())
dataset_general_embedding

tensor([-3.5133e+00,  2.2863e+00, -6.7663e+00,  1.3351e+00,  4.5295e+00,
         1.2497e+00,  7.5366e+00, -2.9391e+00,  6.2160e+00,  2.6001e+00,
         8.0612e+00,  4.4557e+00, -8.0682e+00, -4.2549e+00,  6.9623e-01,
        -1.1163e+01,  1.6273e+00,  1.6618e+00,  6.2046e-01,  9.1146e+00,
        -1.4048e+00, -2.9777e+00,  2.9740e+00, -2.0703e+00, -1.6314e+00,
        -6.1616e+00, -5.8109e+00,  5.4801e+00, -4.5849e+00,  4.4751e+00,
         1.5677e+00, -1.1390e+02,  3.7008e+00,  5.0471e+00, -7.0308e+00,
         1.5080e-01, -6.6176e-01,  1.8567e+00,  1.0620e+01, -4.2890e+00,
         2.4058e+00, -4.5250e-01, -4.4988e-01,  6.4921e+00,  4.6020e+00,
        -4.8486e+00, -8.5016e-01, -5.4629e+00,  3.4079e+00, -4.1113e+00,
         5.4369e+00,  1.0095e+00,  5.2235e+00,  1.1085e+00, -1.8674e+01,
         7.7118e+00,  1.6852e+01,  1.4685e+00, -2.7471e+00, -7.2529e-01,
        -4.3492e+00, -2.3199e+00,  2.5769e+00,  5.2444e+00, -1.2436e+00,
        -1.6418e+01,  3.4527e-01, -2.9425e+00,  2.3

In [14]:
dataset_ids = df2['id'].to_arrow()
print(len(dataset_ids))

675630


## Optimizer algorithm metaheuristic approximation

### Initial metaheuristics

#### Random start

#### Greedy start

### Approximation function

### Optimizer algorithm

## Resultant dataset approximation statistics