In [1]:
%pip install polars

Collecting polars
  Downloading polars-1.12.0-cp39-abi3-win_amd64.whl.metadata (14 kB)
Downloading polars-1.12.0-cp39-abi3-win_amd64.whl (33.8 MB)
   ---------------------------------------- 0.0/33.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/33.8 MB ? eta -:--:--
   - -------------------------------------- 1.0/33.8 MB 4.2 MB/s eta 0:00:08
   ---- ----------------------------------- 3.9/33.8 MB 9.0 MB/s eta 0:00:04
   -------- ------------------------------- 7.1/33.8 MB 10.9 MB/s eta 0:00:03
   ------------ --------------------------- 10.5/33.8 MB 12.1 MB/s eta 0:00:02
   ---------------- ----------------------- 13.9/33.8 MB 12.6 MB/s eta 0:00:02
   ------------------- -------------------- 16.8/33.8 MB 12.9 MB/s eta 0:00:02
   -------------------- ------------------- 17.0/33.8 MB 12.9 MB/s eta 0:00:02
   -------------------------- ------------- 22.8/33.8 MB 13.2 MB/s eta 0:00:01
   ------------------------------ --------- 26.0/33.8 MB 13.5 MB/s eta 0:00:01
   ----

# Embedding vectors

In [20]:
import sys
sys.path.append('./airflow')

import polars as pl
import pymongo
import re
from tqdm import tqdm
from core.config import get_settings
from functions.operators.database import MongoDBOperator

In [28]:
# Load the project settings; DATABASE_URL is read from them both here and in
# data_generator().
settings = get_settings()
# Operator used for the insert() calls into the 'refined' collection.
# NOTE(review): 'imcp' is presumably the database name (data_generator() opens
# client['imcp']) — confirm against MongoDBOperator's signature.
mongo_operator = MongoDBOperator('imcp', settings.DATABASE_URL)

In [57]:
def clean_text(text: str) -> str:
    """Remove every character that is not an ASCII letter or whitespace.

    Digits and punctuation (including hyphens) are deleted outright rather
    than replaced, so "well-organized" becomes "wellorganized".

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The filtered string; ``""`` when ``text`` is ``None``.
    """
    return "" if text is None else re.sub(r'[^a-zA-Z\s]', '', text)

def tokenize(text: str) -> list[str]:
    """Split ``text`` into whitespace-delimited tokens.

    Args:
        text: The string to tokenize, or ``None``.

    Returns:
        A list of non-empty tokens. ``None``, the empty string and
        whitespace-only input all yield ``[""]``, the placeholder this
        function has always returned for missing text.
    """
    if text is None:
        return [""]
    # split() without an argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never emits empty strings. split(" ") produced ""
    # tokens wherever clean_text() had removed punctuation between two spaces.
    tokens = text.split()
    return tokens if tokens else [""]

def scaling_data(df: pl.DataFrame, selected_columns: list = None):
    """Return a new frame containing only the requested columns of ``df``.

    Despite its name, this function performs no numeric scaling; it is a
    plain column projection.

    Args:
        df: Frame to select from.
        selected_columns: Columns to keep. When ``None`` (the default), every
            column of ``df`` is selected.

    Returns:
        The result of ``df.select(...)`` for the chosen columns.
    """
    # Identity test instead of `!= None`: the inequality operator dispatches to
    # the argument's own __ne__, which is not guaranteed to return a plain bool.
    if selected_columns is None:
        return df.select('*')
    return df.select(selected_columns)

In [58]:
def data_generator(batch_size:int=10000, limit:int=100000):
    """Stream documents from the 'huggingface' collection in fixed-size batches.

    Opens a MongoDB client on ``settings.DATABASE_URL``, queries every
    document of ``imcp.huggingface`` (capped at ``limit``) and yields them
    grouped into lists.

    Args:
        batch_size: Number of documents per yielded list; also passed to the
            cursor as its fetch batch size.
        limit: Value passed to the cursor's ``limit()``.

    Yields:
        Lists of documents, each of length ``batch_size`` except possibly the
        last, which holds whatever remains.
    """
    with pymongo.MongoClient(settings.DATABASE_URL) as mongo_client:
        cursor = (
            mongo_client['imcp']['huggingface']
            .find({})
            .batch_size(batch_size)
            .limit(limit)
        )
        pending = []
        for record in cursor:
            pending.append(record)
            if len(pending) != batch_size:
                continue
            # A full batch is ready: hand it over and start a new one.
            yield pending
            pending = []
        # Emit the leftover documents when the total is not a multiple of batch_size.
        if pending:
            yield pending

In [None]:
# Free-text columns that are lower-cased, cleaned and tokenized.
text_columns = ['caption', 'short_caption']
# Fields written to the 'refined' collection.
refined_columns = ['url', 'caption', 'short_caption', 'caption_tokens', 'short_caption_tokens', 'publisher', 'created_time']

for batch in data_generator():
    # Build a frame from the raw documents, leaving out the Mongo '_id' column.
    df = pl.DataFrame(list(batch)).drop('_id')

    # 1. Lower-case the text columns.
    lowercase_exprs = [pl.col(name).str.to_lowercase().alias(name) for name in text_columns]
    lowered_df = df.with_columns(lowercase_exprs)

    # 2. Keep only ASCII letters and whitespace.
    cleaning_exprs = [
        pl.col(name).map_elements(clean_text, return_dtype=pl.String).alias(name)
        for name in text_columns
    ]
    cleaned_df = lowered_df.with_columns(cleaning_exprs)

    # 3. Add a '<column>_tokens' list column next to each text column.
    token_exprs = [
        pl.col(name).map_elements(tokenize, return_dtype=pl.List(pl.String)).alias(f'{name}_tokens')
        for name in text_columns
    ]
    tokenized_df = cleaned_df.with_columns(token_exprs)

    # 4. Project to the output fields and store the batch.
    refined_df = scaling_data(tokenized_df, refined_columns)
    data = refined_df.to_dicts()
    mongo_operator.insert('refined', data)
    print('SUCCESS with', len(data))

SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000
SUCCESS with 10000


In [42]:
# NOTE(review): this cell needs `data` to hold raw documents that still carry an
# '_id' field. After the batch loop, `data` is the last batch's refined dicts,
# whose column selection does not include '_id' — confirm where `data` should
# come from before re-running on a fresh kernel.
df = pl.DataFrame(data).drop('_id')
df.shape

(217868, 6)

# Cloud Storage

In [31]:
# Lower-case both caption columns, keeping their original names.
lowercase_exprs = [
    pl.col(name).str.to_lowercase().alias(name)
    for name in ('caption', 'short_caption')
]
lowered_df = df.with_columns(lowercase_exprs)

In [32]:
lowered_df['caption'][0]

'a kitchen with wooden cabinets on the walls, a stove, multiple drawers, a refrigerator, a counter with fruits, and a well-organized layout for cooking and storage needs.'

In [None]:
# Apply clean_text() to both caption columns. The result is stored back under
# the same name, `lowered_df`, which the following cells read.
cleaning_exprs = [
    pl.col(name).map_elements(clean_text, return_dtype=pl.String).alias(name)
    for name in ('caption', 'short_caption')
]
lowered_df = lowered_df.with_columns(cleaning_exprs)

In [34]:
lowered_df['caption'][0]

'a kitchen with wooden cabinets on the walls a stove multiple drawers a refrigerator a counter with fruits and a wellorganized layout for cooking and storage needs'

In [None]:
# Add 'caption_tokens' and 'short_caption_tokens' list columns built with tokenize().
token_exprs = [
    pl.col(name).map_elements(tokenize, return_dtype=pl.List(pl.String)).alias(f'{name}_tokens')
    for name in ('caption', 'short_caption')
]
tokenized_df = lowered_df.with_columns(token_exprs)

In [None]:
# Keep only the fields that are written to the 'refined' collection.
refined_columns = [
    'url',
    'caption',
    'short_caption',
    'caption_tokens',
    'short_caption_tokens',
    'publisher',
    'created_time',
]
refined_df = scaling_data(tokenized_df, refined_columns)

In [39]:
# Convert the refined frame to dicts; this is the payload handed to
# mongo_operator.insert('refined', ...).
data = refined_df.to_dicts()

In [25]:
# NOTE(review): the recorded run of this cell raised an Atlas space-quota error
# (519 MB used of a 512 MB allowance). Free up space or reduce the payload
# before re-running; check whether any documents were written before the failure.
mongo_operator.insert('refined', data)

Exception: you are over your space quota, using 519 MB of 512 MB, full error: {'ok': 0, 'errmsg': 'you are over your space quota, using 519 MB of 512 MB', 'code': 8000, 'codeName': 'AtlasError'}