In [1]:
%pip install polars

Collecting polars
  Downloading polars-1.12.0-cp39-abi3-win_amd64.whl.metadata (14 kB)
Downloading polars-1.12.0-cp39-abi3-win_amd64.whl (33.8 MB)
   ---------------------------------------- 0.0/33.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/33.8 MB ? eta -:--:--
   - -------------------------------------- 1.0/33.8 MB 4.2 MB/s eta 0:00:08
   ---- ----------------------------------- 3.9/33.8 MB 9.0 MB/s eta 0:00:04
   -------- ------------------------------- 7.1/33.8 MB 10.9 MB/s eta 0:00:03
   ------------ --------------------------- 10.5/33.8 MB 12.1 MB/s eta 0:00:02
   ---------------- ----------------------- 13.9/33.8 MB 12.6 MB/s eta 0:00:02
   ------------------- -------------------- 16.8/33.8 MB 12.9 MB/s eta 0:00:02
   -------------------- ------------------- 17.0/33.8 MB 12.9 MB/s eta 0:00:02
   -------------------------- ------------- 22.8/33.8 MB 13.2 MB/s eta 0:00:01
   ------------------------------ --------- 26.0/33.8 MB 13.5 MB/s eta 0:00:01
   ----

# Embedding vectors

In [9]:
import sys
sys.path.append('./airflow')

import polars as pl
import pymongo
import re
from datetime import datetime
from tqdm import tqdm
from core.config import get_settings
from utils.operators.database import MongoDBOperator

In [2]:
settings = get_settings()
mongo_operator = MongoDBOperator('imcp', settings.DATABASE_URL)

In [3]:
def clean_text(text: str) -> str:
    if text is None:
        return ""
    return re.sub(r'[^a-zA-Z\s]', '', text)

def tokenize(text: str) -> str:
    if text is None:
        return [""]
    return text.split(" ")

def scaling_data(df:pl.DataFrame, selected_columns:list=None):
    if selected_columns != None:
        temp_df = df.select(selected_columns)
    else:
        temp_df = df.select('*')
    return temp_df

In [4]:
def data_generator(batch_size:int=10000, limit:int=100000):
    with pymongo.MongoClient(settings.DATABASE_URL) as client:
        db = client['imcp']
        documents = db['huggingface'].find({}).batch_size(batch_size).limit(limit)
        batch = []
        for doc in documents:
            batch.append(doc)
            if len(batch) == batch_size:
                yield batch  # Trả về nhóm tài liệu (batch)
                batch = []  # Reset batch sau khi yield
        # Nếu còn tài liệu dư ra sau khi lặp xong
        if batch:
            yield batch

In [46]:
aggregate = [
    {
        '$match': {
            'status': 'SUCCESS', 
            'layer': 'silver'
        }
    }, {
        '$sort': {
            'end_time': -1
        }
    }, {
        '$limit': 1
    }
]
data = mongo_operator.find_data_with_aggregate('audit', aggregate)

In [None]:
latest_time = data[0]['end_time']

In [33]:
for batch in data_generator():
    data = list(batch)
    df = pl.DataFrame(data).drop('_id')
    df = df.filter(pl.col('created_time') >= latest_time)
    lowered_df = df.with_columns(
        *[pl.col(col).str.to_lowercase().alias(col) for col in ['caption','short_caption']]
    )
    cleaned_df = lowered_df.with_columns(
       *[ pl.col(col).map_elements(lambda x: clean_text(x), return_dtype=pl.String).alias(col) for col in ['caption','short_caption']]
    )
    tokenized_df = cleaned_df.with_columns(
        *[ pl.col(col).map_elements(lambda x: tokenize(x), return_dtype=pl.List(pl.String)).alias(f'{col}_tokens') for col in ['caption','short_caption']],
        pl.format("{}/raw_data/raw_images/{}", pl.lit(settings.MINIO_URL), pl.col("url").str.extract(r".*/(.*)").str.slice(-16, None)).alias("s3_url")
    )
    refined_df = scaling_data(tokenized_df, ['url', 's3_url', 'caption', 'short_caption', 'caption_tokens', 'short_caption_tokens', 'publisher', 'created_time'])
    data = refined_df.to_dicts()
    # mongo_operator.insert('refined', data)
    print(data[1])
    print('SUCCESS with', len(data))
    break

{'url': 'http://116.118.50.253:9000/mlflow/user_images/image_20241106132957.jpg', 's3_url': 'http://116.118.50.253:9000/mlflow/raw_data/raw_images/241106132957.jpg', 'caption': 'a wellorganized desk featuring a reliable computer can boost productivity and create a more efficient work environment from sleek monitors to ergonomic keyboards the right office equipment makes all the difference', 'short_caption': '', 'caption_tokens': ['a', 'wellorganized', 'desk', 'featuring', 'a', 'reliable', 'computer', 'can', 'boost', 'productivity', 'and', 'create', 'a', 'more', 'efficient', 'work', 'environment', 'from', 'sleek', 'monitors', 'to', 'ergonomic', 'keyboards', 'the', 'right', 'office', 'equipment', 'makes', 'all', 'the', 'difference'], 'short_caption_tokens': [''], 'publisher': 'android', 'created_time': datetime.datetime(2024, 11, 8, 13, 30, 24, 671000)}
SUCCESS with 2


In [42]:
df = pl.DataFrame(data).drop('_id')
df.shape

(217868, 6)

# Cloud Storage

In [2]:
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [7]:
import numpy as np
import pickle
import minio


if __name__ == '__main__':
    # with open("./airflow/data/extracted_features.pkl", "rb") as file:
    #     data = pickle.load(file)
    #     print(len(data.keys()))
    #     print(np.array(data['http://images.cocodataset.org/val2017/000000400573.jpg']).shape)
        
        
    client = minio.Minio(
        "116.118.50.253:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False
    )

    objects = client.list_objects(
        "mlflow",
        prefix="raw_data/raw_images",
        recursive=True
    )

    count = 0
    for obj in objects:
        count += 1
    print(count)

405
