In [4]:
from pydantic import BaseModel, Field, field_validator
from typing import Union, List, Optional, Dict
import datetime
import uuid
import tiktoken
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")


# Base class for all bd_types
class BDType(BaseModel):
    source: str = Field("babydragon", description="The source of the data.")
    timestamp: Optional[datetime.datetime] = Field(None, description="When the data was collected or created. If not provided, the current time is used.")
    id: uuid.UUID = Field(default_factory=uuid.uuid4, description="Unique identifier of the data.")
    data_name: Optional[str] = Field(None, description="Name of the data.")
    elements_name: Optional[List[str]] = Field(None, description="Names of the elements if the data is a list.")

    @field_validator("timestamp")
    def set_timestamp(cls, v):
        return v or datetime.datetime.now()

    @field_validator("id")
    def set_id(cls, values, **kwargs):
        if "id" not in values:
            values["id"] = uuid.uuid4()
        return values


class NaturalLanguageSingle(BDType):
    text: str = Field(..., description="The natural language text. It should be less than or equal to `max_tokens` in length when tokenized.")
    max_tokens: int = Field(8000, description="The maximum allowed length of the text in tokens. The default value is 8000.")
    
    @field_validator("text")
    def validate_text(cls, v, info):
        try:
            # Tokenize the text and get the token count
            token_count = len(tokenizer.encode(v))
        except Exception as e:
            raise ValueError("Failed to tokenize text.") from e

        # Get max_tokens from info.data, if not available, default to 8000
        max_tokens = info.data.get("max_tokens", 8000)

        if token_count > max_tokens:
            raise ValueError(f"Text is longer than {max_tokens} tokens.")

        return v


class NaturalLanguageList(BDType):
    texts: List[NaturalLanguageSingle] = Field(..., description="A list of `NaturalLanguageSingle` objects. Each object should pass the validation requirements of the `NaturalLanguageSingle` class.")



In [5]:
import time
import string

# Function to generate a list of strings of specific length
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]

# Lengths of the lists to generate
list_lengths = [10, 100, 1000, 10000, 100000]


In [6]:

# Tokens per string
tokens_per_string = 4000

for length in list_lengths[:-1]:
    # Generate a list of strings
    strings = generate_strings(length, tokens_per_string)
    
    # Start timer for NaturalLanguageSingle list creation
    start_time_nls_list = time.time()
    
    # Create a NaturalLanguageSingle list
    nls_list = [NaturalLanguageSingle(source='source', text=text) for text in strings]
    
    # End timer for NaturalLanguageSingle list creation
    end_time_nls_list = time.time()
    
    # Start timer for NaturalLanguageList initialization
    start_time_nll = time.time()
    
    # Initialize a NaturalLanguageList instance
    nll = NaturalLanguageList(source='source', texts=nls_list)
    
    # End timer for NaturalLanguageList initialization
    end_time_nll = time.time()
    
    # Print the creation and initialization times
    print(f"Creation time for a NaturalLanguageSingle list of length {length}: {end_time_nls_list - start_time_nls_list} seconds")
    print(f"Initialization time for a NaturalLanguageList with a list of length {length}: {end_time_nll - start_time_nll} seconds")
    print("---")


Creation time for a NaturalLanguageSingle list of length 10: 0.05450248718261719 seconds
Initialization time for a NaturalLanguageList with a list of length 10: 0.0005013942718505859 seconds
---
Creation time for a NaturalLanguageSingle list of length 100: 0.504584789276123 seconds
Initialization time for a NaturalLanguageList with a list of length 100: 0.0 seconds
---
Creation time for a NaturalLanguageSingle list of length 1000: 4.906872034072876 seconds
Initialization time for a NaturalLanguageList with a list of length 1000: 0.0 seconds
---


In [5]:
import time
import polars as pl
import string

def create_nls(text):
    return NaturalLanguageSingle(source='source', text=text)
    
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]

# For different sizes of the DataFrame
for size in list_lengths[:-1]:
    # Create a DataFrame with 'size' number of strings
    start_time = time.time()
    text = generate_strings(size, 4000)
    print(f"Time to generate strings for size {size}: {time.time() - start_time} seconds")
    start_time = time.time()
    df = pl.DataFrame({
        "text": text
    })
    print(f"Time to create DataFrame for size {size}: {time.time() - start_time} seconds")

    # Create NaturalLanguageSingle instances and measure the time
    start_time = time.time()
    df = df.with_columns(pl.col("text").apply(create_nls).alias("nls"))
    print(f"Time to create NaturalLanguageSingle instances using map for size {size}: {time.time() - start_time} seconds")

    # Create a NaturalLanguageList and measure the time
    start_time = time.time()
    nll = NaturalLanguageList(source='source', texts=df['nls'].to_list())
    print(f"Time to create NaturalLanguageList for size {size}: {time.time() - start_time} seconds")


Time to generate strings for size 10: 0.0 seconds
Time to create DataFrame for size 10: 0.004997730255126953 seconds
Time to create NaturalLanguageSingle instances using map for size 10: 0.08402609825134277 seconds
Time to create NaturalLanguageList for size 10: 0.0010006427764892578 seconds
Time to generate strings for size 100: 0.0005011558532714844 seconds
Time to create DataFrame for size 100: 0.0005004405975341797 seconds
Time to create NaturalLanguageSingle instances using map for size 100: 0.5065701007843018 seconds
Time to create NaturalLanguageList for size 100: 0.0 seconds
Time to generate strings for size 1000: 0.0009996891021728516 seconds
Time to create DataFrame for size 1000: 0.005500316619873047 seconds
Time to create NaturalLanguageSingle instances using map for size 1000: 4.95569920539856 seconds
Time to create NaturalLanguageList for size 1000: 0.0 seconds
Time to generate strings for size 10000: 0.015499353408813477 seconds
Time to create DataFrame for size 10000: 0

In [6]:
import time
from functools import lru_cache


@lru_cache
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]

@lru_cache
def create_nls(text):
    return NaturalLanguageSingle(source='source', text=text)

# For different sizes of the DataFrame
for size in list_lengths:
    # Create a DataFrame with 'size' number of strings
    start_time = time.time()
    text = generate_strings(size, 4000)
    print(f"Time to generate strings for size {size}: {time.time() - start_time} seconds")
    start_time = time.time()
    df = pl.DataFrame({
        "text": text
    })
    print(f"Time to create DataFrame for size {size}: {time.time() - start_time} seconds")

    # Create NaturalLanguageSingle instances and measure the time
    start_time = time.time()
    df = df.with_columns(pl.col("text").apply(create_nls).alias("nls"))
    print(f"Time to create NaturalLanguageSingle instances for size {size}: {time.time() - start_time} seconds")

    # Create a NaturalLanguageList and measure the time
    start_time = time.time()
    nll = NaturalLanguageList(source='source', texts=df['nls'].to_list())
    print(f"Time to create NaturalLanguageList for size {size}: {time.time() - start_time} seconds")


Time to generate strings for size 10: 0.003497600555419922 seconds
Time to create DataFrame for size 10: 0.003002643585205078 seconds
Time to create NaturalLanguageSingle instances for size 10: 0.006508588790893555 seconds
Time to create NaturalLanguageList for size 10: 0.007002353668212891 seconds
Time to generate strings for size 100: 0.0 seconds
Time to create DataFrame for size 100: 0.0 seconds
Time to create NaturalLanguageSingle instances for size 100: 0.000997781753540039 seconds
Time to create NaturalLanguageList for size 100: 0.0 seconds
Time to generate strings for size 1000: 0.0009996891021728516 seconds
Time to create DataFrame for size 1000: 0.0030012130737304688 seconds
Time to create NaturalLanguageSingle instances for size 1000: 0.006499290466308594 seconds
Time to create NaturalLanguageList for size 1000: 0.0005011558532714844 seconds
Time to generate strings for size 10000: 0.012498617172241211 seconds
Time to create DataFrame for size 10000: 0.049504995346069336 seco

In [7]:
import time
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]
def create_nls(text):
    return NaturalLanguageSingle(source='source', text=text)
# For different sizes of the DataFrame
for size in list_lengths:
    # Create a DataFrame with 'size' number of strings
    start_time = time.time()
    text = generate_strings(size, 4000)
    print(f"Time to generate strings for size {size}: {time.time() - start_time} seconds")
    start_time = time.time()
    df = pl.DataFrame({
        "text": text
    })
    print(f"Time to create DataFrame for size {size}: {time.time() - start_time} seconds")

    # Create NaturalLanguageSingle instances and measure the time
    start_time = time.time()
    df = df.with_columns(pl.col("text").apply(create_nls, strategy= 'threading').alias("nls"))
    print(f"Time to create NaturalLanguageSingle with Threading instances for size {size}: {time.time() - start_time} seconds")

    # Create a NaturalLanguageList and measure the time
    start_time = time.time()
    nll = NaturalLanguageList(source='source', texts=df['nls'].to_list())
    print(f"Time to create NaturalLanguageList for size {size}: {time.time() - start_time} seconds")


Time to generate strings for size 10: 0.03099799156188965 seconds
Time to create DataFrame for size 10: 0.01900005340576172 seconds
Time to create NaturalLanguageSingle with Threading instances for size 10: 0.021015167236328125 seconds
Time to create NaturalLanguageList for size 10: 0.0 seconds
Time to generate strings for size 100: 0.000499725341796875 seconds
Time to create DataFrame for size 100: 0.0004999637603759766 seconds
Time to create NaturalLanguageSingle with Threading instances for size 100: 0.0709998607635498 seconds
Time to create NaturalLanguageList for size 100: 0.0 seconds
Time to generate strings for size 1000: 0.0005009174346923828 seconds
Time to create DataFrame for size 1000: 0.003999233245849609 seconds
Time to create NaturalLanguageSingle with Threading instances for size 1000: 0.645089864730835 seconds
Time to create NaturalLanguageList for size 1000: 0.0 seconds
Time to generate strings for size 10000: 0.020998477935791016 seconds
Time to create DataFrame for 

In [8]:
import os

print(os.cpu_count())

16
