In [1]:
from pydantic import BaseModel, Field, validator
from typing import Union, List, Optional
import tiktoken
import datetime
import uuid

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Base class for all bd_types
class BDType(BaseModel):
    source: str = Field(..., description="The source of the data.")
    timestamp: Optional[datetime.datetime] = Field(None, description="When the data was collected or created. If not provided, the current time is used.")
    id: Optional[str] = Field(None, description="A unique ID for the data instance. If not provided, it's generated by combining source, timestamp, and a uuid.")
    data_name: Optional[str] = Field(None, description="Name of the data.")
    elements_name: Optional[List[str]] = Field(None, description="Names of the elements if the data is a list.")
    
    @validator("timestamp", pre=True, always=True)
    def set_timestamp(cls, v):
        return v or datetime.datetime.now()

    @validator("id", pre=True, always=True)
    def set_id(cls, v, values, **kwargs):
        if "source" in values and "timestamp" in values:
            source = values["source"]
            timestamp = values["timestamp"]
            return v or f"{source}-{timestamp}-{uuid.uuid4()}"
        return v

class NaturalLanguageSingle(BDType):
    text: str = Field(..., description="The natural language text. It should be less than or equal to `max_tokens` in length when tokenized.")
    max_tokens: int = Field(8000, description="The maximum allowed length of the text in tokens. The default value is 8000.")
    
    @validator("text")
    def validate_text(cls, v, values):
        try:
            token_count = len(tokenizer.encode(v))
        except Exception as e:
            raise ValueError("Failed to tokenize text.") from e

        max_tokens = values.get("max_tokens", 8000)  # Get max_tokens from values, if not available, default to 8000
        if token_count > max_tokens:
            raise ValueError(f"Text is longer than {max_tokens} tokens.")
        return v

class NaturalLanguageList(BDType):
    texts: List[NaturalLanguageSingle] = Field(..., description="A list of `NaturalLanguageSingle` objects. Each object should pass the validation requirements of the `NaturalLanguageSingle` class.")


In [2]:
import time
import string

# Function to generate a list of strings of specific length
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]

# Lengths of the lists to generate
list_lengths = [10, 100, 1000, 10000, 100000]

# Tokens per string
tokens_per_string = 4000

for length in list_lengths:
    # Generate a list of strings
    strings = generate_strings(length, tokens_per_string)
    
    # Start timer for NaturalLanguageSingle list creation
    start_time_nls_list = time.time()
    
    # Create a NaturalLanguageSingle list
    nls_list = [NaturalLanguageSingle(source='source', text=text) for text in strings]
    
    # End timer for NaturalLanguageSingle list creation
    end_time_nls_list = time.time()
    
    # Start timer for NaturalLanguageList initialization
    start_time_nll = time.time()
    
    # Initialize a NaturalLanguageList instance
    nll = NaturalLanguageList(source='source', texts=nls_list)
    
    # End timer for NaturalLanguageList initialization
    end_time_nll = time.time()
    
    # Print the creation and initialization times
    print(f"Creation time for a NaturalLanguageSingle list of length {length}: {end_time_nls_list - start_time_nls_list} seconds")
    print(f"Initialization time for a NaturalLanguageList with a list of length {length}: {end_time_nll - start_time_nll} seconds")
    print("---")


Creation time for a NaturalLanguageSingle list of length 10: 0.05250287055969238 seconds
Initialization time for a NaturalLanguageList with a list of length 10: 0.0 seconds
---
Creation time for a NaturalLanguageSingle list of length 100: 0.5040838718414307 seconds
Initialization time for a NaturalLanguageList with a list of length 100: 0.0 seconds
---
Creation time for a NaturalLanguageSingle list of length 1000: 4.989835739135742 seconds
Initialization time for a NaturalLanguageList with a list of length 1000: 0.0014958381652832031 seconds
---
Creation time for a NaturalLanguageSingle list of length 10000: 49.594555616378784 seconds
Initialization time for a NaturalLanguageList with a list of length 10000: 0.01700115203857422 seconds
---
Creation time for a NaturalLanguageSingle list of length 100000: 502.29613733291626 seconds
Initialization time for a NaturalLanguageList with a list of length 100000: 0.230055570602417 seconds
---


In [8]:
import time
import polars as pl
import string

def create_nls(text):
    return NaturalLanguageSingle(source='source', text=text)
    
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]

# For different sizes of the DataFrame
for size in list_lengths:
    # Create a DataFrame with 'size' number of strings
    start_time = time.time()
    text = generate_strings(size, 4000)
    print(f"Time to generate strings for size {size}: {time.time() - start_time} seconds")
    start_time = time.time()
    df = pl.DataFrame({
        "text": text
    })
    print(f"Time to create DataFrame for size {size}: {time.time() - start_time} seconds")

    # Create NaturalLanguageSingle instances and measure the time
    start_time = time.time()
    df = df.with_columns(pl.col("text").apply(create_nls).alias("nls"))
    print(f"Time to create NaturalLanguageSingle instances for size {size}: {time.time() - start_time} seconds")

    # Create a NaturalLanguageList and measure the time
    start_time = time.time()
    nll = NaturalLanguageList(source='source', texts=df['nls'].to_list())
    print(f"Time to create NaturalLanguageList for size {size}: {time.time() - start_time} seconds")


Time to generate strings for size 10: 0.03900623321533203 seconds
Time to create DataFrame for size 10: 0.019512414932250977 seconds
Time to create NaturalLanguageSingle instances for size 10: 0.05900073051452637 seconds
Time to create NaturalLanguageList for size 10: 0.09901261329650879 seconds
Time to generate strings for size 100: 0.0 seconds
Time to create DataFrame for size 100: 0.0004987716674804688 seconds
Time to create NaturalLanguageSingle instances for size 100: 0.5225801467895508 seconds
Time to create NaturalLanguageList for size 100: 0.0 seconds
Time to generate strings for size 1000: 0.0005016326904296875 seconds
Time to create DataFrame for size 1000: 0.003997802734375 seconds
Time to create NaturalLanguageSingle instances for size 1000: 5.029733180999756 seconds
Time to create NaturalLanguageList for size 1000: 0.0019996166229248047 seconds
Time to generate strings for size 10000: 0.00900125503540039 seconds
Time to create DataFrame for size 10000: 0.04849886894226074 

In [5]:
import time
from functools import lru_cache


@lru_cache
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]

@lru_cache
def create_nls(text):
    return NaturalLanguageSingle(source='source', text=text)

# For different sizes of the DataFrame
for size in list_lengths:
    # Create a DataFrame with 'size' number of strings
    start_time = time.time()
    text = generate_strings(size, 4000)
    print(f"Time to generate strings for size {size}: {time.time() - start_time} seconds")
    start_time = time.time()
    df = pl.DataFrame({
        "text": text
    })
    print(f"Time to create DataFrame for size {size}: {time.time() - start_time} seconds")

    # Create NaturalLanguageSingle instances and measure the time
    start_time = time.time()
    df = df.with_columns(pl.col("text").apply(create_nls).alias("nls"))
    print(f"Time to create NaturalLanguageSingle instances for size {size}: {time.time() - start_time} seconds")

    # Create a NaturalLanguageList and measure the time
    start_time = time.time()
    nll = NaturalLanguageList(source='source', texts=df['nls'].to_list())
    print(f"Time to create NaturalLanguageList for size {size}: {time.time() - start_time} seconds")


Time to generate strings for size 10: 0.02899765968322754 seconds
Time to create DataFrame for size 10: 0.003998279571533203 seconds
Time to create NaturalLanguageSingle instances for size 10: 0.0055103302001953125 seconds
Time to create NaturalLanguageList for size 10: 0.062011003494262695 seconds
Time to generate strings for size 100: 0.0 seconds
Time to create DataFrame for size 100: 0.0005035400390625 seconds
Time to create NaturalLanguageSingle instances for size 100: 0.0009975433349609375 seconds
Time to create NaturalLanguageList for size 100: 0.0005028247833251953 seconds
Time to generate strings for size 1000: 0.0004999637603759766 seconds
Time to create DataFrame for size 1000: 0.005998849868774414 seconds
Time to create NaturalLanguageSingle instances for size 1000: 0.003997802734375 seconds
Time to create NaturalLanguageList for size 1000: 0.0020008087158203125 seconds
Time to generate strings for size 10000: 0.014501094818115234 seconds
Time to create DataFrame for size 10

In [6]:
import time
def generate_strings(num_strings, tokens_per_string):
    return [string.ascii_lowercase * (tokens_per_string // 26) for _ in range(num_strings)]
def create_nls(text):
    return NaturalLanguageSingle(source='source', text=text)
# For different sizes of the DataFrame
for size in list_lengths:
    # Create a DataFrame with 'size' number of strings
    start_time = time.time()
    text = generate_strings(size, 4000)
    print(f"Time to generate strings for size {size}: {time.time() - start_time} seconds")
    start_time = time.time()
    df = pl.DataFrame({
        "text": text
    })
    print(f"Time to create DataFrame for size {size}: {time.time() - start_time} seconds")

    # Create NaturalLanguageSingle instances and measure the time
    start_time = time.time()
    df = df.with_columns(pl.col("text").apply(create_nls, strategy= 'threading').alias("nls"))
    print(f"Time to create NaturalLanguageSingle with Threading instances for size {size}: {time.time() - start_time} seconds")

    # Create a NaturalLanguageList and measure the time
    start_time = time.time()
    nll = NaturalLanguageList(source='source', texts=df['nls'].to_list())
    print(f"Time to create NaturalLanguageList for size {size}: {time.time() - start_time} seconds")


Time to generate strings for size 10: 0.039513587951660156 seconds
Time to create DataFrame for size 10: 0.001999378204345703 seconds
Time to create NaturalLanguageSingle with Threading instances for size 10: 0.015501976013183594 seconds
Time to create NaturalLanguageList for size 10: 0.001998424530029297 seconds
Time to generate strings for size 100: 0.0 seconds
Time to create DataFrame for size 100: 0.0005009174346923828 seconds
Time to create NaturalLanguageSingle with Threading instances for size 100: 0.06851530075073242 seconds
Time to create NaturalLanguageList for size 100: 0.0 seconds
Time to generate strings for size 1000: 0.0004992485046386719 seconds
Time to create DataFrame for size 1000: 0.004498958587646484 seconds
Time to create NaturalLanguageSingle with Threading instances for size 1000: 0.6605849266052246 seconds
Time to create NaturalLanguageList for size 1000: 0.0024993419647216797 seconds
Time to generate strings for size 10000: 0.012504816055297852 seconds
Time to

In [7]:
import os

print(os.cpu_count())

16
