# Install Required Packages
We start by installing the required dependencies for running the TabuLa model in a Google Colab environment. These include specific libraries and repositories necessary for inference.

In [1]:
# Clone the repository for TabuLa and change directory
!git clone https://github.com/mlfoundations/rtfm.git
%cd rtfm

# Upgrade pip to the latest version
!pip install --upgrade pip

# Install Python 3.8
!pip install python==3.8

# Install dependencies from requirements file
!pip install -r requirements.txt

# Install additional dependencies for TabuLa model
!pip install git+https://github.com/jpgard/llama-recipes.git
!pip install -e .
!pip install --no-deps git+https://github.com/mlfoundations/tableshift.git


fatal: destination path 'rtfm' already exists and is not an empty directory.
/content/rtfm
[31mERROR: Could not find a version that satisfies the requirement python==3.8 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for python==3.8[0m[31m
Collecting git+https://github.com/jpgard/llama-recipes.git
  Cloning https://github.com/jpgard/llama-recipes.git to /tmp/pip-req-build-c0spk86b
  Running command git clone --filter=blob:none --quiet https://github.com/jpgard/llama-recipes.git /tmp/pip-req-build-c0spk86b
  Resolved https://github.com/jpgard/llama-recipes.git to commit 186213f2ba3382f0aaa29326377e36c840d78160
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llama-recipes
  Building wheel for llama-recipes (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-recipes: filename=llama_r

We clone the main repository and navigate into it.
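We ensure pip is up to date. Note that the Python interpreter itself cannot be installed with pip (`pip install python==3.8` fails with "No matching distribution found"), so the Python version provided by the Colab runtime is used (3.10 in the run shown here).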
We install dependencies listed in the requirements.txt file and other specific libraries required for TabuLa.

# Model Loading and Setup
Here we load the TabuLa-8B model along with its tokenizer and set up necessary configurations for inference.



In [2]:
# In-place patch of the cloned repo: rewrites the first "from tabliblib" on each
# line of rtfm/datasets/target_selection.py to "from tablib".
# The path is relative, so this must run from the repository root.
# NOTE(review): presumably a workaround for an import error on `tabliblib` —
# confirm that `tablib` really provides the names that module imports.
!sed -i 's/from tabliblib/from tablib/' rtfm/datasets/target_selection.py


In [3]:
# Reinstall every package listed in requirements.txt, even if it is already
# installed (--force-reinstall). Must run from the repository root, where
# requirements.txt lives.
# NOTE(review): presumably done to repair a broken/conflicting environment;
# the kernel may need a restart afterwards so reinstalled packages are picked up.
!pip install -r requirements.txt --force-reinstall


Collecting torch>=2.0.1 (from -r requirements.txt (line 2))
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting accelerate (from -r requirements.txt (line 3))
  Downloading accelerate-1.2.0-py3-none-any.whl.metadata (19 kB)
Collecting appdirs (from -r requirements.txt (line 4))
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting loralib (from -r requirements.txt (line 5))
  Using cached loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting bitsandbytes (from -r requirements.txt (line 6))
  Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting black==23.11.0 (from -r requirements.txt (line 7))
  Using cached black-23.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (66 kB)
Collecting datasets (from -r requirements.txt (line 9))
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting fire (from -r requirements.txt (line 10))
  Using cac

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig
from rtfm.configs import TrainConfig, TokenizerConfig
from rtfm.inference_utils import InferenceModel
from rtfm.serialization.serializers import get_serializer
from rtfm.tokenization.text import prepare_tokenizer

# Configure model and tokenizer
train_config = TrainConfig(model_name="mlfoundations/tabula-8b", context_length=8192)
tokenizer_config = TokenizerConfig()

# Load model configuration
config = AutoConfig.from_pretrained(train_config.model_name)
config.torch_dtype = torch.bfloat16  # Match TabuLa's training setup

# Kept for any later cell that wants to know where computation happens.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer.
# - torch_dtype is passed explicitly: setting it on the config alone does not
#   select the dtype the weights are loaded in, so without this argument the
#   8B model may be materialised in default (float32) precision.
# - device_map="auto" already places the weights on the available device(s);
#   a dispatched model must not additionally be moved with .to(device), which
#   is redundant at best and raises when some modules are offloaded.
model = LlamaForCausalLM.from_pretrained(
    train_config.model_name,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(train_config.model_name)

# Set up serializer for special tokens
serializer = get_serializer(train_config.serializer_cls)
# Serializer tokens are only registered when the tokenizer config asks for them.
serializer_tokens = (
    serializer.special_tokens if tokenizer_config.add_serializer_tokens else None
)
tokenizer, model = prepare_tokenizer(
    model,
    tokenizer=tokenizer,
    pretrained_model_name_or_path=train_config.model_name,
    model_max_length=train_config.context_length,
    use_fast_tokenizer=tokenizer_config.use_fast_tokenizer,
    serializer_tokens_embed_fn=tokenizer_config.serializer_tokens_embed_fn,
    serializer_tokens=serializer_tokens,
)

# Create an inference model
inference_model = InferenceModel(model=model, tokenizer=tokenizer, serializer=serializer)


We configure the TabuLa-8B model with a context length of 8192 tokens.
The bfloat16 data type ensures efficient memory usage during training and inference.
The serializer adds special tokens that are specific to TabuLa for processing tabular data.
Finally, we initialize the inference model for making predictions.

# Performing Few-Shot Prediction
Here we demonstrate how to predict a categorical output for an unseen example using TabuLa-8B, given a handful of labeled examples as in-context demonstrations.

In [None]:
# Column layout shared by the labeled context rows and the row to predict.
weather_columns = [
    "location", "temperature", "humidity", "wind_speed", "pressure", "month",
    "weather_yesterday", "precipitation", "visibility", "weather_today",
]

# Labeled rows handed to the model as context (one tuple per row).
labeled_rows = [
    ("New York", 22, 65, 12, 1012, "July", "Sunny", 0, 10, "Partly Sunny"),
    ("Los Angeles", 26, 60, 7, 1015, "July", "Partly Sunny", 0, 10, "Sunny"),
    # Add more examples here
]
labeled_examples = pd.DataFrame(
    [dict(zip(weather_columns, row)) for row in labeled_rows]
)

# The single row whose `weather_today` value is to be predicted.
target_row = ("San Jose", 23, 55, 8, 1013, "July", "Sunny", 0, 10, "Sunny")
target_example = pd.DataFrame([dict(zip(weather_columns, target_row))])

# Candidate labels for the target column.
weather_choices = ["Sunny", "Partly Sunny", "Cloudy", "Partly Cloudy", "Rain"]

# Ask the inference model to choose one of the candidate labels.
output = inference_model.predict(
    labeled_examples=labeled_examples,
    target_example=target_example,
    target_colname="weather_today",
    target_choices=weather_choices,
)
print(f"Prediction for sample \n {target_example} \n is: {output}")


We define labeled examples as prior knowledge for the model.
The target example is the instance we want the model to predict.
The target_colname specifies the column to be predicted, while target_choices provides possible outcomes.
The model predicts the most likely outcome based on the labeled data.

# Predicting Continuous Targets
This section demonstrates prediction for continuous values, which are discretized into buckets.




In [9]:
from rtfm.serialization.serialization_utils import discretize_continuous_column

# Housing rows with a numeric `price` column that gets bucketized below.
housing_columns = [
    "location", "size_sqft", "bedrooms", "bathrooms",
    "age", "lot_size_acres", "garage", "price",
]
housing_rows = [
    ("New York", 1200, 3, 2, 10, 0.15, True, 850),
    ("Los Angeles", 1500, 4, 3, 8, 0.25, True, 950),
    # Add more examples
]
examples = pd.DataFrame([dict(zip(housing_columns, row)) for row in housing_rows])

# Replace the numeric prices with their bucket labels; the distinct labels
# become the candidate answers offered to the model.
price_buckets = discretize_continuous_column(examples["price"], num_buckets=4)
examples["price"] = price_buckets
target_choices = examples["price"].unique().tolist()

# First row is the one to predict; every remaining row serves as labeled context.
target_example = examples.iloc[[0]]
labeled_examples = examples.iloc[1:]

# Ask the inference model which price bucket the target row falls into.
output = inference_model.predict(
    labeled_examples=labeled_examples,
    target_example=target_example,
    target_colname="price",
    target_choices=target_choices,
)
print(f"Prediction for sample \n {target_example} \n is: {output}")


NameError: name 'inference_model' is not defined

Continuous values are bucketized into discrete categories using the `discretize_continuous_column` function.
We predict the bucket (range) into which the target value is most likely to fall.

In [None]:
# ## Step 1: Install Required Libraries
# Installs the Hugging Face stack and pandas from PyPI, then rtfm and
# tableshift straight from their GitHub repositories. No versions are pinned,
# so the resolved packages depend on when this cell is run.
# NOTE(review): tableshift is installed here together with its dependencies,
# whereas the setup cell at the top of the notebook installs it with
# --no-deps — confirm which behaviour is intended.
!pip install transformers accelerate pandas
!pip install git+https://github.com/mlfoundations/rtfm.git
!pip install git+https://github.com/mlfoundations/tableshift.git

# ## Step 2: Import Required Libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig
from rtfm.configs import TrainConfig, TokenizerConfig
from rtfm.inference_utils import InferenceModel
from rtfm.serialization.serializers import get_serializer
from rtfm.tokenization.text import prepare_tokenizer

# ## Step 3: Define Model Configuration and Load Model
# Configure the model and tokenizer to be used for inference
train_config = TrainConfig(model_name="mlfoundations/tabula-8b", context_length=8192)
tokenizer_config = TokenizerConfig()
# Kept for any later code that wants to know where computation happens.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model configuration
config = AutoConfig.from_pretrained(train_config.model_name)
config.torch_dtype = torch.bfloat16  # Match the TabuLa setup

# Load the model and tokenizer.
# - torch_dtype is passed explicitly: setting it on the config alone does not
#   select the dtype the weights are loaded in, so without this argument the
#   8B model may be materialised in default (float32) precision.
# - device_map="auto" already places the weights on the available device(s);
#   a dispatched model must not additionally be moved with .to(device), which
#   is redundant at best and raises when some modules are offloaded.
model = LlamaForCausalLM.from_pretrained(
    train_config.model_name,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(train_config.model_name)
serializer = get_serializer(train_config.serializer_cls)

# Prepare the tokenizer and model with necessary configurations.
# Serializer tokens are only registered when the tokenizer config asks for them.
serializer_tokens = (
    serializer.special_tokens if tokenizer_config.add_serializer_tokens else None
)
tokenizer, model = prepare_tokenizer(
    model,
    tokenizer=tokenizer,
    pretrained_model_name_or_path=train_config.model_name,
    model_max_length=train_config.context_length,
    use_fast_tokenizer=tokenizer_config.use_fast_tokenizer,
    serializer_tokens_embed_fn=tokenizer_config.serializer_tokens_embed_fn,
    serializer_tokens=serializer_tokens,
)

# Initialize inference model
inference_model = InferenceModel(model=model, tokenizer=tokenizer, serializer=serializer)

# ## Step 4: Prepare Data for Inference
# Example: Categorical Target Prediction
# Every row (labeled context and target alike) uses the same column layout.
weather_columns = [
    "location", "temperature", "humidity", "wind_speed", "pressure", "month",
    "weather_yesterday", "precipitation", "visibility", "weather_today",
]

# Labeled rows handed to the model as context (one tuple per row).
labeled_rows = [
    ("New York", 22, 65, 12, 1012, "July", "Sunny", 0, 10, "Partly Sunny"),
    ("Los Angeles", 26, 60, 7, 1015, "July", "Partly Sunny", 0, 10, "Sunny"),
]
labeled_examples = pd.DataFrame(
    [dict(zip(weather_columns, row)) for row in labeled_rows]
)

# The single row whose `weather_today` value is to be predicted.
target_row = ("San Jose", 23, 55, 8, 1013, "July", "Sunny", 0, 10, "Sunny")
target_example = pd.DataFrame([dict(zip(weather_columns, target_row))])

# ## Step 5: Perform Inference
# Predict weather_today for the target row, choosing among the candidate labels
weather_choices = ["Sunny", "Partly Sunny", "Cloudy", "Partly Cloudy", "Rain"]
output = inference_model.predict(
    labeled_examples=labeled_examples,
    target_example=target_example,
    target_colname="weather_today",
    target_choices=weather_choices,
)

# Print prediction
print(f"Prediction for sample \n {target_example} \n is: {output}")

# ## Step 6: Continuous Target Prediction
# Build a small housing table, bucketize its numeric target, then predict the bucket
from rtfm.serialization.serialization_utils import discretize_continuous_column

housing_columns = [
    "location", "size_sqft", "bedrooms", "bathrooms",
    "age", "lot_size_acres", "garage", "price",
]
housing_rows = [
    ("New York", 1200, 3, 2, 10, 0.15, True, 850),
    ("Los Angeles", 1500, 4, 3, 8, 0.25, True, 950),
]
examples = pd.DataFrame([dict(zip(housing_columns, row)) for row in housing_rows])

# Replace the numeric prices with their bucket labels; the distinct labels
# become the candidate answers offered to the model.
price_buckets = discretize_continuous_column(examples["price"], num_buckets=4)
examples["price"] = price_buckets
target_choices = examples["price"].unique().tolist()

# First row is the one to predict; every remaining row serves as labeled context.
target_example = examples.iloc[[0]]
labeled_examples = examples.iloc[1:]

# Predict continuous target
output = inference_model.predict(
    labeled_examples=labeled_examples,
    target_example=target_example,
    target_colname="price",
    target_choices=target_choices,
)

# Print prediction
print(f"Prediction for sample \n {target_example} \n is: {output}")
