In [1]:
!pip install xgboost transformers "ray[data,train]" --quiet
!pip install "ray[serve]
# Cell 1: Install Necessary Libraries (if not already installed)
!pip install --upgrade ray xgboost scikit-learn pandas --quiet

zsh:1: unmatched "


In [2]:
from typing import Tuple

import ray
from ray.data import Dataset, Preprocessor
from ray.data.preprocessors import StandardScaler
from ray.train.xgboost import XGBoostTrainer
from ray.train import Result, ScalingConfig
import xgboost
import pandas as pd
from ray.train import Checkpoint

  from .autonotebook import tqdm as notebook_tqdm
2024-11-06 21:39:08,375	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-11-06 21:39:08,716	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-11-06 21:39:09,045	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:
    dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")
    train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)
    test_dataset = valid_dataset.drop_columns(["target"])
    return train_dataset, valid_dataset, test_dataset

In [4]:
def train_xgboost(num_workers: int, use_gpu: bool = False) -> Result:
    train_dataset, valid_dataset, _ = prepare_data()

    # Scale some random columns
    columns_to_scale = ["mean radius", "mean texture"]
    preprocessor = StandardScaler(columns=columns_to_scale)
    train_dataset = preprocessor.fit_transform(train_dataset)
    valid_dataset = preprocessor.transform(valid_dataset)

    # XGBoost specific params
    params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    trainer = XGBoostTrainer(
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        label_column="target",
        params=params,
        datasets={"train": train_dataset, "valid": valid_dataset},
        num_boost_round=100,
        metadata = {"preprocessor_pkl": preprocessor.serialize()}
    )
    result = trainer.fit()
    print(result.metrics)

    return result

In [5]:
class Predict:

    def __init__(self, checkpoint: Checkpoint):
        self.model = XGBoostTrainer.get_model(checkpoint)
        self.preprocessor = Preprocessor.deserialize(checkpoint.get_metadata()["preprocessor_pkl"])

    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
        preprocessed_batch = self.preprocessor.transform_batch(batch)
        dmatrix = xgboost.DMatrix(preprocessed_batch)
        return {"predictions": self.model.predict(dmatrix)}


def predict_xgboost(result: Result):
    _, _, test_dataset = prepare_data()

    scores = test_dataset.map_batches(
        Predict, 
        fn_constructor_args=[result.checkpoint], 
        concurrency=1, 
        batch_format="pandas"
    )
    
    predicted_labels = scores.map_batches(lambda df: (df > 0.5).astype(int), batch_format="pandas")
    predicted_labels.show()

In [6]:
# Cell 2: Initialize Ray
import ray

if not ray.is_initialized():
    ray.init(ignore_reinit_error=True)  # Starts Ray locally

# Cell 3: Import Libraries
import xgboost
import pandas as pd
from typing import Tuple

2024-11-06 21:39:09,175	INFO worker.py:1631 -- Connecting to existing Ray cluster at address: 127.0.0.1:64780...
[2024-11-06 21:39:14,188 E 35024 3714882] gcs_rpc_client.h:179: Failed to connect to GCS at address 127.0.0.1:64780 within 5 seconds.
[2024-11-06 21:39:44,219 W 35024 3714882] gcs_client.cc:178: Failed to get cluster ID from GCS server: TimedOut: Timed out while waiting for GCS to become available.
[2024-11-06 21:39:50,232 E 35024 3714882] gcs_rpc_client.h:179: Failed to connect to GCS at address 127.0.0.1:64780 within 5 seconds.
[2024-11-06 21:40:20,263 W 35024 3714882] gcs_client.cc:178: Failed to get cluster ID from GCS server: TimedOut: Timed out while waiting for GCS to become available.
[2024-11-06 21:40:26,273 E 35024 3714882] gcs_rpc_client.h:179: Failed to connect to GCS at address 127.0.0.1:64780 within 5 seconds.
[2024-11-06 21:40:56,303 W 35024 3714882] gcs_client.cc:178: Failed to get cluster ID from GCS server: TimedOut: Timed out while waiting for GCS to becom

ConnectionError: 

In [None]:
# Cell 4: Define Functions
@ray.remote
class Predict:
    def __init__(self, checkpoint):
        # Initialize your model and preprocessor here
        self.model = xgboost.Booster()
        self.model.load_model(checkpoint)
        # self.preprocessor = YourPreprocessor()  # Define your preprocessor

    def transform_batch(self, batch):
        # Implement your preprocessing here
        # preprocessed_batch = self.preprocessor.transform(batch)
        return batch  # Replace with actual preprocessing

    def predict(self, batch):
        preprocessed_batch = self.transform_batch(batch)
        dmatrix = xgboost.DMatrix(preprocessed_batch)
        return {"predictions": self.model.predict(dmatrix)}

def train_xgboost(num_workers: int, use_gpu: bool = False) -> ray.actor.ActorHandle:
    # Prepare data
    train_dataset, valid_dataset, _ = prepare_data()

    # Initialize the Predictor actor
    predictor = Predict.remote('path_to_model_checkpoint')  # Replace with actual path

    # Example prediction (replace with your actual training logic)
    batch = train_dataset.take(10)  # Example batch
    result = ray.get(predictor.predict.remote(batch))
    print(result)
    return predictor

# Cell 5: Train the Model
result = train_xgboost(num_workers=2, use_gpu=False)

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pandas-2.1.4:
      Successfully uninstalled pandas-2.1.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
modin 0.25.1 requires pandas<2.2,>=2.1, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.2.3


2024-11-06 20:40:48,218	INFO worker.py:1631 -- Connecting to existing Ray cluster at address: 127.0.0.1:64780...
[2024-11-06 20:40:53,226 E 20532 3648458] gcs_rpc_client.h:179: Failed to connect to GCS at address 127.0.0.1:64780 within 5 seconds.
[2024-11-06 20:41:23,255 W 20532 3648458] gcs_client.cc:178: Failed to get cluster ID from GCS server: TimedOut: Timed out while waiting for GCS to become available.
[2024-11-06 20:41:29,264 E 20532 3648458] gcs_rpc_client.h:179: Failed to connect to GCS at address 127.0.0.1:64780 within 5 seconds.
[2024-11-06 20:41:59,294 W 20532 3648458] gcs_client.cc:178: Failed to get cluster ID from GCS server: TimedOut: Timed out while waiting for GCS to become available.
[2024-11-06 20:42:05,306 E 20532 3648458] gcs_rpc_client.h:179: Failed to connect to GCS at address 127.0.0.1:64780 within 5 seconds.
[2024-11-06 20:42:35,333 W 20532 3648458] gcs_client.cc:178: Failed to get cluster ID from GCS server: TimedOut: Timed out while waiting for GCS to becom

In [None]:
predict_xgboost(result)

2024-11-05 23:03:40,300	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-05_21-43-13_393763_11030/logs/ray-data
2024-11-05 23:03:40,300	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV]
                                                                                                 
✔️  Dataset execution finished in 0.83 seconds: 100%|██████████| 569/569 [00:00<00:00, 686 row/s]

- ReadCSV->SplitBlocks(16): Tasks: 0; Queued blocks: 0; Resources: 0.0 CPU, 94.2KB object store: : 569 row [00:00, 688 row/s]
2024-11-05 23:03:41,137	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-05_21-43-13_393763_11030/logs/ray-data
2024-11-05 23:03:41,137	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV]
                                                         

{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 0}
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}





In [None]:
# File name: model.py
from transformers import pipeline

class Translator:
    def __init__(self):
        # Load model
        self.model = pipeline("translation_en_to_fr", model="t5-small")

    def translate(self, text: str) -> str:
        # Run inference
        model_output = self.model(text)

        # Post-process output to return only the translation text
        translation = model_output[0]["translation_text"]

        return translation


In [None]:
from ray import serve

translator = Translator()

translation = translator.translate("Hello world!")
print(translation)

ModuleNotFoundError: No module named 'grpc'. You can run `pip install "ray[serve]"` to install all Ray Serve dependencies.

In [None]:
import ray
from fastapi import FastAPI

from transformers import pipeline

app = FastAPI()


In [None]:
@serve.deployment(num_replicas=2, ray_actor_options={"num_cpus": 0.2, "num_gpus": 0})
@serve.ingress(app)
class Translator:
    def __init__(self):
        # Load model
        self.model = pipeline("translation_en_to_fr", model="t5-small")

    @app.post("/")
    def translate(self, text: str) -> str:
        # Run inference
        model_output = self.model(text)

        # Post-process output to return only the translation text
        translation = model_output[0]["translation_text"]

        return translation

translator_app = Translator.bind()

In [None]:
# File name: model_client.py
import requests
from ray import serve

response = requests.post("http://127.0.0.1:8000/", params={"text": "Hello world!"})
french_text = response.json()

print(french_text)

ModuleNotFoundError: No module named 'grpc'. You can run `pip install "ray[serve]"` to install all Ray Serve dependencies.

In [None]:
import os
import tempfile
import numpy as np
from starlette.requests import Request
from typing import Dict

In [None]:
import tensorflow as tf

In [None]:
TRAINED_MODEL_PATH = os.path.join(tempfile.gettempdir(), "mnist_model.h5")

def train_and_save_model():
    # Load mnist dataset
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Train a simple neural net model
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10),
        ]
    )
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
    model.fit(x_train, y_train, epochs=1)

    model.evaluate(x_test, y_test, verbose=2)
    model.summary()

    # Save the model in h5 format in local file system
    model.save(TRAINED_MODEL_PATH)

In [None]:
if not os.path.exists(TRAINED_MODEL_PATH):
    train_and_save_model()

In [None]:
@serve.deployment
class TFMnistModel:
    def __init__(self, model_path: str):
        import tensorflow as tf

        self.model_path = model_path
        self.model = tf.keras.models.load_model(model_path)

    async def __call__(self, starlette_request: Request) -> Dict:
        # Step 1: transform HTTP request -> tensorflow input
        # Here we define the request schema to be a json array.
        input_array = np.array((await starlette_request.json())["array"])
        reshaped_array = input_array.reshape((1, 28, 28))

        # Step 2: tensorflow input -> tensorflow output
        prediction = self.model(reshaped_array)

        # Step 3: tensorflow output -> web output
        return {"prediction": prediction.numpy().tolist(), "file": self.model_path}

NameError: name 'serve' is not defined

In [None]:
import requests
import numpy as np

In [None]:
resp = requests.get(
    "http://localhost:8000/", json={"array": np.random.randn(28 * 28).tolist()}
)
print(resp.json())