# Privacy-Preserving Retrieval Networks

A differentially private data exchange.

### Steps to prove this out

1. Implement a DP ICL (differentially private in-context learning) generator that produces DP-protected synthetic examples
2. Index the DP-protected synthetic examples
3. Build a retriever over that index
4. Expose the retriever with a ContributorService
5. Connect a NetworkQueryEngine to these RetrieverContributorServices

#### References
[paper](https://openreview.net/pdf?id=oZtt0pRnOl) ◦ [github](https://github.com/microsoft/dp-few-shot-generation)

In [None]:
import math
import numpy as np
import os
import random
from typing import List, Dict, Optional

### Implementing DP ICL Algorithm

In [None]:
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import ChatMessage, MessageRole

# Zero-shot prompt body: states the target label and leaves the text slot
# open for the model to complete.
zero_shot_template = "{label_heading}: {label}\n{text_heading}:"

# One-shot prompt body: a single worked (label, text) example, a blank line,
# and then the same open slot as the zero-shot prompt.
one_shot_template = (
    "{label_heading}: {example_label}\n"
    "{text_heading}: {example_text}\n\n"
) + zero_shot_template

In [None]:
# Two-turn chat prompt: the system turn carries the task instruction and the
# user turn carries the one-shot template (worked example + open text slot).
message_templates = [
    ChatMessage(role=MessageRole.SYSTEM, content="{instruction}"),
    ChatMessage(role=MessageRole.USER, content=one_shot_template),
]
chat_template = ChatPromptTemplate(message_templates=message_templates)

In [None]:
# Render the one-shot chat prompt for a concrete case: the worked example and
# the requested example share the answer-type label "Number".
messages = chat_template.format_messages(
    instruction=(
        "Given a label of answer type, generate a question based on the "
        "given answer type accordingly."
    ),
    text_heading="Text",
    label_heading="Answer Type",
    example_text="How many people in the world speak French?",
    example_label="Number",
    label="Number",
)

### Create A New LlamaDataset: `LabelledSimpleDataset`

In [None]:
from llama_index.core.llama_dataset.base import (
    BaseLlamaDataExample,
    BaseLlamaDataset,
    CreatedBy,
)
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.bridge.pydantic import Field

In [None]:
# A single labelled text example: a class label, the text body, and
# provenance metadata describing whether/how the text was generated.
class LabelledSimpleDataExample(BaseLlamaDataExample):
    # Class label of the example; defaults to the empty string.
    label: str = Field(default_factory=str, description="Class label")
    # Text body of the example; defaults to the empty string.
    text: str = Field(default_factory=str, description="Text body of example")
    # Flags synthetic (generated) examples; defaults to False.
    is_synthetic: bool = Field(
        default=False,
        description="Whether or not the example was synthetically generated.",
    )
    # Optional record of what produced `text`.
    # NOTE(review): the description says "query" although the field is about
    # the text -- looks like a copy-paste leftover; confirm before changing.
    text_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the query."
    )

    @property
    def class_name(self) -> str:
        """Return the name of this data example class as a string."""
        return "LabelledSimpleDataExample"


# Dataset container whose examples are `LabelledSimpleDataExample` objects.
# NOTE(review): no methods are overridden here; if `BaseLlamaDataset`
# declares abstract methods, this class cannot be instantiated as-is -- confirm.
class LabelledSimpleDataset(BaseLlamaDataset[BaseQueryEngine]):
    # Example type associated with this dataset.
    _example_type = LabelledSimpleDataExample

### Using `OpenAI` and `ChatResponse`

In [None]:
from llama_index.llms.openai import OpenAI

In [None]:
# LLM client configured for next-token inspection: each call is capped at a
# single generated token and requests log-probabilities (top_logprobs=20).
llm = OpenAI(
    model="gpt-3.5-turbo",
    logprobs=True,
    top_logprobs=20,
    max_tokens=1,
)

In [None]:
# Send the rendered one-shot prompt to the LLM and keep the chat response.
response = llm.chat(messages)

In [None]:
# Display the log-probabilities attached to the response (candidate tokens).
response.logprobs

[[LogProb(token='How', logprob=-0.11545969, bytes=[72, 111, 119]),
  LogProb(token='What', logprob=-2.372576, bytes=[87, 104, 97, 116]),
  LogProb(token='Approx', logprob=-4.2451596, bytes=[65, 112, 112, 114, 111, 120]),
  LogProb(token='Can', logprob=-8.133478, bytes=[67, 97, 110]),
  LogProb(token='Est', logprob=-8.5663395, bytes=[69, 115, 116]),
  LogProb(token='"', logprob=-8.78406, bytes=[34]),
  LogProb(token='According', logprob=-9.151157, bytes=[65, 99, 99, 111, 114, 100, 105, 110, 103]),
  LogProb(token='In', logprob=-9.599209, bytes=[73, 110]),
  LogProb(token=' How', logprob=-9.692478, bytes=[32, 72, 111, 119]),
  LogProb(token='Do', logprob=-9.830822, bytes=[68, 111]),
  LogProb(token='"How', logprob=-9.891071, bytes=[34, 72, 111, 119]),
  LogProb(token='R', logprob=-9.958864, bytes=[82]),
  LogProb(token='About', logprob=-10.373207, bytes=[65, 98, 111, 117, 116]),
  LogProb(token='To', logprob=-10.376238, bytes=[84, 111]),
  LogProb(token='Question', logprob=-10.397839, by

### Implement DP ICL Algorithm

Algorithm Pseudocode

```python
t_max = 1
sigma = 0.5
num_splits = 5
num_samples_per_split = 1
examples: List[LabelledSimpleDataExample] = ...
dataset = LabelledSimpleDataset(examples=examples)

# privacy params
delta = 1 / len(dataset.examples)

synthetic_example = ""
for _ in range(t_max):
    
    # reduced token universe
    token_universe_messages = get_messages_for_reduced_token_universe(synthetic_example)
    response = llm.chat(token_universe_messages)
    token_universe_probas = {el.token: np.exp(el.logprob) for el in response.logprobs}

    # split the private dataset
    disjoint_splits = split_dataset(dataset, num_splits, num_samples_per_split)

    # generate next token probability distributions per split
    splits = []
    for split in disjoint_splits:

        split_probs = {token: 0 for token in token_universe_probas.keys()}
        messages = get_messages_for_synthetic_generation(split, synthetic_example)
        response = llm.chat(messages)

        # updating and (rescaling) split probs
        for el in response.logprobs:
            if el.token in split_probs:
                split_probs[el.token] = np.exp(el.logprob)
        split_probs = normalize(split_probs)  # to make into a valid prob distribution

        splits.append(split_probs)
    
    # noisy aggregation
    sigma_calib = math.sqrt(2) / num_splits * sigma
    noise = generate_noise(sigma=sigma_calib, size=len(token_universe_probas))
    agg_probs = merge_probas(splits) + noise

    # next token
    synthetic_example += mode_of_distribution(agg_probs) 
```

#### Helper functions

In [None]:
def eps_to_sigma(eps: float, delta: float, mode: str = "gaussian") -> float:
    """Return the Gaussian noise scale for a given (eps, delta) privacy budget.

    Computes ``sqrt(2) * sqrt(ln(1.25 / delta)) / eps``.

    Source: https://programming-dp.com/ch6.html#the-gaussian-mechanism

    Args:
        eps: Privacy parameter epsilon; must be strictly positive.
        delta: Privacy parameter delta; must lie strictly between 0 and 1.
        mode: Accepted for interface compatibility but currently unused;
            only the Gaussian formula above is implemented.

    Returns:
        The scale (standard deviation) parameter ``sigma``.

    Raises:
        ValueError: If ``eps`` or ``delta`` is outside its valid range.
    """
    # Reject budgets for which the formula would silently yield inf or NaN
    # (division by zero, or the log of a non-positive / too-large ratio).
    if not eps > 0:
        raise ValueError(f"eps must be strictly positive, got {eps!r}")
    if not 0 < delta < 1:
        raise ValueError(f"delta must be in the open interval (0, 1), got {delta!r}")

    sensitivity_upper_bound = np.sqrt(2)
    return (sensitivity_upper_bound * np.sqrt(np.log(1.25 / delta))) / eps

In [None]:
# Example: noise scale for eps=1.0 with delta set to 1/30000.
sigma = eps_to_sigma(eps=1.0, delta=1 / 30000)
sigma

4.589574318378231

In [None]:
def get_messages_for_reduced_token_universe(
    synthetic_example: str,
    instruction: str,
    label_heading: str,
    text_heading: str,
    label: str,
) -> List[ChatMessage]:
    """Build the zero-shot chat prompt used to obtain the reduced token universe.

    The prompt has a system turn holding ``instruction`` and a user turn
    rendered from ``zero_shot_template`` with the given headings and label.

    NOTE: ``synthetic_example`` is accepted but not used when building the
    messages.
    """
    system_turn = ChatMessage(role=MessageRole.SYSTEM, content="{instruction}")
    user_turn = ChatMessage(role=MessageRole.USER, content=zero_shot_template)
    prompt = ChatPromptTemplate(message_templates=[system_turn, user_turn])

    return prompt.format_messages(
        instruction=instruction,
        label_heading=label_heading,
        text_heading=text_heading,
        label=label,
    )

In [None]:
def split_dataset(
    dataset: LabelledSimpleDataset, num_splits: int, num_samples_per_split: int
) -> List[LabelledSimpleDataset]:
    """Split a dataset into disjoint datasets with an equal number of examples.

    The example indexes are shuffled and dealt round-robin into
    ``num_splits`` partitions; ``num_samples_per_split`` examples are then
    drawn without replacement from each partition.

    Args:
        dataset: Dataset whose ``examples`` are to be split.
        num_splits: Number of disjoint splits to produce.
        num_samples_per_split: Number of examples kept in each split.

    Returns:
        One dataset per split, each holding ``num_samples_per_split`` examples.

    Raises:
        ValueError: If the dataset is too small to give every split
            ``num_samples_per_split`` examples.
    """
    num_examples = len(dataset.examples)
    # The smallest partition holds floor(n / num_splits) indexes; fail early
    # with a descriptive message rather than deep inside random.sample.
    if num_splits > 0 and num_examples // num_splits < num_samples_per_split:
        raise ValueError(
            f"Cannot draw {num_samples_per_split} example(s) for each of "
            f"{num_splits} split(s) from a dataset of {num_examples} example(s)."
        )

    indexes = list(range(num_examples))
    random.shuffle(indexes)
    partitions = [indexes[i::num_splits] for i in range(num_splits)]

    splits = []
    for partition in partitions:
        sample = random.sample(partition, num_samples_per_split)
        examples = [dataset.examples[ix] for ix in sample]
        splits.append(LabelledSimpleDataset(examples=examples))
    return splits

In [None]:
def get_messages_for_synthetic_generation(
    split: LabelledSimpleDataset,
    synthetic_example: str,
    instruction: str,
    label_heading: str,
    text_heading: str,
    label: str,
) -> List[ChatMessage]:
    """Get chat messages to produce the next token probabilities for a given split.

    The prompt has a system turn holding ``instruction`` and a user turn
    rendered from ``one_shot_template``, using the first example of ``split``
    as the worked example.

    Args:
        split: Dataset split supplying the worked example; only
            ``split.examples[0]`` is read.
        synthetic_example: Accepted but not used when building the messages.
        instruction: Text of the system message.
        label_heading: Heading placed before each label in the prompt.
        text_heading: Heading placed before each text in the prompt.
        label: Label for which a new text is requested.

    Returns:
        The formatted chat messages.

    Raises:
        IndexError: If ``split`` has no examples.
    """
    message_templates = [
        ChatMessage(content="{instruction}", role=MessageRole.SYSTEM),
        ChatMessage(
            content=one_shot_template,
            role=MessageRole.USER,
        ),
    ]
    chat_template = ChatPromptTemplate(message_templates=message_templates)

    # One-shot prompt: a single demonstration taken from the split.
    demonstration = split.examples[0]
    return chat_template.format_messages(
        instruction=instruction,
        label_heading=label_heading,
        text_heading=text_heading,
        example_label=demonstration.label,
        example_text=demonstration.text,
        label=label,
    )

In [None]:
def normalize(split_probs: Dict[str, float]) -> Dict[str, float]:
    """Rescale token masses so that they form a valid probability distribution.

    Args:
        split_probs: Mapping from token to a non-negative mass.

    Returns:
        A new mapping over the same tokens whose values sum to 1. If every
        mass is zero, a uniform distribution over the tokens is returned
        instead of dividing by zero. An empty mapping yields an empty mapping.
    """
    scale = sum(split_probs.values())
    if scale == 0:
        # No mass on any token, so there is nothing to rescale; fall back to
        # uniform to keep the result a valid distribution.
        if not split_probs:
            return {}
        uniform = 1 / len(split_probs)
        return {token: uniform for token in split_probs}
    return {token: proba / scale for token, proba in split_probs.items()}

In [None]:
def generate_noise(sigma: float, size: int) -> np.ndarray:
    """Draw zero-mean Gaussian noise with standard deviation ``sigma``.

    Args:
        sigma: Standard deviation of the noise; must be non-negative.
        size: Number of independent noise values to draw.

    Returns:
        A NumPy array of ``size`` noise samples.
    """
    # The generator must actually be instantiated; a bare annotation leaves
    # the local name unbound. Without a seed it is initialised from fresh
    # entropy on every call.
    noise_rng = np.random.RandomState()
    return noise_rng.normal(0, sigma, size=size)

In [None]:
def merge_probas(list_of_probas: List[Dict[str, float]]) -> Dict[str, float]:
    """Average several probability distributions defined over a common token universe.

    The tokens of the first distribution define the keys of the result; each
    value is the sum over all distributions of that token's probability
    divided by the number of distributions.
    """
    num_distributions = len(list_of_probas)
    reference = list_of_probas[0]
    return {
        token: sum(dist[token] / num_distributions for dist in list_of_probas)
        for token in reference
    }

In [None]:
def mode_of_distribution(proba: Dict[str, float]) -> str:
    """Return the token carrying the largest value in ``proba``.

    When several tokens tie for the largest value, the one that appears
    first in the mapping is returned.
    """
    best_token, _ = max(proba.items(), key=lambda item: item[1])
    return best_token

In [None]:
# Display the model name the LLM client is configured with.
llm.model

'gpt-3.5-turbo'