In [2]:
print(2)

2


In [3]:
import openai
import sklearn
import pandas as pd
import functools

In [4]:
api_key = ""
openai.api_key = api_key

In [5]:
# Load the data
data = pd.read_json("data/frichti_clean_prepared (2).jsonl", lines=True)

true_examples = data["completion"].values.tolist()

In [6]:
from tqdm import tqdm
# import gpt2 tokenizer from transformers
from transformers import GPT2Tokenizer

@functools.lru_cache(maxsize=1000, typed=False)
def generate_examples_from_prompts(prompt_prefix, prompt_suffix, prompt_list, n_examples_per_prompt=10, engine="text-davinci-003", max_tokens=250):
    """
    Generate examples from a list of prompts, with openai API
    """
    examples = []
    for prompt in prompt_list:
        prompt = prompt_prefix + prompt + prompt_suffix
        response = openai.Completion.create(
            engine=engine,
            prompt=prompt,
            temperature=0.7,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            n=n_examples_per_prompt,
            stop=["END"]
        )
        examples.extend([choice["text"] for choice in response["choices"]])

    # Add the stop token to the end of the examples
    examples = [example + "\nEND" for example in examples]
    return examples


In [7]:
# Take 4 examples at random for the prompt prefix, and remove them from the dataset
# The examples should be long enough to be representative of the dataset

seed = 56
prompts = sklearn.utils.resample(true_examples, n_samples=4, random_state=seed)

prompt_prefix = "Voici des exemples de newsletter Frichti:\n"
for i, prompt in enumerate(prompts):
    prompt_prefix += f"Example {i + 1} \n"
    prompt_prefix += prompt
    prompt_prefix += "\n"

print("Prompt prefix: ", prompt_prefix)

Prompt prefix:  Voici des exemples de newsletter Frichti:
Example 1 
 DU FAIT MAISON EN MOINS DE 10 MIN ????? 😱? 
 Contenu: Le plat le plus rapide du monde fait son retour !

Quelle surprise, c’est déjà prêt ! 💛

Des amis qui débarquent sans prévenir ?
Frichti à la rescousse ! On s’occupe de tout.
Un gratin dauphinois livré en 10min !
Quelques instants au four, et c’est déjà l’heure de passer à table !
On vous dit bon appétit 😋

La cuisine, c’est (super) facile avec Frichti !

END
Example 2 
 L’histoire commence à 9h45 dans le Dauphiné? 
 Contenu: Des abricots cultivés sur les hauteurs de la vallée du Rhône.  

Sur les hauteurs de la vallée du Rhône, Alain cultive des abricots dans son verger certifié Haute Valeur Environnementale. En altitude, ce fruit pousse plus lentement, juste le temps qu’il lui faut pour développer tous ses arômes.

END
Example 3 
 Mamma Mia !!! 😱🇮🇹? 
 Contenu: La nouvelle carte d’été de Pastavino est disponible dès maintenant chez Frichti

La référence de la cui

In [None]:
# Generate examples from the prompt prefix
prompt_suffix = "Ecris une nouvelle newsletter Frichti du même style que les exemples:"
prompts = ('',) # tuple to be able to use lru_cache
max_tokens = 250
generated_examples = generate_examples_from_prompts(prompt_prefix, prompt_suffix, prompts, n_examples_per_prompt=100, max_tokens=max_tokens)
for i, example in enumerate(generated_examples):
    print(f"Example {i}: {example}")


In [None]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Compute the number of tokens in all generated examples
n_tokens = [len(tokenizer.encode(example)) for example in generated_examples]
print("Number of tokens in generated examples: ", n_tokens)

# Remove the truncated examples
max_tokens = 250
print("Number of examples before removing truncated examples: ", len(generated_examples))
generated_examples = [example for example, n_tokens in zip(generated_examples, n_tokens) if n_tokens < max_tokens - 2]
print("Number of examples after removing truncated examples: ", len(generated_examples))


In [None]:
for i, example in enumerate(generated_examples):
    print(f"Example {i}: {example}")

# Prompt Newtone

In [None]:
# pour l'instant j'enlève le menu de la semaine pour avoir un prompt standard
# et les keywords aussi
# prompt_prefix = """
# Etape 1 : écris une Newsletter en (bo environ 105 mots sur le sujet suivant: Ecris moi une newsletter pour "Frichti", notre startup de livraison de plats préparés.
# Etape 2 : Ecris un objet du mail au début.

# Etape 4 : - un ton très familier et amical
# - Utilise le tutoiement
# - Ajoute quelques emojis en lien avec l'aliment mentionné
# - Signe à la fin l'Equipe Frichti
# """
# prompt_suffix = ""
# generated_examples = generate_examples_from_prompts(prompt_prefix, prompt_suffix, prompts, n_examples_per_prompt=100)
# for i, example in enumerate(generated_examples):
#     print(f"Example {i}: {example}")

In [None]:
# Embed the true and generated examples with the openai api and train a classifier
import numpy as np

@functools.lru_cache(maxsize=1000, typed=False)
def get_embedding(examples, model="text-embedding-ada-002"):
   #text = text.replace("\n", " ")
   embeddings =  openai.Embedding.create(input = examples, model=model)['data']
   # Create a numpy array of the embeddings
   embeddings = np.array([np.array(embedding["embedding"]) for embedding in embeddings])
   return embeddings
 



In [None]:
# Train a classifier to distinguish between the true and generated examples
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

@functools.lru_cache(maxsize=1000, typed=False)
def score_examples_classifier(true_examples, generated_examples, n_true=None, n_trials=1, show_examples=False):
    embeddings_generated = get_embedding(tuple((generated_examples)))
    embeddings_true = get_embedding(tuple(true_examples))

    res_dic  = {}
    clf_list = [LogisticRegression, RandomForestClassifier]
    clf_names = ["LogisticRegression", "RandomForestClassifier"]
    for clf, clf_name in zip(clf_list, clf_names):
        # Create empty arrays for each classifier and each metric
        res_dic[clf_name] = {}
        res_dic[clf_name]["accuracy"] = np.zeros(n_trials)
        res_dic[clf_name]["precision"] = np.zeros(n_trials)
        res_dic[clf_name]["recall"] = np.zeros(n_trials)
        res_dic[clf_name]["f1"] = np.zeros(n_trials)
        res_dic[clf_name]["min_accuracy"] = np.zeros(n_trials)
        res_dic[clf_name]["mean_generated_proba"] = np.zeros(n_trials)
        res_dic[clf_name]["mean_true_proba"] = np.zeros(n_trials)

        

    if n_true is None:
        n_true = len(embeddings_true)
    for i in range(n_trials):
        chosen_true_examples_indices = np.random.choice(len(embeddings_true), n_true, replace=False)
        embeddings_true_chosen = embeddings_true[chosen_true_examples_indices]

        X = np.concatenate([embeddings_generated, embeddings_true_chosen])
        y = np.concatenate([np.zeros(len(embeddings_generated)), np.ones(n_true)])


        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


        for clf, clf_name in zip(clf_list, clf_names):
            clf = clf(random_state=i)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            res_dic[clf_name]["accuracy"][i] = accuracy_score(y_test, y_pred)
            res_dic[clf_name]["precision"][i] = precision_score(y_test, y_pred)
            res_dic[clf_name]["recall"][i] = recall_score(y_test, y_pred)
            res_dic[clf_name]["f1"][i] = f1_score(y_test, y_pred)
            res_dic[clf_name]["min_accuracy"][i] = max(1 - np.mean(y_test), np.mean(y_test))
            y_pred_proba = clf.predict_proba(X_test)
            # compute the mean probablity of the generated examples being true
            res_dic[clf_name]["mean_generated_proba"][i] = np.mean(y_pred_proba[y_test == 0, 1])
            # compute the mean probablity of the true examples being true
            res_dic[clf_name]["mean_true_proba"][i] = np.mean(y_pred_proba[y_test == 1, 1])


            if show_examples:
                # Sort the generated examples by their probability of being true
                y_pred_proba = clf.predict_proba(X_test)
                y_pred_proba_true = y_pred_proba[:, 1]
                y_pred_proba_true_sorted_indices = np.argsort(y_pred_proba_true)[::-1]
                print(f"--------- Classifier: {clf_name} ---------")
                print("Generated examples sorted by probability of being true:")
                print("----------------------------------------")
                for j in y_pred_proba_true_sorted_indices:
                    if y_test[j] == 0: # generated example
                        print(f"Example: {generated_examples[j]}")
                        print(f"Probability of being true: {y_pred_proba_true[j]}")
                        print(f"True label: {y_test[j]}")
                        print(f"Predicted label: {y_pred[j]}")
                        print("")
                        print("----------------------------------------")

                # Sort the true examples by their probability of being false
                y_pred_proba_false = y_pred_proba[:, 0]
                y_pred_proba_false_sorted_indices = np.argsort(y_pred_proba_false)[::-1]
                print("True examples sorted by probability of being false:")
                print("----------------------------------------")
                for j in y_pred_proba_false_sorted_indices:
                    if y_test[j] == 1:
                        print(f"Example: {true_examples[j]}")
                        print(f"Probability of being true: {y_pred_proba_true[j]}")
                        print(f"True label: {y_test[j]}")
                        print(f"Predicted label: {y_pred[j]}")
                        print("")
                        print("----------------------------------------")




    
    return res_dic





In [None]:
res_dic = score_examples_classifier(tuple(true_examples), tuple(generated_examples), n_true=100, n_trials=1, show_examples=True)

In [None]:
res_dic

In [None]:
res_dic

In [None]:
res_dic

# Another scoring method: compute the perplexity of the true examples

In [None]:
perplexities = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]