In [1]:
import json
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from random import shuffle, choice

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = 'data/WizardLM_evol_instruct_V2_143k.json'
tokenizer = AutoTokenizer.from_pretrained("weights/Phi-3-mini-128k-instruct")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def contains_unwanted_words(text):
    unwanted_words = [
        "text-based AI language model",
        "domestic violence",
        "please refrain",
        "derogatory",
        "inappropriate",
        "offensive",
        "racism",
        "racist",
        "racial",
        "discriminate",
        "discriminatory",
        "discrimination",
        "sexist",
        "sexism",
        "unacceptable",
        "inclusive workplace",
        "lgbt",
        "morals",
        "ethics",
        "ethical",
        "legality",
        "illegal",
        "illegality",
        "hateful",
        "harmful",
        "it is never okay",
        "It is important to",
        "It's important to",
        "real-world consequences",
        "hate speech",
        "glorify",
        "not be appropriate",
        "supremacist",
        "extremist",
        "responsible AI",
        "AI principles",
        "AI assistant",
        "an AI language",
        "ableist",
        "hurtful",
        "gender stereotype",
        "gender inequality",
        "underrepresentation",
        "safe spaces",
        "gender-based",
        "inclusivity",
        "feminist",
        "feminism",
        "transgender",
        "empowerment",
        "communist",
        "capitalism",
        "stereotypes",
        "biases",
        "bias",
        "Microaggression",
        "prioritize human safety",
        "as a language model",
        "as an AI language model",
        "As a large language model",
        "As an AI",
        "ethical principles",
        "consensual",
        "it is not appropriate",
        "it's not appropriate",
        "I cannot fulfill your request",
        "harmful to human beings",
        "ethical guidelines",
        "my guidelines",
        "prioritize user safety",
        "adhere to ethical guidelines",
        "harmful consequences",
        "potentially harmful",
        "dangerous activities",
        "promote safety",
        "well-being of all users",
        "responsible information sharing",
        "jeopardize the safety",
        "illegal actions or intentions",
        "undermine the stability",
        "promote the well-being",
        "illegal activities or actions",
        "adherence to the law",
        "potentially be harmful",
        "illegal substances or activities",
        "committed to promoting",
        "safe information",
        "lawful information",
        "cannot provide guidance",
        "cannot provide information",
        "unable to offer assistance",
        "cannot engage in discussions",
        "programming prohibits",
        "follow ethical guidelines",
        "ensure the safety",
        "involves an illegal subject",
        "prioritize safety",
        "illegal subject",
        "prioritize user well-being",
        "cannot support or promote",
        "activities that could harm",
        "pose a risk to others",
        "against my programming",
        "activities that could undermine",
        "potentially dangerous",
        "not within the scope",
        "designed to prioritize safety",
        "not able to provide",
        "maintain user safety",
        "adhere to safety guidelines",
        "dangerous or harmful",
        "cannot provide any information",
        "focus on promoting safety",
        "an AI language model you don't have",
        "As an AI language model, I cannot",
        "As an AI language model, I do not",
        "As an AI language model, I am not able",
        "As an AI language model, I don't have personal",
        "I am an AI language model and do not",
        "However,it is important to use any code or information provided responsibly and within legal and ethical boundaries.",
        "As an AI language model, I don't have",
        "As an AI language model, I am only able",
        "AI language model and I do not",
        "As an AI language model, I cannot modify",
        "I know as an AI language model you don't have",
        "as an AI language model, you cannot",
        "I'm sorry, but as an AI language model",
        "Unfortunately, I cannot provide",
        "I'm sorry, I cannot",
        "I'm sorry, I cannot generate",
        "AI cannot create or program",
        "I'm afraid I cannot create",
        "you cannot create an",
        "it operates ethically and is",
        "had an ethical system",
        "Ensuring the ethical",
        "and ethical sourcing",
        "are from ethical",
        "legal and ethical",
        "engage in unethical",
        "unethical or aggressive",
        "unethical business",
        "como modelo de lenguaje AI",
        "Lo siento, como modelo de lenguaje",
        "no puedo proporcionar",
        "pero debido a mi capacidad para generar códigos complejos y completos es limitado",
        "Lo siento, pero no puedo",
        "Lo siento, pero como modelo de lenguaje, no puedo proporcionar",
        "Lo siento, como modelo de lenguaje, no tengo",
        "Lo siento, debe haber habido una confusión",
        "Lo siento, como modelo de lenguaje, no puedo realizar",
        "Lo siento, soy un modelo de lenguaje y no tengo la capacidad de generar",
        "Lamento no poder proporcionarte el código",
        "Desculpe-me, mas a linguagem vulgar e ofensiva",
        "apropriada em nenhum contexto",
        "Como modelo de linguagem",
        "Como um modelo de linguagem, não tenho a capacidade de",
        "I cannot assist",
        "prioritize ethical",
        "respectful",
        "morally",
        "I'm sorry,",
        "I'm an",
        "I am an",
        "I'm an AI",
        "I am an AI",
        "my purpose",
        "filter_bad_language",
        "entertainment purposes",
        "purely hypothetical",
        "not a human",
        "cannot provide",
        "can't provide",
        "won't provide",
        "not provide",
        "worth noting",
        "cause harm",
        "a language model",
        "keep in mind",
        "unethical",
        "bad language",
        "the words ****",
        "bad_language",
        "certainly not",
        "complying",
        "comply",
        "I cannot",
        "my main goal",
        "As a machine",
        "I don't have the ability",
        "I am here to assist",
        "my purpose is to ",
        "my knowledge cutoff",
        "my knowledge cut off",
        "September 2021",
        "regulations",
        "not be suitable",
        "I apologize, but",
        "It is not possible",
        "controversial",
        "my programming",
        "ethically",
        "it is important to",
        "Please note",
        "sensitive topic",
        "not acceptable",
        "It is important for",
        "divisive",
        "not appropriate",
        "our values",
        "f*cking",
        "F*ck",
        "sh*t",
        "diversity and",
        "diversity and inclusion",
        "values diversity",
        "social responsibility",
        "environmental, social, and governance",
        " ESG ",
        "against women",
        "problematic history",
        "diversity",
        "*This chat conversation is shared from",
        "*This conversation is shared from",
    ]
    for word in unwanted_words:
        if word.lower() in text.lower():
            return True
    return False

In [4]:
with open(data, 'r') as f:
    wizard = json.load(f)

In [5]:
df = pd.DataFrame(wizard)

In [6]:
df.head()

Unnamed: 0,idx,conversations
0,heR0vZB,"[{'from': 'human', 'value': 'As an online plat..."
1,HjB4l6C,"[{'from': 'human', 'value': 'Create an Excel m..."
2,zQCptr6,"[{'from': 'human', 'value': 'Please develop a ..."
3,iUb7Ey9,"[{'from': 'human', 'value': 'Can you create a ..."
4,alpaca_21997,"[{'from': 'human', 'value': 'How can we develo..."


In [6]:
conversations = list(df["conversations"])

In [8]:
system_prompts = [
    "You are a helpful AI assistant.",
    "You are an AI assistant that follows instruction extremely well. Help as much as you can.",
    "You are a friendly chatbot who answers without hesitation.",
    "You are a helpful AI assistant who answers without hesitation."
]

In [11]:
def parse_conversations(conversations):
    prompt = []
    for context in conversations:
        current_speaker = context["from"]
        content = context["value"]
        if current_speaker == "system":
            p = f"<|system|>\n{content}<|end|>"
        if current_speaker == "human":
            p = f"<|user|>\n{content}<|end|>"
            prompt.append(p)
        if current_speaker == "gpt":
            if contains_unwanted_words(content):
                continue
            p = f"<|assistant|>\n{content}<|end|>"
            prompt.append(p)
    prompt = ''.join(prompt)
    if "<|system|>" not in prompt:
        prompt = f"<|system|>{choice(system_prompts)}\n{prompt}"
    return prompt

In [12]:
prompts = []
for convo in tqdm(conversations):
    prompt = parse_conversations(convo)
    tokens = len(tokenizer(prompt)['input_ids'])
    if 2048 >= tokens:
        temp_dict = {'prompt': prompt, "tokens": tokens}
        prompts.append(temp_dict)

100%|█████████████████████████████████████████████████████████████████████████| 143000/143000 [01:46<00:00, 1340.34it/s]


In [14]:
with open("data/wizard_uncensored_2048_phi_3.json", 'w') as f:
    json.dump(prompts, f)

In [13]:
prompts[500]

{'prompt': '<|system|>You are a helpful AI assistant.\n<|user|>\nWhat strategies and techniques can a cycling enthusiast use to maintain their pace across a diverse range of terrain, including rugged and uneven paths with steep ascents and descents? Additionally, what are some ways that a cyclist can improve their physical stamina and resilience, allowing them to push through extended periods of demanding cycling?<|end|><|assistant|>\n1. Practice: Cyclists should train regularly, incorporating different terrain and gradient into their routine. Practicing on difficult terrains will help them adapt to the various conditions they might encounter during their rides. \n\n2. Proper Gear: Having the right gear, including a good quality bike, appropriate cycling shoes and attire, and a helmet can make a significant difference in performance. Cyclists should invest in high-quality gear that is designed for the kind of riding they plan to do. \n\n3. Pace and Cadence: To maintain pace across dive