## Synthetic data generation with AI

In [20]:
# Never hard-code the API key in the notebook: read it from the environment and
# only prompt for it (without echoing the input) when it is not set there.
import os
from getpass import getpass

openaikey = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# setting it to the environment variable, for code that reads the key from there
os.environ["OPENAI_API_KEY"] = openaikey

In [21]:
from openai import OpenAI
# Client object used for the chat-completion request further down; it is
# created with the key stored in `openaikey`.
client = OpenAI(api_key=openaikey)

In [22]:
# Prompt that asks the model for token-level NER rows (token, label, sentence_id).
question = """
Create 1000 rows of synthetic tweet data for NER in CSV format.
The file should consist of a row for every token in the tweets.
Each row should include the following fields:
 - token
 - label
 - sentence_id

The token-column contains only one token for each row, the tokens put together should form concise social media posts.
The label represents the entity of every token. Only label companies, products and persons. Every other token gets the value "O".
The sentence_id increments every time a new sentence begins. It starts at 0. 


Make sure that the tweets make sense. Also only respond with the data.
"""

# Conversation sent to the model: a fixed system instruction plus the prompt.
chat_messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant designed to generate synthetic data.",
    },
    {
        "role": "user",
        "content": question,
    },
]

response = client.chat.completions.create(model="gpt-3.5-turbo", messages=chat_messages)

# Only the text of the first returned choice is kept and shown.
res = response.choices[0].message.content
print(res)

In [None]:
import pandas as pd
import openai

def generate_text(prompt, client=None, model="gpt-3.5-turbo", max_tokens=100):
    """Return the model's reply to ``prompt`` as stripped text.

    The legacy ``openai.Completion.create`` endpoint is not available in the
    1.x client that this notebook uses elsewhere (``from openai import OpenAI``),
    so the request goes through the chat-completions API instead.

    Args:
        prompt: User prompt sent to the model.
        client: Object exposing ``chat.completions.create``. When omitted, a new
            ``OpenAI()`` client is created (it reads ``OPENAI_API_KEY`` from the
            environment).
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of the first choice with surrounding whitespace removed, or an
        empty string when the response carries no text.
    """
    if client is None:
        # Imported lazily so the function can be used with an injected client
        # without requiring the openai package.
        from openai import OpenAI
        client = OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    content = response.choices[0].message.content
    # The content field can be None; normalise to an empty string.
    return (content or "").strip()


# Send a single prompt through generate_text and show the reply.
# NOTE(review): the prompt text looks truncated ("... that is suited .") -
# confirm the intended wording.
prompt = "Generate a twitter post that is suited ."
generated_text = generate_text(prompt)
print(generated_text)

## Synthetic data generation with code

In [None]:
# templates for synthetic data generation
import csv
import random

# Entity vocabularies; one value from each list is drawn per generated tweet.
corporations = [
    'Apple', 'Google', 'Microsoft', 'Tesla', 'Amazon', 'Samsung', 'Blackberry',
    'Nokia', 'Sony', 'Nintendo', 'Sega', 'OpenAI', 'Huawei',
]
products = [
    'iPhone', 'iPad', 'Pixel', 'Windows 7', 'Model S', 'Echo', 'Galaxy',
    'AirPods', 'AppleWatch', 'PlayStation', 'Xbox', 'GameCube', 'Fitwatch',
]
persons = [
    'Elon', 'Musk', 'Tim', 'Cook', 'Steve Jobs', 'Jeff', 'Bezos',
    'Sundar', 'Pichai', 'Smith', 'Miller', 'Davids', 'John', 'Richardson',
]

# Tweet skeletons filled in with str.format. The only placeholders used are
# {corporation}, {product} and {person}; a template does not have to use all of
# them. (One template previously used {company}, which no other template uses
# and which made formatting with `corporation=` fail with a KeyError.)
tweet_templates = [
    "I love my new {product}, thanks {corporation}!",
    "{person} just announced the new {product} from {corporation}.",
    "The {product} from {corporation} is amazing!",
    "{corporation}'s latest update to {product} is revolutionary.",
    "{person} said that {corporation} is working on something big.",
    "Have you seen the latest {corporation} event? {product} is the star!",
    "{person} just bought a new {product}. Can't wait to try it!",
    "Rumors say {corporation} is releasing {product} soon.",
    "I hate the new {product}, the older ones are much better.",
    "I just upgraded to the new {product}, {corporation} never disappoint!",
    "{person} just hinted at new features in {corporation}'s upcoming {product}. I am hyped!",
    "{person} is revolutionizing the industry with {corporation} and their {product}.",
    "Just watched the {corporation} keynote. {product} looks impressive.",
    "I'm curious, what do you think about {person}'s role at {corporation}?",
    "Can not wait for {product} also. They should sell them down at SXSW.",
    "The {corporation} store still has {product} and short lines.",
    "more than 150 million mobile users for {product} for mobile #SXSW",
    "Less than 2 hours until we announce the details on the {product} giveaway!",
    "Is {corporation} going to drop another {product} soon? I need to know!",
    "Looks like {person} is shaking things up at {corporation} with {product}.",
    "{corporation} has a temporary Retail Store in Austin for the {product} release today. Opens at 5pm.",
    "Reminder: {person} will be talking about {corporation} and {product} access today.",
    "{corporation} is giving free {product} to open source coders who r attending this meet-up.",
    "Is {person} still leading the innovation at {corporation} with {product}?",
    "{corporation} just keeps raising the bar with every {product} they launch. Crazy!",
    "{person} hyped up {corporation}'s {product}, but it's not that great in reality.",
    "I am having so many issues with the {product}. {corporation} needs to fix this!",
    "The innovation in {corporation}'s {product} is something only {person} could pull off.",
    "Is it just me, or does {corporation}'s {product} feel rushed and unfinished?",
    "Is it just me or does the new {product} feel rushed and unfinished? Disappointing",
    "{corporation}'s {product} is overrated. I can't believe I fell for the marketing.",
    "All eyes are on {corporation} after the announcement of their new {product}.",
    "When {person} talks about {corporation}'s new {product}, you know it's going to be good.",
    "The latest from {corporation}? Their {product} just dropped, and it's all over the internet.",
    "I'm amazed at how {person} transformed {corporation} with innovations like {product}.",
    "The {product} is making me rethink my loyalty to {corporation}. Its not good.",
    "Honestly, {person} needs to focus on fixing {corporation}'s {product} before releasing new ones.",
    "{person} teased some big changes for {corporation}'s {product}. I wonder what's next.",
    "I trusted {person} to deliver a great {product} at {corporation}, but this is a flop.",
    "{corporation} just came out with a new model, the {product}, which is electric. I love that!",
    "{corporation}'s industry party tonight was great for the launch of {product}.",
    "{product} is the must-have gadget of the year.",
    "What is happening at {corporation}? {person} really needs to step up.",
    "Who else is gonna get the new {product} next month?",
    "It's time for {person} to leave {corporation}. What is he even doing.",
    "{corporation} has been selected as the top AI startup in Austria, wow!",
    "{person} was right! The {product} from {corporation} is revolutionary!"
]

In [None]:
# creating the data with a row for each token and its label
import re

def tokenize(sentence):
    """Split a sentence into word tokens and single punctuation-mark tokens.

    Runs of word characters become one token each; every other character that
    is not whitespace becomes a token of its own. Whitespace is dropped.
    """
    return [match.group(0) for match in re.finditer(r"\w+|[^\w\s]", sentence)]

# Build the token-level dataset: one row per token holding the token, its BIO
# label and the id of the tweet it belongs to.
data = []
sentence_id = 0

while len(data) < 500000:
    template = random.choice(tweet_templates)
    # The entity list is called `corporations`; there is no `companies` list.
    company = random.choice(corporations)
    product = random.choice(products)
    person = random.choice(persons)
    # The templates use the {corporation} placeholder. The value is also passed
    # as `company` so a template spelled with {company} still formats.
    tweet = template.format(corporation=company, company=company, product=product, person=person)

    tokens = tokenize(tweet)

    # Entities are tokenized with the same tokenizer as the tweet so that
    # multi-token names line up with the tweet tokens. Order = match priority.
    entities = [
        (tokenize(company), "company"),
        (tokenize(product), "product"),
        (tokenize(person), "person"),
    ]

    i = 0
    while i < len(tokens):
        for parts, kind in entities:
            # `parts` must be non-empty, otherwise `i` would never advance.
            if parts and tokens[i:i + len(parts)] == parts:
                for j, part in enumerate(parts):
                    data.append([part, f"B-{kind}" if j == 0 else f"I-{kind}", sentence_id])
                i += len(parts)
                break
        else:
            # No entity starts at this position.
            data.append([tokens[i], "O", sentence_id])
            i += 1

    sentence_id += 1

csv_file_path = "./synthetic_tweet_data3.csv"
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["token", "label", "sentence_id"])
    # Exactly 500000 rows are written, so the last tweet may be cut off.
    writer.writerows(data[:500000])

csv_file_path

In [None]:
# creating the data grouped, with one row per generated tweet
import re
import random
import csv

def tokenize(sentence):
    """Return the tokens of ``sentence``: words and individual punctuation marks.

    A token is either a run of word characters or one single character that is
    neither a word character nor whitespace.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]")
    return token_pattern.findall(sentence)

# Build the grouped dataset: one row per tweet holding the list of tokens and
# the list of BIO labels (same length, aligned by position).
data = []

while len(data) < 100000:
    template = random.choice(tweet_templates)
    # The entity list is called `corporations`; there is no `companies` list.
    company = random.choice(corporations)
    product = random.choice(products)
    person = random.choice(persons)
    # The templates use the {corporation} placeholder. The value is also passed
    # as `company` so a template spelled with {company} still formats.
    tweet = template.format(corporation=company, company=company, product=product, person=person)

    tokens = tokenize(tweet)
    labels = []

    # Entities are tokenized with the same tokenizer as the tweet so that
    # multi-token names line up with the tweet tokens. Order = match priority.
    entities = [
        (tokenize(company), "company"),
        (tokenize(product), "product"),
        (tokenize(person), "person"),
    ]

    i = 0
    while i < len(tokens):
        for parts, kind in entities:
            # `parts` must be non-empty, otherwise `i` would never advance.
            if parts and tokens[i:i + len(parts)] == parts:
                labels.extend(f"B-{kind}" if j == 0 else f"I-{kind}" for j in range(len(parts)))
                i += len(parts)
                break
        else:
            # No entity starts at this position.
            labels.append("O")
            i += 1

    data.append([tokens, labels])

csv_file_path = "./synthetic_tweet_data_grouped.csv"
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["tokens", "labels"])
    writer.writerows(data)

csv_file_path

In [None]:
# not necessary for final function to work
def ensure_alignment(df, tags_col="tags"):
    """Make every row's tag list exactly as long as its token list.

    Rows whose tag list is too short are padded with "O"; rows whose tag list
    is too long are truncated. A message is printed for each repaired row.

    Args:
        df: DataFrame with a 'tokens' column and a tag column, both holding
            one list per row. The frame is modified in place.
        tags_col: Name of the tag column. Defaults to "tags"; pass "labels"
            for frames that store the tags under that name.

    Returns:
        The same DataFrame object that was passed in.
    """
    for i, row in df.iterrows():
        tokens = row['tokens']
        tags = row[tags_col]
        n_tokens, n_tags = len(tokens), len(tags)

        if n_tokens == n_tags:
            continue

        print(f"Fixing misalignment in row {i}: tokens={n_tokens}, tags={n_tags}")

        if n_tags < n_tokens:
            # Pad the missing positions with the "outside" tag.
            fixed = list(tags) + ["O"] * (n_tokens - n_tags)
        else:
            fixed = list(tags[:n_tokens])

        df.at[i, tags_col] = fixed

    return df

# Generate 500 examples and repair any token/tag length mismatch.
# NOTE: generate_ner_dataset is only defined in a later cell of this notebook,
# so that cell has to be executed before this one. ensure_alignment reads the
# 'tags' column, so it needs a generator that emits 'tags' (not 'labels').
df = generate_ner_dataset(500)
df = ensure_alignment(df)

In [None]:
import csv
import random
import pandas as pd
import numpy as np

# working function but only in same notebook as df, used in ner deep learning
def _split_ner_words(text):
    """Split text into words and single punctuation tokens.

    Letters, digits, hyphens and apostrophes are accumulated into one word;
    any other non-whitespace character becomes a token of its own.
    """
    words = []
    current = ""
    for char in text:
        if char.isalnum() or char in "-'":
            current += char
            continue
        if current:
            words.append(current)
            current = ""
        if not char.isspace():
            words.append(char)
    if current:
        words.append(current)
    return words


def generate_ner_dataset(num_examples, output_file="new_synthetic_ner_dataset.csv"):
    """Generate synthetic sentences with token-level BIO tags.

    Each sentence is built from a random template whose placeholders are
    replaced by random entities. Tags are assigned by construction: the tokens
    that come from a placeholder get B-/I- tags of that placeholder's type and
    all template text gets "O". This labels every inserted entity correctly,
    whatever its number of tokens, including names that contain punctuation
    ("Google I/O"), multi-word corporations ("JP Morgan") and entities followed
    by a possessive, where "'s" becomes a separate "O" token.

    Args:
        num_examples: Number of sentences to generate.
        output_file: Kept for backward compatibility; nothing is written by
            this function. Save the returned frame with ``df.to_csv`` instead.

    Returns:
        A pandas DataFrame with the columns 'tokens' (list of str), 'tags'
        (list of str, same length as 'tokens') and 'sentence' (str).
    """
    import re  # local import: the cell defining this function does not import re

    persons = [
        "John Smith", "Emily Chen", "Michael Johnson", "Sarah Williams",
        "David Lee", "Maria Rodriguez", "James Brown", "Emma Davis",
        "Robert Kim", "Jennifer Lopez", "Thomas Wilson", "Jessica Taylor",
        "Carlos Vega", "Aisha Patel", "Daniel Park", "Olivia Nguyen"
    ]

    corporations = [
        "Google", "Microsoft", "Apple", "Amazon", "Meta",
        "IBM", "Tesla", "Netflix", "Walmart", "JP Morgan",
        "Acme Corp", "TechSolutions", "Global Systems", "DataWorks",
        "Quantum Industries", "NexGen", "FutureSpace", "EcoSystems"
    ]

    products = [
        "iPhone 13", "Galaxy S22", "Surface Pro", "PlayStation 5", "Xbox Series X",
        "MacBook Air", "Echo Dot", "AirPods Pro", "Tesla Model 3", "iPad Mini",
        "Dyson V11", "Fitbit Charge", "Nintendo Switch", "Kindle Paperwhite",
        "Roomba i7", "GoPro Hero", "Bose QuietComfort", "Instant Pot"
    ]

    events = [
        "CES 2023", "Web Summit", "SXSW", "TechCrunch Disrupt", "E3 Expo",
        "Google I/O", "WWDC", "Consumer Electronics Show", "Mobile World Congress",
        "Black Hat Conference", "DEF CON", "AWS re:Invent", "Game Developers Conference",
        "Dreamforce", "Comic-Con", "Coachella", "New York Fashion Week"
    ]

    locations = [
        "New York", "San Francisco", "London", "Tokyo", "Berlin",
        "Paris", "Sydney", "Toronto", "Chicago", "Seattle",
        "Los Angeles", "Miami", "Singapore", "Hong Kong",
        "Dubai", "Barcelona", "Austin", "Stockholm", "Seoul"
    ]

    templates = [
        "[PERSON] from [CORPORATION] announced that [PRODUCT] will be showcased at [EVENT] in [LOCATION].",
        "At [EVENT], [PERSON] demonstrated how [PRODUCT] is revolutionizing [CORPORATION]'s approach in [LOCATION].",
        "[CORPORATION] has selected [LOCATION] as the venue for [EVENT], where [PERSON] will launch [PRODUCT].",
        "The new [PRODUCT] developed by [CORPORATION] will be presented by [PERSON] during [EVENT] in [LOCATION].",
        "[PERSON] confirmed that [CORPORATION] will be expanding its [PRODUCT] line at this year's [EVENT] in [LOCATION].",
        "According to [PERSON], [CORPORATION]'s latest [PRODUCT] has been well-received at [EVENT] in [LOCATION].",
        "Reviews from [EVENT] suggest that [PERSON] made a strong case for [CORPORATION]'s new [PRODUCT] in the [LOCATION] market.",
        "[CORPORATION] is planning to open a [PRODUCT] store in [LOCATION], announced [PERSON] at [EVENT].",
        "The collaboration between [CORPORATION] and [PERSON] resulted in [PRODUCT], which will debut at [EVENT] in [LOCATION].",
        "Attendees at [EVENT] in [LOCATION] were impressed when [PERSON] revealed [CORPORATION]'s innovative [PRODUCT].",
        "[PERSON] traveled to [LOCATION] to promote [PRODUCT] at [EVENT] on behalf of [CORPORATION].",
        "The [PRODUCT] team from [CORPORATION], led by [PERSON], won first prize at [EVENT] in [LOCATION].",
        "Consumers in [LOCATION] can now purchase [PRODUCT] after [CORPORATION]'s expansion announcement by [PERSON] at [EVENT]."
    ]

    placeholder_pattern = re.compile(r"\[(PERSON|CORPORATION|PRODUCT|EVENT|LOCATION)\]")

    data = []

    for _ in range(num_examples):
        template = random.choice(templates)

        # Draw order: person, corporation, product, event, location.
        chosen = {
            "PERSON": random.choice(persons),
            "CORPORATION": random.choice(corporations),
            "PRODUCT": random.choice(products),
            "EVENT": random.choice(events),
            "LOCATION": random.choice(locations),
        }

        tokens = []
        tags = []
        pieces = []

        # Splitting on a pattern with one capture group yields literal text at
        # even positions and the captured placeholder name at odd positions.
        for position, part in enumerate(placeholder_pattern.split(template)):
            if position % 2 == 0:
                words = _split_ner_words(part)
                tokens.extend(words)
                tags.extend(["O"] * len(words))
                pieces.append(part)
            else:
                entity = chosen[part]
                kind = part.lower()
                words = _split_ner_words(entity)
                tokens.extend(words)
                tags.extend(("B-" if k == 0 else "I-") + kind for k in range(len(words)))
                pieces.append(entity)

        data.append({"tokens": tokens, "tags": tags, "sentence": "".join(pieces)})

    # Explicit columns keep the schema stable even for num_examples == 0.
    return pd.DataFrame(data, columns=["tokens", "tags", "sentence"])

In [None]:
import random
import pandas as pd

# reworked new function
# Built-in vocabularies and templates used when the caller passes none.
_DEFAULT_NER_PERSONS = [
    "John Smith", "Emily Chen", "Michael Johnson", "Sarah Williams",
    "David Lee", "Maria Rodriguez", "James Brown", "Emma Davis",
    "Robert Kim", "Jennifer Lopez", "Thomas Wilson", "Jessica Taylor",
    "Carlos Vega", "Aisha Patel", "Daniel Park", "Olivia Nguyen"
]

_DEFAULT_NER_CORPORATIONS = [
    "Google", "Microsoft", "Apple", "Amazon", "Meta",
    "IBM", "Tesla", "Netflix", "Walmart", "JP Morgan",
    "Acme Corp", "TechSolutions", "Global Systems", "DataWorks",
    "Quantum Industries", "NexGen", "FutureSpace", "EcoSystems"
]

_DEFAULT_NER_PRODUCTS = [
    "iPhone 13", "Galaxy S22", "Surface Pro", "PlayStation 5", "Xbox Series X",
    "MacBook Air", "Echo Dot", "AirPods Pro", "Tesla Model 3", "iPad Mini",
    "Dyson V11", "Fitbit Charge", "Nintendo Switch", "Kindle Paperwhite",
    "Roomba i7", "GoPro Hero", "Bose QuietComfort", "Instant Pot"
]

_DEFAULT_NER_EVENTS = [
    "CES 2023", "Web Summit", "SXSW", "TechCrunch Disrupt", "E3 Expo",
    "Google I/O", "WWDC", "Consumer Electronics Show", "Mobile World Congress",
    "Black Hat Conference", "DEF CON", "AWS re:Invent", "Game Developers Conference",
    "Dreamforce", "Comic-Con", "Coachella", "New York Fashion Week"
]

_DEFAULT_NER_LOCATIONS = [
    "New York", "San Francisco", "London", "Tokyo", "Berlin",
    "Paris", "Sydney", "Toronto", "Chicago", "Seattle",
    "Los Angeles", "Miami", "Singapore", "Hong Kong",
    "Dubai", "Barcelona", "Austin", "Stockholm", "Seoul"
]

_DEFAULT_NER_TEMPLATES = [
    "{person} from {corporation} announced that {product} will be showcased at {event} in {location}.",
    "At {event}, {person} demonstrated how {product} is revolutionizing {corporation}'s approach in {location}.",
    "{corporation} has selected {location} as the venue for {event}, where {person} will launch {product}.",
    "The new {product} developed by {corporation} will be presented by {person} during {event} in {location}.",
    "{person} confirmed that {corporation} will be expanding its {product} line at this year's {event} in {location}.",
    "According to {person}, {corporation}'s latest {product} has been well-received at {event} in {location}.",
    "Reviews from {event} suggest that {person} made a strong case for {corporation}'s new {product} in the {location} market.",
    "{corporation} is planning to open a {product} store in {location}, announced {person} at {event}.",
    "The collaboration between {corporation} and {person} resulted in {product}, which will debut at {event} in {location}.",
    "Attendees at {event} in {location} were impressed when {person} revealed {corporation}'s innovative {product}.",
    "{person} traveled to {location} to promote {product} at {event} on behalf of {corporation}.",
    "The {product} team from {corporation}, led by {person}, won first prize at {event} in {location}.",
    "Consumers in {location} can now purchase {product} after {corporation}'s expansion announcement by {person} at {event}."
]


def _split_ner_words(text):
    """Split text into words and single punctuation tokens.

    Letters, digits, hyphens and apostrophes are accumulated into one word;
    any other non-whitespace character becomes a token of its own.
    """
    words = []
    current = ""
    for char in text:
        if char.isalnum() or char in "-'":
            current += char
            continue
        if current:
            words.append(current)
            current = ""
        if not char.isspace():
            words.append(char)
    if current:
        words.append(current)
    return words


# reworked new function
def generate_ner_dataset(num_examples, output_file="new_synthetic_ner_dataset.csv", templates=None,
                         persons=None, corporations=None, products=None, events=None,
                         locations=None):
    """Generate synthetic sentences with token-level BIO labels.

    The list arguments default to ``None`` and are resolved when the function
    is called, so defining the function does not depend on notebook-level
    variables. Any list that is not supplied falls back to a built-in default.

    Labels are assigned by construction: tokens that come from a ``{name}``
    placeholder get ``B-name`` / ``I-name`` labels, all template text gets "O".
    An entity followed by a possessive yields a separate "'s" token labelled "O".

    Args:
        num_examples: Number of sentences to generate.
        output_file: Kept for backward compatibility; nothing is written by
            this function. Save the returned frame with ``df.to_csv`` instead.
        templates: ``str.format``-style templates using the plain placeholders
            {person}, {corporation}, {product}, {event} and {location}
            (format specs and conversions are ignored).
        persons, corporations, products, events, locations: Candidate entity
            strings for the respective placeholder.

    Returns:
        A pandas DataFrame with the columns 'tokens' (list of str), 'labels'
        (list of str, same length as 'tokens') and 'sentence' (str).

    Raises:
        KeyError: If a template uses a placeholder other than the five above.
    """
    from string import Formatter  # local import: not imported by this cell

    templates = _DEFAULT_NER_TEMPLATES if templates is None else templates
    persons = _DEFAULT_NER_PERSONS if persons is None else persons
    corporations = _DEFAULT_NER_CORPORATIONS if corporations is None else corporations
    products = _DEFAULT_NER_PRODUCTS if products is None else products
    events = _DEFAULT_NER_EVENTS if events is None else events
    locations = _DEFAULT_NER_LOCATIONS if locations is None else locations

    formatter = Formatter()
    data = []

    for _ in range(num_examples):
        template = random.choice(templates)

        # Draw order: person, corporation, product, event, location.
        chosen = {
            "person": random.choice(persons),
            "corporation": random.choice(corporations),
            "product": random.choice(products),
            "event": random.choice(events),
            "location": random.choice(locations),
        }

        tokens = []
        labels = []
        pieces = []

        # Formatter.parse yields (literal_text, field_name, spec, conversion);
        # field_name is None for trailing literal text.
        for literal, field, _spec, _conversion in formatter.parse(template):
            if literal:
                words = _split_ner_words(literal)
                tokens.extend(words)
                labels.extend(["O"] * len(words))
                pieces.append(literal)
            if field is None:
                continue
            if field not in chosen:
                raise KeyError(field)
            entity = str(chosen[field])
            words = _split_ner_words(entity)
            tokens.extend(words)
            labels.extend(("B-" if k == 0 else "I-") + field for k in range(len(words)))
            pieces.append(entity)

        data.append({"tokens": tokens, "labels": labels, "sentence": "".join(pieces)})

    # Explicit columns keep the schema stable even for num_examples == 0.
    return pd.DataFrame(data, columns=["tokens", "labels", "sentence"])

In [None]:
# Generate 1000 synthetic examples and save them as CSV.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first column of the file.
df = generate_ner_dataset(1000)
df.to_csv("synthetic_ner_data_3.csv")