In [1]:
import random
import ollama
import pandas as pd
from tqdm import tqdm

In [2]:
# Label space: every generated row gets one sentiment column per aspect.
ASPECTS = [
    "Cost of Living",
    "Healthcare",
    "Education",
    "Personal Security",
    "Employment",
    "Transportation",
    "Government",
    "Environment",
    "Social Equality",
    "Taxation",
]

# Who the model is asked to role-play when writing a tweet.
PERSONAS = [
    "struggling student",
    "wealthy business owner",
    "tired parent",
    "political activist",
    "retired teacher",
    "tech worker",
    "nurse",
]

# Tone/register the tweet should be written in.
STYLES = [
    "sarcastic",
    "hopeful",
    "angry and ranting",
    "neutral/news-like",
    "using slang and emojis",
    "formal complaint",
]

In [3]:
def generate_target_vector(aspects=None, rng=None):
    """
    Build a sparse sentiment target: 1-2 randomly chosen aspects get -1 or 1, the rest 0.

    Args:
        aspects: Optional sequence of aspect names used as the dictionary keys.
            Defaults to the module-level ASPECTS list.
        rng: Optional random.Random-like object providing `choice` and `sample`.
            Pass a seeded `random.Random(seed)` for reproducible output.
            Defaults to the global `random` module (original behaviour).

    Returns:
        dict mapping every aspect name (in input order) to -1, 0, or 1.

    Raises:
        ValueError: if `aspects` is empty.
    """
    aspects = list(ASPECTS if aspects is None else aspects)
    if not aspects:
        raise ValueError("aspects must contain at least one entry")
    rng = random if rng is None else rng

    vector = {aspect: 0 for aspect in aspects}

    # Cap at the number of available aspects so a single-aspect list cannot
    # make sample() raise when 2 is drawn.
    num_active = min(rng.choice([1, 2]), len(aspects))
    active_aspects = rng.sample(aspects, num_active)

    for aspect in active_aspects:
        # We avoid 0 here because we want the tweet to be opinionated on this topic
        vector[aspect] = rng.choice([-1, 1])

    return vector

In [5]:
# Smoke test: draw one random target vector and show it as the cell output.
# The result changes on every run because no random seed is set.
vector = generate_target_vector()
vector

{'Cost of Living': 0,
 'Healthcare': 0,
 'Education': -1,
 'Personal Security': 0,
 'Employment': 0,
 'Transportation': -1,
 'Government': 0,
 'Environment': 0,
 'Social Equality': 0,
 'Taxation': 0}

In [6]:
def create_prompt(persona, style, vector):
    """
    Build the user prompt sent to Ollama for one synthetic tweet.

    Args:
        persona: Description of the role the model should write as.
        style: Tone the tweet should be written in.
        vector: Mapping of aspect name -> sentiment score. Entries equal to 0
            are left out; a score of 1 is labelled "Positive" and any other
            non-zero score is labelled "Negative".

    Returns:
        The prompt text, including the leading newline and the four-space
        indentation carried on every line.
    """
    # Only aspects with a sentiment are mentioned to the model.
    labelled_topics = []
    for aspect, score in vector.items():
        if score == 0:
            continue
        if score == 1:
            sentiment = 'Positive'
        else:
            sentiment = 'Negative'
        labelled_topics.append(f"{aspect} ({sentiment})")
    topics_str = ", ".join(labelled_topics)

    # Spelled out line by line so the significant whitespace (blank-looking
    # lines and the trailing space after "unnatural.") is visible.
    return (
        "\n"
        f"    You are a realistic Twitter user acting as a {persona}.\n"
        "    Write a single tweet (max 280 characters).\n"
        "    \n"
        f"    Style: {style}\n"
        f"    Topic: {topics_str}\n"
        "    \n"
        "    Constraint: Do NOT write hashtags like #Healthcare or #CostOfLiving directly if it feels unnatural. \n"
        "    Make it sound authentic to the persona. Do not include quotes or explanations, just the tweet text.\n"
        "    "
    )

In [7]:
def generate_synthetic_data(n_samples=10, model_name="llama3.2"):
    """
    Generate labelled synthetic tweets by prompting an Ollama chat model.

    For each sample a persona, a style and a target sentiment vector are drawn
    at random, a prompt is built from them, and the model's reply is stored
    together with the attributes that produced it.

    Args:
        n_samples: Number of generation attempts. Failed attempts are skipped,
            so the result may contain fewer rows.
        model_name: Name of the Ollama model passed to `ollama.chat`.

    Returns:
        pandas.DataFrame with one column per entry in ASPECTS followed by
        'tweet_text', 'persona' and 'style'. The columns are present even when
        no sample succeeded, so downstream column selection keeps working.
    """
    # Fixed schema: same order the row dicts are built in, and it keeps the
    # columns when `data` ends up empty (e.g. the Ollama server is unreachable).
    columns = [*ASPECTS, 'tweet_text', 'persona', 'style']
    data = []
    n_failed = 0

    print(f"Generating {n_samples} samples using {model_name} ...")

    for _ in tqdm(range(n_samples)):
        # Randomize Attributes
        persona = random.choice(PERSONAS)
        style = random.choice(STYLES)
        vector = generate_target_vector()

        # Generate Text
        prompt = create_prompt(persona, style, vector)

        try:
            response = ollama.chat(model=model_name, messages=[
                {'role': 'user', 'content': prompt}
            ])

            # Models often wrap the tweet in quotes despite the instruction;
            # note this removes every double quote, including inner ones.
            tweet_text = response['message']['content'].strip().replace('"', '')

            # Store Data
            row = vector.copy()
            row['tweet_text'] = tweet_text
            row['persona'] = persona
            row['style'] = style
            data.append(row)

        except Exception as e:
            # Best effort: one failed generation should not abort the batch.
            n_failed += 1
            print(f"Error: {e}")

    if n_failed:
        print(f"Warning: {n_failed} of {n_samples} samples failed and were skipped.")

    return pd.DataFrame(data, columns=columns)

In [8]:
# Generate a small trial batch; this calls ollama.chat once per sample.
df = generate_synthetic_data(n_samples=5)
# Show full tweet text instead of truncating long strings in DataFrame output.
pd.set_option('display.max_colwidth', None)

Generating 5 samples using llama3.2 ...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.08s/it]


In [9]:
# Inspect the generated batch: one sentiment column per aspect plus text/persona/style.
# NOTE(review): the generated text is not validated against its target vector, and
# some rows in the saved output read contrary to their labels (e.g. a row labelled
# +1 for Transportation and Social Equality reads negative) - check label quality
# before training on this data.
df

Unnamed: 0,Cost of Living,Healthcare,Education,Personal Security,Employment,Transportation,Government,Environment,Social Equality,Taxation,tweet_text,persona,style
0,0,0,0,0,0,0,0,-1,0,0,Still optimistic about our planet's future despite all the bleak news. Every tiny innovation counts - from sustainable software to eco-friendly coding practices. We can make a difference one line of code at a time.,tech worker,hopeful
1,-1,0,0,0,0,0,0,0,0,0,Disappointed in the rising cost of living in our community. Just had to increase my grocery budget by 30% and I'm struggling to make ends meet. When did 'affordable' become a myth?,retired teacher,formal complaint
2,0,0,0,0,0,1,0,0,1,0,Disappointing to see the city's new transportation initiative prioritizing gentrification-friendly bike lanes over desperately needed accessible routes for low-income communities. A step backwards on both progress and social justice.,political activist,formal complaint
3,0,0,0,1,0,0,0,0,-1,0,Can't believe our school can't provide basic security measures for students with disabilities while simultaneously preaching about social equality - when will we prioritize fairness in practice over feel-good rhetoric?,tired parent,formal complaint
4,0,0,0,0,0,0,0,0,1,0,How I wish some folks would put their money where their mouth is when it comes to equality! Progress may be slow but those pushing for social change deserve our support and not just empty platitudes about 'we're working towards a better future' - let's see action!,retired teacher,angry and ranting


In [10]:
# Narrow view: tweet text next to two of the aspect labels for a quick eyeball check.
df[['tweet_text', 'Cost of Living', 'Healthcare']].head()

Unnamed: 0,tweet_text,Cost of Living,Healthcare
0,Still optimistic about our planet's future despite all the bleak news. Every tiny innovation counts - from sustainable software to eco-friendly coding practices. We can make a difference one line of code at a time.,0,0
1,Disappointed in the rising cost of living in our community. Just had to increase my grocery budget by 30% and I'm struggling to make ends meet. When did 'affordable' become a myth?,-1,0
2,Disappointing to see the city's new transportation initiative prioritizing gentrification-friendly bike lanes over desperately needed accessible routes for low-income communities. A step backwards on both progress and social justice.,0,0
3,Can't believe our school can't provide basic security measures for students with disabilities while simultaneously preaching about social equality - when will we prioritize fairness in practice over feel-good rhetoric?,0,0
4,How I wish some folks would put their money where their mouth is when it comes to equality! Progress may be slow but those pushing for social change deserve our support and not just empty platitudes about 'we're working towards a better future' - let's see action!,0,0
