In [None]:
# Interactive intake — robust inputs, clear urban/rural, consistent encodings
# gender: 1=male, 0=female
# is_urban / existing_modern_oral / existing_velo_user: 1=yes, 0=no
# strength_preference: 1=low, 2=medium, 3=high
# choice1..3 flavor ids: spice 1-10, mint&menthol 11-20, fruit 21-30

# In-memory response store: one list per survey field, one appended value per respondent.
# A comprehension is used so that every field gets its own independent list.
User_data = {
    column: []
    for column in (
        "ID",
        "age",
        "gender",
        "region",
        "is_urban",
        "occupation",
        "user_category",
        "existing_modern_oral",
        "existing_velo_user",
        "strength_preference",
        "choice1",
        "choice2",
        "choice3",
    )
}


def _norm(s: str) -> str:
    return s.strip().lower()

def ask_int(prompt, min_val=None, max_val=None):
    """Prompt repeatedly until the user types a whole number within the bounds.

    Args:
        prompt: Text shown by ``input``.
        min_val: Inclusive lower bound, or ``None`` for no lower bound.
        max_val: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        The accepted integer.
    """
    while True:
        raw = input(prompt).strip()
        try:
            number = int(raw)
        except ValueError:
            print("Invalid input. Please enter a whole number.")
        else:
            if min_val is not None and number < min_val:
                print(f"Please enter a number ≥ {min_val}.")
            elif max_val is not None and number > max_val:
                print(f"Please enter a number ≤ {max_val}.")
            else:
                return number

def ask_binary_yes_no(prompt):
    """Ask a yes/no question until a recognised answer is given.

    Recognised answers (case-insensitive, surrounding whitespace ignored):
    yes -> 1/y/yes/t/true/yeah/yep, no -> 0/n/no/f/false/nope.

    Returns:
        1 for a "yes" answer, 0 for a "no" answer.
    """
    # One lookup table mapping every accepted spelling to its encoded value.
    encoded = dict.fromkeys(("1", "y", "yes", "t", "true", "yeah", "yep"), 1)
    encoded.update(dict.fromkeys(("0", "n", "no", "f", "false", "nope"), 0))
    question = f"{prompt} (1=Yes, 0=No): "
    while True:
        reply = _norm(input(question))
        if reply in encoded:
            return encoded[reply]
        print("Invalid input. Please respond with 1/0 or yes/no.")

def ask_choice(prompt, options, allow_text=True):
    """
    Show a numbered menu and return the 1-based index of the chosen option.

    Args:
        prompt: Heading printed above the menu.
        options: Sequence of option labels; must not be empty.
        allow_text: When true, the option text (full, or a prefix matching
            exactly one option) is accepted as well as the number, compared
            case-insensitively.

    Returns:
        The 1-based position of the selected option in ``options``.

    Raises:
        ValueError: If ``options`` is empty (no answer could ever be valid).
    """
    if len(options) == 0:
        raise ValueError("options must contain at least one entry.")

    print(prompt)
    for idx, opt in enumerate(options, start=1):
        print(f"{idx}. {opt}")

    numbers = ",".join(str(i) for i in range(1, len(options) + 1))
    # Only advertise typing the option name when text answers are accepted.
    if allow_text:
        valid_str = f"({numbers} or type the option)"
        entry_prompt = "Enter the number or name of your choice: "
    else:
        valid_str = f"({numbers})"
        entry_prompt = "Enter the number of your choice: "
    norm_opts = [_norm(o) for o in options]

    while True:
        s = input(entry_prompt).strip()

        # An empty string is a prefix of every option, so it must be rejected
        # before prefix matching; otherwise pressing Enter on a one-option
        # menu would silently select that option.
        if not s:
            print(f"No input received. Choose a number {valid_str}.")
            continue

        # numeric?
        try:
            val = int(s)
        except ValueError:
            val = None
        if val is not None:
            if 1 <= val <= len(options):
                return val
            print(f"Please enter a valid number {valid_str}.")
            continue

        if not allow_text:
            print(f"Please enter a valid number {valid_str}.")
            continue

        ns = _norm(s)
        # exact text match
        if ns in norm_opts:
            return norm_opts.index(ns) + 1
        # unique prefix match
        prefix_hits = [i for i, o in enumerate(norm_opts) if o.startswith(ns)]
        if len(prefix_hits) == 1:
            return prefix_hits[0] + 1
        if prefix_hits:
            # ambiguous prefix — show the narrowed list
            amb = ", ".join(options[i] for i in prefix_hits)
            print(f"Did you mean: {amb}? Please be more specific or use the number.")
        else:
            print(f"Not recognized. Choose a number {valid_str}.")

def parse_three_choices(raw, max_choice):
    """Parse exactly three distinct ints from 'a,b,c' within 1..max_choice.

    Empty segments (for example a trailing comma) are ignored before counting.

    Args:
        raw: Comma-separated user input.
        max_choice: Largest allowed value (inclusive); the smallest is 1.

    Returns:
        A list of the three integers in the order they were typed.

    Raises:
        ValueError: With a user-facing message when the input does not hold
            exactly three entries, an entry is not an integer, a value is out
            of range, or values repeat.
    """
    parts = [p.strip() for p in raw.split(",") if p.strip()]
    if len(parts) != 3:
        raise ValueError("Please enter exactly three numbers separated by commas (e.g., 1, 4, 7).")
    try:
        nums = [int(p) for p in parts]
    except ValueError:
        # The int() failure adds nothing to the user-facing message, so the
        # implicit exception context is suppressed.
        raise ValueError("All entries must be integers.") from None
    if any(n < 1 or n > max_choice for n in nums):
        raise ValueError(f"All numbers must be between 1 and {max_choice}.")
    if len(set(nums)) != 3:
        raise ValueError("Choices must be three distinct numbers (no duplicates).")
    return nums

def ask_three_from_ten(prompt):
    """Keep asking until the user enters three distinct numbers in 1..10.

    Validation is delegated to ``parse_three_choices``; its error message is
    printed and the question is repeated.

    Returns:
        A tuple of the three chosen integers in the order entered.
    """
    question = prompt + "\nEnter three numbers separated by commas (e.g., 2, 5, 9): "
    while True:
        answer = input(question).strip()
        try:
            first, second, third = parse_three_choices(answer, 10)
        except ValueError as problem:
            print(problem)
        else:
            return first, second, third

# Flavours and categories
# Each entry is a menu line of the form "<name> - <description>".
# main() prints each list numbered 1-10 and stores the picks as flavor ids:
# SPICE -> 1-10, MINT_MENTHOL -> 11-20 (+10), FRUIT -> 21-30 (+20),
# so the order of the entries defines the ids.
# NOTE(review): "cherry cherry" in the Black Cherry description looks like a typo — confirm intended wording.
SPICE = [
    "Cinnamon Flame - Warm cinnamon with a fiery edge",
    "Zesty Lime - Zesty lime with a gentle jalapeño-style heat",
    "Peppermint Storm - Strong peppermint with an electric cooling",
    "Mango Flame - Sweet mango with a spicy kick",
    "Creamy Latte - Smooth coffee with vanilla and hazelnut notes",
    "Coffee - Rich roasted coffee",
    "Cinnamon - Classic cinnamon spice, clean and comforting",
    "Tropical Punch - Tropical, sweet, and subtly floral",
    "Black Cherry - Deep, cherry cherry with a rich undertone",
    "Citrus Burst - Bright orange, lemon, and lime blend",
]
# Mint & Menthol menu (stored ids 11-20).
MINT_MENTHOL = [
    "Crispy Peppermint - Sharp peppermint with an icy finish",
    "Bright Spearmint - Smooth, sweet spearmint for a softer menthol experience",
    "Arctic Peppermint - Intense peppermint with a great cold snap",
    "Smart Mint - Balanced and mellow mint taste",
    "Fresh Classic - Fresh mint, straightforward and refreshing",
    "Peppermint - Pure peppermint flavor, clean and crisp",
    "Wintergreen - Sweet, herbal mint with a creamy edge",
    "Minty Lemon - Zesty lemon - crisp mint finish",
    "Minty Watermelon - Sweet watermelon paired with cooling mint",
    "Icy Berries - Mixed berries with a frosty menthol twist",
]
# Fruit menu (stored ids 21-30).
FRUIT = [
    "Purple Grape - Bold, ripe grape with a sweet finish",
    "Ruby Berry - Juicy berry mix with a hint of strawberry",
    "Tropical Mango - Smooth mango with passionfruit and citrus notes",
    "Tropical Ice - Exotic fruits cooled with a hint of menthol",
    "Strawberry Ice - Sweet strawberries with a fresh chill",
    "Berry Ice - Tangy cherries softened by menthol",
    "Coco - Lightly creamy coconut with a clean finish",
    "Zesty Orange - Bright citrus orange with cooling sparkle",
    "Tangy Lime - Sharp, refreshing lime with a zesty bite",
    "Melon Fresh - Sweet, summery watermelon with a frosty finish",
]


def main():
    """Run the interactive intake questionnaire and record one respondent.

    The session ends early (nothing is stored) for respondents under 18 or
    respondents who have never used nicotine. Otherwise one value per field is
    appended to ``User_data`` and the stored row is echoed back.
    """
    print("Welcome! Please answer the following questions.")

    # Gate 1: adults only.
    age = ask_int("Enter your age: ", 0, 120)
    if age < 18:
        print("Velo is for adult (18+) users. Thanks for your time.")
        return

    # Gate 2: existing nicotine users only.
    if ask_binary_yes_no("Have you ever used nicotine before?") == 0:
        print("This product contains nicotine and is intended for existing nicotine users. Thank you.")
        return

    # Demographics (gender is stored as 1=male, 0=female; menus store their 1-based index).
    picked_gender = ask_choice("What is your gender?", ["Male", "Female"])
    is_male = 1 if picked_gender == 1 else 0
    region = ask_choice("What is your region?", ["North", "Central", "Southern", "SNB"])
    is_urban = ask_binary_yes_no(
        "Do you live in an URBAN area (major city)?\n"
        "Examples: Karachi, Lahore, Islamabad, Multan.\n"
        "Answer 'Yes' if you live in/near a major city; otherwise 'No' for rural/suburban."
    )
    occupation = ask_choice("What is your occupation?", ["Employed", "Unemployed", "Student"])
    user_category = ask_choice("What is your preferred form of nicotine intake?", ["Vapor", "Combustible", "Others"])

    # Product experience: the Velo and strength questions are only asked of
    # modern-oral users; everyone else gets "not a Velo user" and strength 1 (Low).
    existing_modern_oral = ask_binary_yes_no("Have you ever used modern oral nicotine pouches before?")
    existing_velo_user, strength_preference = 0, 1
    if existing_modern_oral == 1:
        existing_velo_user = ask_binary_yes_no("Have you ever used Velo before?")
        strength_preference = ask_choice("What strength would you prefer?", ["Low", "Medium", "High"])

    preference_category = ask_choice(
        "Which of these 3 flavor categories sounds the best to you?",
        ["Spice - Bold, warm, and unique.",
         "Mint & Menthol - Fresh, cool, and crisp.",
         "Fruit - Sweet, juicy, and vibrant."]
    )

    # Per category: (heading, menu, pick prompt, id offset). Stored flavor ids
    # are Spice=1..10, Mint=11..20, Fruit=21..30; any category other than 1 or 2
    # falls through to Fruit.
    menus = {
        1: ("\nSelect 3 flavors from SPICE:", SPICE, "Your Spice choices (1-10)", 0),
        2: ("\nSelect 3 flavors from MINT & MENTHOL:", MINT_MENTHOL, "Your Mint & Menthol choices (1-10)", 10),
    }
    fruit_menu = ("\nSelect 3 flavors from FRUIT:", FRUIT, "Your Fruit choices (1-10)", 20)
    heading, menu, pick_prompt, id_offset = menus.get(preference_category, fruit_menu)

    print(heading)
    for number, description in enumerate(menu, start=1):
        print(f"{number}. {description}")
    choice1, choice2, choice3 = (pick + id_offset for pick in ask_three_from_ten(pick_prompt))

    # Next sequential ID (1 for the first respondent).
    recorded_ids = User_data["ID"]
    next_id = (recorded_ids[-1] + 1) if recorded_ids else 1

    # One row, keyed exactly like User_data; every value is stored as int.
    row = {
        "ID": next_id,
        "age": age,
        "gender": is_male,
        "region": region,
        "is_urban": is_urban,
        "occupation": occupation,
        "user_category": user_category,
        "existing_modern_oral": existing_modern_oral,
        "existing_velo_user": existing_velo_user,
        "strength_preference": strength_preference,
        "choice1": choice1,
        "choice2": choice2,
        "choice3": choice3,
    }
    for column, value in row.items():
        User_data[column].append(int(value))

    # Confirmation: echo the most recent value of every stored field.
    print("\nThanks! Your responses have been recorded.")
    print(f"Assigned ID: {next_id}")
    latest = {column: values[-1] for column, values in User_data.items()}
    for column, value in latest.items():
        print(f"{column}: {value}")

# Entry point: run the questionnaire once. A Ctrl-C (KeyboardInterrupt) during
# any prompt is caught here and reported with a short message.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nSession cancelled by user.")


Welcome! Please answer the following questions.
Enter your age: fhfh
Invalid input. Please enter a whole number.


In [None]:

# Let's run the provided code to generate the simulated personas and save it as a CSV file.
import numpy as np
import pandas as pd

# Attempt to import the 'files' module from google.colab.
# It is used at the end of this cell to trigger a browser download of the CSV.
# IN_COLAB records whether the import succeeded, so the same cell also runs
# where google.colab cannot be imported.
try:
    from google.colab import files
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Settings
N = 10_000  # number of simulated personas
rng = np.random.default_rng(2025)  # reproducible seed

# Decision tree branching probabilities
# Each *_P list is passed as `p=` to rng.choice together with the *_CHOICES
# list directly above it, so entries pair up position by position.
AREA_CHOICES = ["A1", "A2", "A3", "A4"]  # (North, Central, South, SNB)
AREA_P = [0.25, 0.25, 0.25, 0.25]
UR_CHOICES = ["Urban", "Rural"]
UR_P = [0.5, 0.5]
GENDER_CHOICES = ["Male", "Female"]
GENDER_P = [0.7, 0.3]
AGE_CHOICES = ["18-25", "26-40", "41+"]
AGE_P = [0.35, 0.40, 0.25]

# Flavors (30)
# Names only, in the same order as the intake menus (SPICE, MINT_MENTHOL, FRUIT);
# the list order also fixes the column order of the probability matrix below.
flavors = [
    "Cinnamon Flame", "Zesty Lime", "Peppermint Storm", "Mango Flame", "Creamy Latte",
    "Coffee", "Cinnamon", "Tropical Punch", "Black Cherry", "Citrus Burst",
    "Crispy Peppermint", "Bright Spearmint", "Arctic Peppermint", "Smart Mint", "Fresh Classic",
    "Peppermint", "Wintergreen", "Minty Lemon", "Minty Watermelon", "Icy Berries",
    "Purple Grape", "Ruby Berry", "Tropical Mango", "Tropical Ice", "Strawberry Ice",
    "Berry Ice", "Coco", "Zesty Orange", "Tangy Lime", "Melon Fresh"
]

# Map each flavor to a type for UR/Gender/Age multiplier rules
# The type (Spicy / Fruity / Mint / Coffee / Sweet) is the key used to look up
# UR_mult, Gender_mult and Age_mult; it is independent of the menu category.
flavor_type = {
    "Cinnamon Flame": "Spicy", "Zesty Lime": "Fruity", "Peppermint Storm": "Mint",
    "Mango Flame": "Spicy", "Creamy Latte": "Coffee", "Coffee": "Coffee",
    "Cinnamon": "Spicy", "Tropical Punch": "Fruity", "Black Cherry": "Fruity", "Citrus Burst": "Fruity",
    "Crispy Peppermint": "Mint", "Bright Spearmint": "Mint", "Arctic Peppermint": "Mint",
    "Smart Mint": "Mint", "Fresh Classic": "Mint", "Peppermint": "Mint",
    "Wintergreen": "Mint", "Minty Lemon": "Mint", "Minty Watermelon": "Mint", "Icy Berries": "Mint",
    "Purple Grape": "Fruity", "Ruby Berry": "Fruity", "Tropical Mango": "Fruity",
    "Tropical Ice": "Fruity", "Strawberry Ice": "Fruity", "Berry Ice": "Fruity",
    "Coco": "Sweet", "Zesty Orange": "Fruity", "Tangy Lime": "Fruity", "Melon Fresh": "Fruity"
}

# Area multipliers (A1..A4) per flavor
# Looked up as area_multipliers[area][flavor]; 1.0 leaves a flavor's weight
# unchanged for that area, values above/below 1.0 raise/lower it before the
# per-person normalisation. Every area lists all 30 flavors.
area_multipliers = {
    "A1": {
        "Cinnamon Flame": 1.1, "Zesty Lime": 1.0, "Peppermint Storm": 1.0, "Mango Flame": 1.2, "Creamy Latte": 0.9,
        "Coffee": 1.0, "Cinnamon": 1.1, "Tropical Punch": 1.0, "Black Cherry": 0.9, "Citrus Burst": 1.0,
        "Crispy Peppermint": 1.2, "Bright Spearmint": 1.1, "Arctic Peppermint": 1.1, "Smart Mint": 1.0, "Fresh Classic": 1.0,
        "Peppermint": 1.1, "Wintergreen": 1.0, "Minty Lemon": 1.0, "Minty Watermelon": 0.9, "Icy Berries": 1.0,
        "Purple Grape": 1.0, "Ruby Berry": 1.1, "Tropical Mango": 1.2, "Tropical Ice": 1.0, "Strawberry Ice": 1.0,
        "Berry Ice": 1.0, "Coco": 0.9, "Zesty Orange": 1.0, "Tangy Lime": 1.0, "Melon Fresh": 1.0
    },
    "A2": {
        "Cinnamon Flame": 1.2, "Zesty Lime": 1.0, "Peppermint Storm": 1.1, "Mango Flame": 1.3, "Creamy Latte": 1.1,
        "Coffee": 1.2, "Cinnamon": 1.1, "Tropical Punch": 1.0, "Black Cherry": 0.9, "Citrus Burst": 1.0,
        "Crispy Peppermint": 1.0, "Bright Spearmint": 1.1, "Arctic Peppermint": 1.0, "Smart Mint": 1.0, "Fresh Classic": 1.1,
        "Peppermint": 1.0, "Wintergreen": 1.0, "Minty Lemon": 1.0, "Minty Watermelon": 0.9, "Icy Berries": 1.0,
        "Purple Grape": 0.9, "Ruby Berry": 1.0, "Tropical Mango": 1.2, "Tropical Ice": 1.1, "Strawberry Ice": 1.0,
        "Berry Ice": 1.0, "Coco": 1.1, "Zesty Orange": 1.0, "Tangy Lime": 1.0, "Melon Fresh": 1.0
    },
    "A3": {
        "Cinnamon Flame": 1.0, "Zesty Lime": 1.1, "Peppermint Storm": 1.0, "Mango Flame": 1.1, "Creamy Latte": 0.9,
        "Coffee": 0.9, "Cinnamon": 1.0, "Tropical Punch": 1.2, "Black Cherry": 1.0, "Citrus Burst": 1.1,
        "Crispy Peppermint": 1.0, "Bright Spearmint": 1.0, "Arctic Peppermint": 1.1, "Smart Mint": 1.0, "Fresh Classic": 1.0,
        "Peppermint": 1.0, "Wintergreen": 1.1, "Minty Lemon": 1.0, "Minty Watermelon": 1.1, "Icy Berries": 1.1,
        "Purple Grape": 1.2, "Ruby Berry": 1.2, "Tropical Mango": 1.1, "Tropical Ice": 1.2, "Strawberry Ice": 1.2,
        "Berry Ice": 1.1, "Coco": 1.0, "Zesty Orange": 1.1, "Tangy Lime": 1.1, "Melon Fresh": 1.2
    },
    "A4": {
        "Cinnamon Flame": 0.9, "Zesty Lime": 1.0, "Peppermint Storm": 1.1, "Mango Flame": 1.0, "Creamy Latte": 1.0,
        "Coffee": 1.1, "Cinnamon": 1.0, "Tropical Punch": 0.9, "Black Cherry": 1.1, "Citrus Burst": 1.0,
        "Crispy Peppermint": 1.1, "Bright Spearmint": 1.0, "Arctic Peppermint": 1.1, "Smart Mint": 1.0, "Fresh Classic": 1.1,
        "Peppermint": 1.0, "Wintergreen": 1.1, "Minty Lemon": 1.0, "Minty Watermelon": 1.0, "Icy Berries": 1.1,
        "Purple Grape": 1.0, "Ruby Berry": 1.1, "Tropical Mango": 1.0, "Tropical Ice": 1.0, "Strawberry Ice": 1.0,
        "Berry Ice": 1.1, "Coco": 1.0, "Zesty Orange": 1.0, "Tangy Lime": 1.0, "Melon Fresh": 1.0
    }
}

# UR / Gender / Age multipliers by flavor type
# Outer key = flavor type (a value of flavor_type), inner key = the persona's
# attribute value; looked up as <table>[flavor_type[flavor]][attribute].
UR_mult = {
    "Coffee": {"Urban": 1.2, "Rural": 0.8},
    "Spicy": {"Urban": 1.0, "Rural": 1.1},
    "Mint": {"Urban": 1.2, "Rural": 0.9},
    "Fruity": {"Urban": 1.0, "Rural": 1.1},
    "Sweet": {"Urban": 1.1, "Rural": 0.9}
}
# NOTE: the "Creamy Latte" entries below are keyed by a flavor NAME, not a
# flavor type. They were added to work around an earlier KeyError, but with
# flavor_type as defined in this cell ("Creamy Latte" -> "Coffee") the lookups
# in the probability loop never reach them, and UR_mult has no such entry.
Gender_mult = {
    "Coffee": {"Male": 1.2, "Female": 0.9},
    "Creamy Latte": {"Male": 1.2, "Female": 0.9},
    "Spicy": {"Male": 1.2, "Female": 0.9},
    "Mint": {"Male": 1.0, "Female": 1.1},
    "Fruity": {"Male": 0.9, "Female": 1.2},
    "Sweet": {"Male": 0.9, "Female": 1.1}
}
Age_mult = {
    "Coffee": {"18-25": 0.9, "26-40": 1.2, "41+": 1.3},
    "Creamy Latte": {"18-25": 0.9, "26-40": 1.2, "41+": 1.3},
    "Spicy": {"18-25": 0.9, "26-40": 1.1, "41+": 1.2},
    "Mint": {"18-25": 1.2, "26-40": 1.0, "41+": 0.8},
    "Fruity": {"18-25": 1.2, "26-40": 1.0, "41+": 0.9},
    "Sweet": {"18-25": 1.1, "26-40": 1.0, "41+": 0.9}
}

# Generate the personas: each attribute is drawn independently per person.
# The draw order (area, urban/rural, gender, age) is part of the seeded
# random stream and must not be rearranged.
areas = rng.choice(AREA_CHOICES, size=N, p=AREA_P)
urs = rng.choice(UR_CHOICES, size=N, p=UR_P)
genders = rng.choice(GENDER_CHOICES, size=N, p=GENDER_P)
ages = rng.choice(AGE_CHOICES, size=N, p=AGE_P)

# Unnormalised weight of flavor j for person i:
#   area multiplier x urban/rural multiplier x gender multiplier x age multiplier
prob_matrix = np.zeros((N, len(flavors)), dtype=float)
for j, flv in enumerate(flavors):
    ftype = flavor_type[flv]
    by_area = area_multipliers
    by_ur, by_gender, by_age = UR_mult[ftype], Gender_mult[ftype], Age_mult[ftype]
    am = np.fromiter((by_area[a][flv] for a in areas), dtype=float, count=N)
    um = np.fromiter((by_ur[u] for u in urs), dtype=float, count=N)
    gm = np.fromiter((by_gender[g] for g in genders), dtype=float, count=N)
    ag = np.fromiter((by_age[a] for a in ages), dtype=float, count=N)
    prob_matrix[:, j] = am * um * gm * ag

# Normalise every row to sum to 1; an all-zero row is divided by 1 instead of 0.
row_sums = prob_matrix.sum(axis=1, keepdims=True)
row_sums = np.where(row_sums == 0, 1.0, row_sums)
prob_matrix = prob_matrix / row_sums

# Per person: draw 3 distinct flavors, weighted by that person's probabilities.
top3 = [
    list(rng.choice(flavors, size=3, replace=False, p=person_probs))
    for person_probs in prob_matrix
]

# Build one flat record per persona: id and demographics, the three sampled
# flavors, then one prob_<flavor> entry per flavor.
prob_names = [f"prob_{flv}" for flv in flavors]
records = [
    {
        "PersonaID": persona_id,
        "Area": area,
        "UrbanRural": ur,
        "Gender": gender,
        "AgeGroup": age_group,
        "TopFlavor1": picks[0],
        "TopFlavor2": picks[1],
        "TopFlavor3": picks[2],
        **dict(zip(prob_names, person_probs)),
    }
    for persona_id, (area, ur, gender, age_group, picks, person_probs) in enumerate(
        zip(areas, urs, genders, ages, top3, prob_matrix), start=1
    )
]

# Fix the column order: metadata first, probability columns after.
df_out = pd.DataFrame.from_records(records)
meta_cols = ["PersonaID", "Area", "UrbanRural", "Gender", "AgeGroup", "TopFlavor1", "TopFlavor2", "TopFlavor3"]
prob_cols = [c for c in df_out.columns if c.startswith("prob_")]
df_out = df_out[meta_cols + prob_cols]

# Write the CSV into the current working directory.
file_path = 'simulated_personas_10000.csv'
df_out.to_csv(file_path, index=False)

# Outside Colab only report where the file is; inside Colab also start a download.
if not IN_COLAB:
    print(f"File '{file_path}' has been saved in the current directory.")
    print("If you are running this on a remote server, you may need to transfer it to your local machine.")
else:
    print(f"File '{file_path}' is ready for download.")
    files.download(file_path)

In [None]:
import pandas as pd
from google.colab import files
file_path_dict = files.upload()   # opens the Colab upload dialog; the result is indexed by file name below

# Use the first uploaded file's name as the path to read.
file_path = list(file_path_dict.keys())[0]

# The file is parsed as comma-separated, which matches what the simulation
# cell writes with DataFrame.to_csv. If a ';'-separated export is uploaded
# instead, change the delimiter here.
df = pd.read_csv(file_path, delimiter=",")

# Scoring: a persona's 1st / 2nd / 3rd flavor is worth 2.0 / 1.5 / 1.0 points.
points = {"TopFlavor1": 2.0, "TopFlavor2": 1.5, "TopFlavor3": 1.0}

# Long format: one row per (persona, choice slot), keeping Area and UrbanRural.
flavor_scores = df.melt(
    id_vars=["Area", "UrbanRural"],
    value_vars=list(points),
    var_name="Choice",
    value_name="Flavor",
)

# Attach the points earned by each choice slot.
flavor_scores["Points"] = flavor_scores["Choice"].map(points)

# Total points per Area / UrbanRural / Flavor.
segment_keys = ["Area", "UrbanRural"]
score_summary = flavor_scores.groupby(segment_keys + ["Flavor"], as_index=False)["Points"].sum()

# Within each Area / UrbanRural segment keep the 5 highest-scoring flavors.
ranked = score_summary.sort_values(segment_keys + ["Points"], ascending=[True, True, False])
top_flavors = ranked.groupby(segment_keys).head(5)

# Display the results
print(top_flavors)

In [None]:
import matplotlib.pyplot as plt

# Map Area codes to region names (adds a "Region" column to score_summary)
area_map = {"A1": "North", "A2": "Central", "A3": "South", "A4": "SNB"}
score_summary["Region"] = score_summary["Area"].map(area_map)

# One bar chart per Region + Urban/Rural segment, flavors ordered by points
for (region, ur), group in score_summary.groupby(["Region", "UrbanRural"]):
    # Sort by points (highest to lowest); the fresh 0..n-1 index doubles as
    # the position in the `colors` list below.
    group_sorted = group.sort_values("Points", ascending=False).reset_index(drop=True)

    # Default colors = gray
    colors = ["gray"] * len(group_sorted)

    # Top 3 = green
    for i in group_sorted.head(3).index:
        colors[i] = "green"

    # Bottom 3 = red (applied last, so it wins if a segment has fewer than 6 flavors)
    for i in group_sorted.tail(3).index:
        colors[i] = "red"

    # Plot
    plt.figure(figsize=(12, 6))
    bars = plt.bar(
        group_sorted["Flavor"],
        group_sorted["Points"],
        color=colors,
        edgecolor="black",
        linewidth=0.5
    )

    # Add value labels on top. Scores are sums of 2.0 / 1.5 / 1.0, so totals
    # can end in .5; show one decimal instead of truncating with int().
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2,
            height + 5,
            f"{height:.1f}",
            ha="center", va="bottom", fontsize=9
        )

    plt.title(f"Flavor Preferences - {region} ({ur})", fontsize=16)
    plt.xlabel("Flavor", fontsize=12)
    plt.ylabel("Points", fontsize=12)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


# Small hand-made sample used to demonstrate keyword counting.
# NOTE: this rebinds the notebook-wide name `df`.
df = pd.DataFrame({
    "UrbanRural": ["Urban", "Rural", "Urban", "Rural", "Urban"],
    "TopFlavor1": ["Mango Twist", "Coffee Kick", "Cinnamon Spice", "Berry Burst", "Citrus Zing"],
    "TopFlavor2": ["Berry Burst", "Vanilla Smooth", "Mango Twist", "Citrus Zing", "Grape Wave"],
    "TopFlavor3": ["Grape Wave", "Cinnamon Spice", "Coffee Kick", "Mango Twist", "Berry Burst"]
})

# One space-separated text per row holding all three flavor names.
df["all_flavors"] = df["TopFlavor1"].str.cat([df["TopFlavor2"], df["TopFlavor3"]], sep=" ")

# Keywords to count (fixed vocabulary, so the output columns are exactly these).
fruit_vocab = ["mango", "berry", "citrus", "grape", "vanilla", "coffee", "cinnamon"]

vectorizer = CountVectorizer(vocabulary=fruit_vocab, lowercase=True)
X = vectorizer.fit_transform(df["all_flavors"])

# Dense per-row keyword counts, one column per vocabulary word.
fruit_counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Put Urban/Rural next to the counts (rows align on the default 0..n-1 index)
# and total the counts per group.
fruit_summary = df[["UrbanRural"]].join(fruit_counts)
grouped = fruit_summary.groupby("UrbanRural").sum()

print(grouped)


            mango  berry  citrus  grape  vanilla  coffee  cinnamon
UrbanRural                                                        
Rural           1      1       1      0        1       1         1
Urban           2      2       1      2        0       1         1


In [None]:
# Histograms: one per region, showing counts of Male vs Female per age group

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Map Area -> Region names (adds a "Region" column to df_out)
area_map = {"A1": "North", "A2": "Central", "A3": "South", "A4": "SNB"}
df_out["Region"] = df_out["Area"].map(area_map)

# Orders for consistent plotting
age_order = ["18-25", "26-40", "41+"]
gender_order = ["Male", "Female"]
region_order = ["North", "Central", "South", "SNB"]

# Plot one grouped bar chart per region
for region in region_order:
    sub = df_out[df_out["Region"] == region]

    # counts by AgeGroup × Gender
    # Both axes are reindexed with fill_value=0 so an age group or gender that
    # is absent from a region shows as 0 instead of NaN (NaN would make the
    # int() in the value labels below raise).
    grouped = (
        sub.groupby(["AgeGroup", "Gender"])
           .size()
           .unstack(fill_value=0)              # columns = genders
           .reindex(index=age_order, columns=gender_order, fill_value=0)
    )

    x = np.arange(len(age_order))
    width = 0.38

    # Male bars sit left of each tick, Female bars right of it.
    plt.figure(figsize=(12, 6))
    bars_m = plt.bar(x - width/2, grouped["Male"].values, width, label="Male")
    bars_f = plt.bar(x + width/2, grouped["Female"].values, width, label="Female")

    plt.title(f"Male vs Female Counts by Age Group — {region}")
    plt.xlabel("Age Group")
    plt.ylabel("Count")
    plt.xticks(x, age_order)
    plt.legend(title="Gender")

    # value labels
    for b in list(bars_m) + list(bars_f):
        h = b.get_height()
        plt.text(b.get_x() + b.get_width()/2, h, f"{int(h)}", ha="center", va="bottom", fontsize=9)

    plt.tight_layout()
    plt.show()

In [None]:
# Urban & Rural share per taste token (single script; assumes `df` already exists)

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Flavor name -> list of taste tokens.
# A flavor may carry several tokens (e.g. "Mango Flame" counts as both
# "spicy" and "mango"). The set of all tokens used here becomes the
# CountVectorizer vocabulary further down, and flavors missing from this
# mapping contribute no tokens (see tags_for).
FLAVOR_TO_TAGS = {
    "Cinnamon Flame": ["spicy"],
    "Zesty Lime": ["lime"],
    "Peppermint Storm": ["mint"],
    "Mango Flame": ["spicy", "mango"],
    "Creamy Latte": ["coffee"],
    "Coffee": ["coffee"],
    "Cinnamon": ["spicy"],
    "Tropical Punch": ["tropical"],
    "Black Cherry": ["cherry"],
    "Citrus Burst": ["citrus"],
    "Crispy Peppermint": ["mint"],
    "Bright Spearmint": ["mint"],
    "Arctic Peppermint": ["mint"],
    "Smart Mint": ["mint"],
    "Fresh Classic": ["mint"],
    "Peppermint": ["mint"],
    "Wintergreen": ["mint"],
    "Minty Lemon": ["mint", "lemon"],
    "Minty Watermelon": ["mint", "watermelon"],
    "Icy Berries": ["mint", "berry"],
    "Purple Grape": ["grape"],
    "Ruby Berry": ["berry"],
    "Tropical Mango": ["mango"],
    "Tropical Ice": ["tropical"],
    "Strawberry Ice": ["strawberry"],
    "Berry Ice": ["berry"],
    "Coco": ["coconut", "sweet"],
    "Zesty Orange": ["orange"],
    "Tangy Lime": ["lime"],
    "Melon Fresh": ["melon"],
}

# Build per-row taste text from TopFlavor1–3
def tags_for(f):
    """Return the taste tokens listed for flavor *f*, or an empty list if it has no entry."""
    try:
        return FLAVOR_TO_TAGS[f]
    except KeyError:
        return []

# Per-row taste text: the tokens of the row's three flavors, space-separated.
# A nested generator flattens the token lists (avoids the quadratic
# sum(list_of_lists, []) idiom).
top_cols = ["TopFlavor1", "TopFlavor2", "TopFlavor3"]
taste_text = df[top_cols].apply(
    lambda r: " ".join(tag for c in top_cols for tag in tags_for(r[c])),
    axis=1
)

# CountVectorizer over the exact token vocab
VOCAB = sorted({t for tags in FLAVOR_TO_TAGS.values() for t in tags})
vec = CountVectorizer(vocabulary=VOCAB, lowercase=True, token_pattern=r"[A-Za-z]+")
X = vec.fit_transform(taste_text)
token_counts = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

# Aggregate counts by Urban/Rural (a missing group becomes a row of zeros)
counts_by_ur = (
    pd.concat([df[["UrbanRural"]].reset_index(drop=True), token_counts], axis=1)
      .groupby("UrbanRural").sum()
      .reindex(["Urban","Rural"]).fillna(0)
)

# Shares per token
# Tokens with no occurrences get a NaN denominator (float NaN, not pd.NA, so
# the columns stay numeric and .round(4) below actually rounds); their shares
# are then reported as 0.
den = counts_by_ur.sum(axis=0).replace(0, float("nan"))  # Urban+Rural per token
urban_share = (counts_by_ur.loc["Urban"] / den)
rural_share = (counts_by_ur.loc["Rural"] / den)

shares = pd.DataFrame({
    "UrbanShare": urban_share,
    "RuralShare": rural_share
}).fillna(0).sort_values("UrbanShare", ascending=False).round(4)

print(shares)


In [None]:
# Rebuild the per-row taste text (tokens of the three flavors, space-separated).
flavor_slots = ["TopFlavor1", "TopFlavor2", "TopFlavor3"]
taste_text = df[flavor_slots].apply(
    lambda r: " ".join(tag for slot in flavor_slots for tag in tags_for(r[slot])),
    axis=1,
)

# Count tokens per row against the fixed vocabulary of all known tokens.
VOCAB = sorted({t for tags in FLAVOR_TO_TAGS.values() for t in tags})
vec = CountVectorizer(vocabulary=VOCAB, lowercase=True, token_pattern=r"[A-Za-z]+")
X = vec.fit_transform(taste_text)
token_counts = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

# Token totals per Urban/Rural group, then each group's share of every token.
labelled_counts = pd.concat([df[["UrbanRural"]].reset_index(drop=True), token_counts], axis=1)
counts_by_ur = labelled_counts.groupby("UrbanRural").sum().reindex(["Urban","Rural"]).fillna(0)
den = counts_by_ur.sum(axis=0).replace(0, np.nan)  # NaN avoids dividing by zero
shares = (
    pd.DataFrame({
        "UrbanShare": counts_by_ur.loc["Urban"] / den,
        "RuralShare": counts_by_ur.loc["Rural"] / den,
    })
    .fillna(0)
    # Highest urban share first, for nicer plotting.
    .sort_values("UrbanShare", ascending=False)
)

tokens = shares.index.tolist()
x = np.arange(len(tokens))
# Nudge the two series apart horizontally so their markers do not overlap.
xu, xr = x - 0.12, x + 0.12

plt.figure(figsize=(12, 6))
plt.scatter(xu, shares["UrbanShare"], s=60, color="green", label="Urban")
plt.scatter(xr, shares["RuralShare"], s=60, color="red", label="Rural")

# Value labels: urban above its marker, rural below its marker.
for x_pos, share in zip(xu, shares["UrbanShare"]):
    plt.text(x_pos, share + 0.015, f"{share:.2f}", ha="center", va="bottom", fontsize=8, color="green")
for x_pos, share in zip(xr, shares["RuralShare"]):
    plt.text(x_pos, share - 0.03, f"{share:.2f}", ha="center", va="top", fontsize=8, color="red")

plt.xticks(x, tokens, rotation=45, ha="right")
plt.ylim(0, 1)
plt.ylabel("Share within token")
plt.title("Urban (green) vs Rural (red) share per taste token")
plt.grid(axis="y", alpha=0.25)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Load the simulated personas written by the simulation cell.
df = pd.read_csv("simulated_personas_10000.csv")

# Stack the three TopFlavor columns into one series and count each flavor name.
# NOTE: this rebinds the notebook-wide names `df` and `flavors`.
flavor_cols = ["TopFlavor1", "TopFlavor2", "TopFlavor3"]
stacked = [df[col] for col in flavor_cols]
flavors = pd.concat(stacked, ignore_index=True).dropna()
freq = flavors.value_counts().to_dict()  # counts per exact flavor name

# Replace spaces with non-breaking spaces so every FULL flavor name is one token.
freq_phrase = {}
for name, count in freq.items():
    freq_phrase[name.replace(" ", "\u00A0")] = count

cloud_options = dict(
    width=1400,
    height=700,
    background_color="white",
    collocations=False,         # don't auto-make bigrams that duplicate names
    normalize_plurals=False,    # don't merge singular/plural variants
    repeat=False,               # do NOT re-draw the same token
    regexp=r"[\w\u00A0\-']+",
    max_words=len(freq_phrase)  # include each unique flavor once
)
wc = WordCloud(**cloud_options)

img = wc.generate_from_frequencies(freq_phrase)

# Show the rendered cloud without axes.
plt.figure(figsize=(14, 7))
plt.imshow(img, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.show()
