# Generation

## statistics

In [5]:
import pandas as pd

country_list = pd.read_csv("data/OAG/all/country_distribution_merged_lowercase.csv")
country_list

Unnamed: 0,country,count,percent
0,china,231579,17.253649
1,usa,148225,11.043411
2,france,102881,7.665085
3,japan,91606,6.825048
4,germany,85593,6.377053
...,...,...,...
94,bhutan,21,0.001565
95,north korea,16,0.001192
96,andorra,14,0.001043
97,laos,10,0.000745


In [7]:
need_supplement = country_list[(country_list["count"] < 6000) & (country_list["count"] > 100)]
need_supplement

Unnamed: 0,country,count,percent
33,malaysia,5679,0.42311
34,thailand,5403,0.402547
35,south africa,4872,0.362985
36,romania,4813,0.35859
37,norway,4794,0.357174
38,ukraine,3972,0.295931
39,hungary,3581,0.2668
40,bangladesh,3389,0.252495
41,chile,3271,0.243704
42,egypt,3144,0.234242


In [8]:
# Load your country tiers mapping
tiers = pd.read_csv("data/OAG/name_tier.csv")

# Select Tier 1 countries
tier1 = tiers[tiers["tier"] == 1]

# Merge with your existing 'need_supplement' DataFrame to find overlaps
overlap = pd.merge(need_supplement, tier1, on="country", how="inner")

# Show result
print(overlap[["country", "count", "percent", "tier"]])
print(f"\nTotal countries needing supplement: {len(overlap)}")


        country  count   percent  tier
0       romania   4813  0.358590     1
1        norway   4794  0.357174     1
2       hungary   3581  0.266800     1
3         chile   3271  0.243704     1
4   new zealand   3004  0.223811     1
5       ireland   2644  0.196990     1
6       vietnam   2475  0.184398     1
7      colombia   2183  0.162643     1
8      bulgaria   2107  0.156981     1
9       croatia   1935  0.144166     1
10     slovenia   1824  0.135896     1
11   luxembourg    907  0.067575     1
12    venezuela    862  0.064223     1
13       cyprus    800  0.059604     1
14      uruguay    797  0.059380     1
15  philippines    663  0.049396     1
16   kazakhstan    525  0.039115     1
17      ecuador    466  0.034719     1
18         peru    370  0.027567     1
19       panama    354  0.026375     1
20   costa rica    341  0.025406     1
21    guatemala    323  0.024065     1
22   uzbekistan    277  0.020638     1
23        malta    193  0.014379     1
24     paraguay    164  0

In [9]:
need_supplement = overlap

In [None]:
need_supplement["to_generate"] = 6000 - need_supplement["count"]
total_needed = need_supplement["to_generate"].sum()

print(need_supplement)
print("Total samples needed:", total_needed)


### distinct first/last names

In [None]:
import pandas as pd

first_name = pd.read_csv("/Users/mc/Desktop/Name2Demo/experiments/data/OAG/first_name_freq_by_country.csv")
last_name = pd.read_csv("/Users/mc/Desktop/Name2Demo/experiments/data/OAG/last_name_freq_by_country.csv")

In [None]:
first_name

In [None]:
first_distinct = (
    first_name.groupby("country")["first_name"]
    .nunique()
    .reset_index(name="distinct_first_name_count")
)

# --- Count distinct last names per country ---
last_distinct = (
    last_name.groupby("country")["last_name"]
    .nunique()
    .reset_index(name="distinct_last_name_count")
)

# --- Merge results ---
distinct_name_counts = pd.merge(first_distinct, last_distinct, on="country", how="outer").fillna(0)

# --- Sort for clarity ---
distinct_name_counts = distinct_name_counts.sort_values("distinct_first_name_count", ascending=False)

print(distinct_name_counts)
print(distinct_name_counts.describe())

In [None]:
country_list = pd.read_csv("data/OAG/country_distribution_merged_lowercase.csv")
country_list

In [None]:
# --- Merge by country ---
merged = country_list.merge(distinct_name_counts, on="country", how="left")

# --- Compute ratios ---
merged["first_name_diversity_ratio"] = merged["distinct_first_name_count"] / merged["count"]
merged["last_name_diversity_ratio"]  = merged["distinct_last_name_count"] / merged["count"]

# --- Handle missing or infinite values ---
merged = merged.fillna(0)

# --- Optional: sort to inspect extremes ---
merged = merged.sort_values("first_name_diversity_ratio", ascending=False)

# --- Display summary ---
print(merged[["country", "count", "distinct_first_name_count", "distinct_last_name_count",
              "first_name_diversity_ratio", "last_name_diversity_ratio"]].head())

print(merged.describe())

In [None]:
merged.to_csv("distinct_first_and_last.csv")

## generation

In [10]:
prompt0 = """
Generate {} realistic full names for people from {}.
Each line should contain a unique full name (first and last name).
Avoid repeating the same first or last names more than 3 times.
"""

In [None]:
from openai import OpenAI
import time

client = OpenAI(api_key="")

def name_generation_openai(number, country, model_type, user_prompt):
    """Send one prediction request and return model output safely."""
    try:
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are an expert in demography."},
                {"role": "user", "content": user_prompt.format(number, country)},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error for '{number, country}': {e}")
        time.sleep(30)
        try:
            # one retry after waiting
            response = client.chat.completions.create(
                model=model_type,
                messages=[
                    {"role": "system", "content": "You are an expert in demography."},
                    {"role": "user", "content": user_prompt.format(number, country)},
                ],
            )
            return response.choices[0].message.content
        except Exception as e2:
            print(f"Retry failed for '{number, country}': {e2}")
            return None

In [12]:
import math
import pandas as pd

def generate_names_for_country(total_needed, country, model_type, user_prompt, batch_size=500):
    """
    Generate pseudo names for a given country in batches and save to CSV.
    Ensures exact total count and outputs only one column of names.
    """
    all_results = []
    num_batches = math.ceil(total_needed / batch_size)
    
    print(f"Generating {total_needed} names for {country} in {num_batches} batch(es)...")

    for i in range(num_batches):
        current_batch_size = min(batch_size, total_needed - len(all_results))
        if current_batch_size <= 0:
            break

        print(f"Batch {i+1}/{num_batches} ‚Üí requesting {current_batch_size} names...")

        result = name_generation_openai(current_batch_size, country, model_type, user_prompt)
        
        if not result:
            print(f"‚ö†Ô∏è Batch {i+1} for {country} failed. Skipping.")
            continue

        # Split the LLM output into clean individual names
        names = [n.strip(" -‚Ä¢1234567890.") for n in result.split("\n") if n.strip()]
        
        all_results.extend(names)

        # Truncate if overshooting (some models return extra lines)
        if len(all_results) >= total_needed:
            all_results = all_results[:total_needed]
            break

        print(f"‚úÖ {len(all_results)}/{total_needed} names collected so far...")

    # Convert to DataFrame (single column only)
    df = pd.DataFrame({"name": all_results})

    # Ensure folder path exists
    output_path = f"generated/{country.replace(' ', '_')}_{total_needed}.csv"
    df.to_csv(output_path, index=False)
    
    print(f"‚úÖ Finished {country}: {len(df)} names saved to {output_path}")
    return df


In [13]:
# generate_names_for_country(2000, "pakistan", "gpt-5", prompt0)

In [15]:
need_supplement 

Unnamed: 0,country,count,percent,tier
0,romania,4813,0.35859,1
1,norway,4794,0.357174,1
2,hungary,3581,0.2668,1
3,chile,3271,0.243704,1
4,new zealand,3004,0.223811,1
5,ireland,2644,0.19699,1
6,vietnam,2475,0.184398,1
7,colombia,2183,0.162643,1
8,bulgaria,2107,0.156981,1
9,croatia,1935,0.144166,1


In [16]:
import concurrent.futures
import math
import time
from tqdm import tqdm

# --- CONFIGURATION ---
COUNTRIES_PER_ITER = 5          # number of threads per round
MODEL_TYPE = "gpt-5"      # or whichever model you prefer
USER_PROMPT = prompt0
TOTAL_PER_COUNTRY = 5000        # number of samples per country

# make sure need_supplement is already loaded
# e.g. need_supplement = pd.read_csv("need_supplement.csv")

def run_parallel_generation(df, model_type=MODEL_TYPE, user_prompt=USER_PROMPT, batch_size=COUNTRIES_PER_ITER):
    """
    Run name generation concurrently in batches of countries.
    Uses ThreadPoolExecutor to avoid hitting API rate limits.
    """
    total_countries = len(df)
    print(f"üåç Starting generation for {total_countries} countries ({batch_size} at a time)...")

    for i in range(0, total_countries, batch_size):
        batch = df.iloc[i:i+batch_size]
        print(f"\nüöÄ Batch {i//batch_size + 1}: {', '.join(batch['country'])}")

        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {
                executor.submit(
                    generate_names_for_country,
                    TOTAL_PER_COUNTRY,
                    row.country,
                    model_type,
                    user_prompt
                ): row.country
                for _, row in batch.iterrows()
            }

            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
                country = futures[future]
                try:
                    _ = future.result()
                    print(f"‚úÖ Finished {country}")
                except Exception as e:
                    print(f"‚ö†Ô∏è Error for {country}: {e}")

        # optional cooldown to avoid hitting OpenAI rate limits
        print("‚è∏Ô∏è Cooling down for 30 seconds before next batch...")
        time.sleep(30)

    print("\nüéâ All countries processed!")

# --- RUN ---
run_parallel_generation(need_supplement)


üåç Starting generation for 25 countries (5 at a time)...

üöÄ Batch 1: romania, norway, hungary, chile, new zealand
Generating 5000 names for romania in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for norway in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for hungary in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for chile in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for new zealand in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...


  0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 999/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 994/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1499/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requ

 20%|‚ñà‚ñà        | 1/5 [44:04<2:56:17, 2644.28s/it]

‚úÖ Finished hungary: 5000 names saved to generated/hungary_5000.csv
‚úÖ Finished hungary
‚úÖ 4000/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...
‚úÖ 4741/5000 names collected so far...
Batch 9/10 ‚Üí requesting 259 names...
‚úÖ 3997/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [48:57<1:03:03, 1261.20s/it]

‚úÖ Finished norway: 5000 names saved to generated/norway_5000.csv
‚úÖ Finished norway
‚úÖ 4497/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...
‚úÖ 4498/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [54:34<27:58, 839.12s/it]   

‚úÖ 4998/5000 names collected so far...
‚úÖ Finished romania: 4998 names saved to generated/romania_5000.csv
‚úÖ Finished romania
‚úÖ 3500/5000 names collected so far...
Batch 8/10 ‚Üí requesting 500 names...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [55:55<08:59, 539.93s/it]

‚úÖ 4997/5000 names collected so far...
‚úÖ Finished new zealand: 4997 names saved to generated/new_zealand_5000.csv
‚úÖ Finished new zealand
‚úÖ 4000/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...
‚úÖ 4500/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [1:08:15<00:00, 819.17s/it]

‚úÖ Finished chile: 5000 names saved to generated/chile_5000.csv
‚úÖ Finished chile
‚è∏Ô∏è Cooling down for 30 seconds before next batch...






üöÄ Batch 2: ireland, vietnam, colombia, bulgaria, croatia
Generating 5000 names for ireland in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for vietnam in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for colombia in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for bulgaria in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for croatia in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...


  0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 999/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1499/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
Error for '(500, 'croatia')': Connection error.
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
Error for '(500, 'vietnam')': Request timed out.
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
Error for '(500, 'bulgaria')': Connection error.
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 

 20%|‚ñà‚ñà        | 1/5 [46:57<3:07:48, 2817.23s/it]

‚úÖ Finished colombia: 5000 names saved to generated/colombia_5000.csv
‚úÖ Finished colombia
‚úÖ 3996/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [48:39<1:01:00, 1220.08s/it]

‚úÖ 4999/5000 names collected so far...
‚úÖ Finished croatia: 4999 names saved to generated/croatia_5000.csv
‚úÖ Finished croatia
‚úÖ 4496/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [50:12<23:30, 705.33s/it]   

‚úÖ Finished ireland: 5000 names saved to generated/ireland_5000.csv
‚úÖ Finished ireland
‚úÖ 4496/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [57:37<10:02, 602.63s/it]

‚úÖ 4996/5000 names collected so far...
‚úÖ Finished vietnam: 4996 names saved to generated/vietnam_5000.csv
‚úÖ Finished vietnam


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [57:43<00:00, 692.70s/it]

‚úÖ 4996/5000 names collected so far...
‚úÖ Finished bulgaria: 4996 names saved to generated/bulgaria_5000.csv
‚úÖ Finished bulgaria
‚è∏Ô∏è Cooling down for 30 seconds before next batch...






üöÄ Batch 3: slovenia, luxembourg, venezuela, cyprus, uruguay
Generating 5000 names for slovenia in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for luxembourg in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for venezuela in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for cyprus in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for uruguay in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...


  0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 499/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 999/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1499/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 2000/5000 names collected so far...
Batch 5/10 ‚Üí requesting 500 names...
‚úÖ 1999/5000 names collected so far...
Batch 5/10 ‚Üí req

 20%|‚ñà‚ñà        | 1/5 [47:59<3:11:59, 2879.97s/it]

‚úÖ Finished venezuela: 5000 names saved to generated/venezuela_5000.csv
‚úÖ Finished venezuela
‚úÖ 4450/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...
‚úÖ 2992/5000 names collected so far...
Batch 7/10 ‚Üí requesting 500 names...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [52:12<1:06:42, 1334.17s/it]

‚úÖ 4950/5000 names collected so far...
‚úÖ Finished slovenia: 4950 names saved to generated/slovenia_5000.csv
‚úÖ Finished slovenia


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [52:14<24:11, 725.87s/it]   

‚úÖ Finished luxembourg: 5000 names saved to generated/luxembourg_5000.csv
‚úÖ Finished luxembourg


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [53:09<07:41, 461.23s/it]

‚úÖ Finished uruguay: 5000 names saved to generated/uruguay_5000.csv
‚úÖ Finished uruguay
‚úÖ 3492/5000 names collected so far...
Batch 8/10 ‚Üí requesting 500 names...
‚úÖ 3992/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...
‚úÖ 4490/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [1:29:16<00:00, 1071.29s/it]

‚úÖ 4991/5000 names collected so far...
‚úÖ Finished cyprus: 4991 names saved to generated/cyprus_5000.csv
‚úÖ Finished cyprus
‚è∏Ô∏è Cooling down for 30 seconds before next batch...






üöÄ Batch 4: philippines, kazakhstan, ecuador, peru, panama
Generating 5000 names for philippines in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for kazakhstan in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for ecuador in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for peru in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for panama in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...


  0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 2000/5000 names collected so far...
Batch 5/10 ‚Üí re

 20%|‚ñà‚ñà        | 1/5 [30:37<2:02:29, 1837.32s/it]

‚úÖ 4998/5000 names collected so far...
‚úÖ Finished philippines: 4998 names saved to generated/philippines_5000.csv
‚úÖ Finished philippines


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [31:48<39:55, 798.58s/it]   

‚úÖ Finished kazakhstan: 5000 names saved to generated/kazakhstan_5000.csv
‚úÖ Finished kazakhstan


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [31:53<14:32, 436.09s/it]

‚úÖ Finished ecuador: 5000 names saved to generated/ecuador_5000.csv
‚úÖ Finished ecuador
‚úÖ 3500/5000 names collected so far...
Batch 8/10 ‚Üí requesting 500 names...
‚úÖ 4500/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...
‚úÖ 4000/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [34:55<05:35, 335.85s/it]

‚úÖ Finished panama: 5000 names saved to generated/panama_5000.csv
‚úÖ Finished panama
‚úÖ 4500/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [38:50<00:00, 466.02s/it]

‚úÖ Finished peru: 5000 names saved to generated/peru_5000.csv
‚úÖ Finished peru
‚è∏Ô∏è Cooling down for 30 seconds before next batch...






üöÄ Batch 5: costa rica, guatemala, uzbekistan, malta, paraguay
Generating 5000 names for costa rica in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for guatemala in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for uzbekistan in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for malta in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...
Generating 5000 names for paraguay in 10 batch(es)...
Batch 1/10 ‚Üí requesting 500 names...


  0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 500/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 499/5000 names collected so far...
Batch 2/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 1500/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1498/5000 names collected so far...
Batch 4/10 ‚Üí requesting 500 names...
‚úÖ 1000/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 999/5000 names collected so far...
Batch 3/10 ‚Üí requesting 500 names...
‚úÖ 2000/5000 names collected so far...
Batch 5/10 ‚Üí req

 20%|‚ñà‚ñà        | 1/5 [23:19<1:33:16, 1399.23s/it]

‚úÖ Finished guatemala: 5000 names saved to generated/guatemala_5000.csv
‚úÖ Finished guatemala
‚úÖ 3500/5000 names collected so far...
Batch 8/10 ‚Üí requesting 500 names...
‚úÖ 3500/5000 names collected so far...
Batch 8/10 ‚Üí requesting 500 names...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [25:39<32:56, 658.73s/it]   

‚úÖ 4978/5000 names collected so far...
‚úÖ Finished costa rica: 4978 names saved to generated/costa_rica_5000.csv
‚úÖ Finished costa rica
‚úÖ 3497/5000 names collected so far...
Batch 8/10 ‚Üí requesting 500 names...
‚úÖ 4000/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...
‚úÖ 4000/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...
‚úÖ 3997/5000 names collected so far...
Batch 9/10 ‚Üí requesting 500 names...
‚úÖ 4500/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...
‚úÖ 4500/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [32:45<18:24, 552.18s/it]

‚úÖ Finished paraguay: 5000 names saved to generated/paraguay_5000.csv
‚úÖ Finished paraguay
‚úÖ 4497/5000 names collected so far...
Batch 10/10 ‚Üí requesting 500 names...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [33:33<05:53, 353.31s/it]

‚úÖ Finished malta: 5000 names saved to generated/malta_5000.csv
‚úÖ Finished malta


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [37:06<00:00, 445.29s/it]

‚úÖ 4997/5000 names collected so far...
‚úÖ Finished uzbekistan: 4997 names saved to generated/uzbekistan_5000.csv
‚úÖ Finished uzbekistan
‚è∏Ô∏è Cooling down for 30 seconds before next batch...






üéâ All countries processed!


## evaluation

In [None]:
import pandas as pd

df_true = pd.read_json("generated/small_countries_lt6000/pakistan.jsonl", lines=True)
df_generated = pd.read_csv("generated/pakistan_2000.csv")

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import matplotlib.pyplot as plt

# --- 1Ô∏è‚É£ Basic setup ---
real_names = df_true['name'].dropna().astype(str).str.strip()
gen_names  = df_generated['name'].dropna().astype(str).str.strip()

print(f"Real names: {len(real_names)} | Generated names: {len(gen_names)}")

# --- 2Ô∏è‚É£ Basic descriptive statistics ---
def name_stats(series):
    lengths = series.str.len()
    words = series.str.split().map(len)
    chars = Counter("".join(series))
    return {
        "avg_length": lengths.mean(),
        "median_length": lengths.median(),
        "avg_word_count": np.mean(words),
        "unique_names": series.nunique(),
        "duplication_ratio": 1 - series.nunique()/len(series),
        "top_5_chars": dict(Counter(chars).most_common(5))
    }

stats_real = name_stats(real_names)
stats_gen = name_stats(gen_names)

print("\nüìä Basic Statistics Comparison:")
print(pd.DataFrame([stats_real, stats_gen], index=['Real','Generated']))

# --- 3Ô∏è‚É£ Length histograms ---
plt.hist(real_names.str.len(), bins=30, alpha=0.5, label='Real')
plt.hist(gen_names.str.len(), bins=30, alpha=0.5, label='Generated')
plt.xlabel('Name length (characters)')
plt.ylabel('Count')
plt.legend()
plt.title('Distribution of name lengths')
plt.show()

# --- 4Ô∏è‚É£ Lexical (character n-gram) similarity ---
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3,3), min_df=2)
X_real = tfidf.fit_transform(real_names)
X_gen  = tfidf.transform(gen_names)

# Pairwise cosine similarity (each generated vs all real)
sim = cosine_similarity(X_gen, X_real)
avg_sim = sim.mean()
max_sim = sim.max(axis=1).mean()

print(f"\nüß© Average cosine similarity (all pairs): {avg_sim:.3f}")
print(f"üîç Average max similarity (nearest real name): {max_sim:.3f}")

# --- 5Ô∏è‚É£ Duplication across datasets ---
intersect = set(real_names.str.lower()) & set(gen_names.str.lower())
print(f"\n‚ö†Ô∏è Overlapping (exact match) names: {len(intersect)} "
      f"({len(intersect)/len(gen_names)*100:.2f}% of generated)")

# --- 6Ô∏è‚É£ Placeholder for back-prediction (if you have classifier) ---
# pred_real = name2nat.predict(real_names)
# pred_gen  = name2nat.predict(gen_names)
# accuracy_gen = (pred_gen == 'new zealand').mean()
# print(f"Classifier consistency on generated data: {accuracy_gen:.2%}")
