diff --git a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md index 1b3934796..af7de6df6 100644 --- a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md +++ b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md @@ -16,16 +16,16 @@ | Notebook | Domain | Description | |---------------------------------------------------|---------------------|-----------------------------------------------------------------| -| [person-sampler-tutorial.ipynb](./advanced/person-samplers/person-sampler-tutorial.ipynb) | Persona Samplers | Generate realistic personas using the person sampler | -| [clinical-trials.ipynb](./advanced/healthcare-datasets/clinical-trials.ipynb) | Healthcare | Build synthetic clinical trial datasets with realistic PII for testing data protection | -| [insurance-claims.ipynb](./advanced/healthcare-datasets/insurance-claims.ipynb) | Healthcare | Create synthetic insurance claims datasets with realistic claim data and processing information | -| [physician-notes-with-realistic-personal-details.ipynb](./advanced/healthcare-datasets/physician-notes-with-realistic-personal-details.ipynb) | Healthcare | Generate realistic patient data and physician notes with embedded personal information | -| [w2-dataset.ipynb](./advanced/forms/w2-dataset.ipynb) | Forms & Documents | Generate synthetic W-2 tax form datasets with realistic employee and employer information | -| [multi-turn-conversation.ipynb](./advanced/multi-turn-chat/multi-turn-conversation.ipynb) | Conversational AI | Build synthetic conversational data with realistic person details and multi-turn dialogues | -| [visual-question-answering-using-vlm.ipynb](./advanced/multimodal/visual-question-answering-using-vlm.ipynb) | Multimodal | Create visual question answering datasets using Vision Language Models | -| 
[product-question-answer-generator.ipynb](./advanced/qa-generation/product-question-answer-generator.ipynb) | Q&A Generation | Build product information datasets with corresponding questions and answers | -| [generate-rag-evaluation-dataset.ipynb](./advanced/rag-examples/generate-rag-evaluation-dataset.ipynb) | RAG & Retrieval | Generate diverse RAG evaluation datasets for testing retrieval-augmented generation systems | -| [reasoning-traces.ipynb](./advanced/reasoning/reasoning-traces.ipynb) | Reasoning | Build synthetic reasoning traces to demonstrate step-by-step problem-solving processes | -| [text-to-python.ipynb](./advanced/text-to-code/text-to-python.ipynb) | Text-to-Code | Generate Python code from natural language instructions with validation and evaluation | -| [text-to-python-evol.ipynb](./advanced/text-to-code/text-to-python-evol.ipynb) | Text-to-Code | Build advanced Python code generation with evolutionary improvements and iterative refinement | -| [text-to-sql.ipynb](./advanced/text-to-code/text-to-sql.ipynb) | Text-to-Code | Create SQL queries from natural language descriptions with validation and testing | +| [person-sampler-tutorial.ipynb](./person-samplers/person-sampler-tutorial.ipynb) | Persona Samplers | Generate realistic personas using the person sampler | +| [clinical-trials.ipynb](./healthcare-datasets/clinical-trials.ipynb) | Healthcare | Build synthetic clinical trial datasets with realistic PII for testing data protection | +| [insurance-claims.ipynb](./healthcare-datasets/insurance-claims.ipynb) | Healthcare | Create synthetic insurance claims datasets with realistic claim data and processing information | +| [physician-notes-with-realistic-personal-details.ipynb](./healthcare-datasets/physician-notes-with-realistic-personal-details.ipynb) | Healthcare | Generate realistic patient data and physician notes with embedded personal information | +| [w2-dataset.ipynb](./forms/w2-dataset.ipynb) | Forms & Documents | Generate synthetic W-2 
tax form datasets with realistic employee and employer information | +| [multi-turn-conversation.ipynb](./multi-turn-chat/multi-turn-conversation.ipynb) | Conversational AI | Build synthetic conversational data with realistic person details and multi-turn dialogues | +| [visual-question-answering-using-vlm.ipynb](./multimodal/visual-question-answering-using-vlm.ipynb) | Multimodal | Create visual question answering datasets using Vision Language Models | +| [product-question-answer-generator.ipynb](./qa-generation/product-question-answer-generator.ipynb) | Q&A Generation | Build product information datasets with corresponding questions and answers | +| [generate-rag-evaluation-dataset.ipynb](./rag-examples/generate-rag-evaluation-dataset.ipynb) | RAG & Retrieval | Generate diverse RAG evaluation datasets for testing retrieval-augmented generation systems | +| [reasoning-traces.ipynb](./reasoning/reasoning-traces.ipynb) | Reasoning | Build synthetic reasoning traces to demonstrate step-by-step problem-solving processes | +| [text-to-python.ipynb](./text-to-code/text-to-python.ipynb) | Text-to-Code | Generate Python code from natural language instructions with validation and evaluation | +| [text-to-python-evol.ipynb](./text-to-code/text-to-python-evol.ipynb) | Text-to-Code | Build advanced Python code generation with evolutionary improvements and iterative refinement | +| [text-to-sql.ipynb](./text-to-code/text-to-sql.ipynb) | Text-to-Code | Create SQL queries from natural language descriptions with validation and testing | diff --git a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/forms/w2-dataset.ipynb b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/forms/w2-dataset.ipynb index 8cd491e5e..b3199b39a 100644 --- a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/forms/w2-dataset.ipynb +++ b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/forms/w2-dataset.ipynb @@ -1,592 +1,4 @@ 
{ -<<<<<<< HEAD - "cells": [ - { - "cell_type": "markdown", - "id": "00fcbf4b", - "metadata": {}, - "source": [ - "# ๐Ÿงพ NeMo Data Designer: W-2 Dataset Generator" - ] - }, - { - "cell_type": "markdown", - "id": "e8ca7bf9", - "metadata": {}, - "source": [ - "> โš ๏ธ **Warning**: NeMo Data Designer is current in Early Release and is not recommended for production use.\n", - ">\n", - "> **Note**: In order to run this notebook, you must have the NeMo Data Designer microservice deployed locally via docker compose. See the [deployment guide](http://docs.nvidia.com/nemo/microservices/latest/set-up/deploy-as-microservices/data-designer/docker-compose.html) for more details.\n", - ">\n", - "> Alternatively, you can use the [NeMo Data Designer managed service](https://build.nvidia.com/nemo/data-designer). Please refer the [intro-tutorials](../../intro-tutorials/1-the-basics.ipynb) on how to connect to it. \n", - ">\n", - "> **Note**: If you are using the NeMo Data Designer managed service, you will only be able to launch preview jobs. You will not be able to launch jobs using the `create` method." - ] - }, - { - "cell_type": "markdown", - "id": "016ba3fd", - "metadata": {}, - "source": [ - "In this notebook we demonstrate how you can combine numerical samplers, the person sampler and LLMs to create a synthetic dataset of W-2 forms (US Wage & Tax Statements).\n", - "\n", - "### Generating realistic numerical values\n", - "\n", - "We will use generate numerical fields using statistics published by the IRS for the year 2021:\n", - "\n", - "- https://www.irs.gov/pub/irs-pdf/p5385.pdf\n", - "\n", - "### Generating realistic taxpayers\n", - "\n", - "We will use the person sampler to generate realistic US taxpayers. 
When the US locale is chosen, statistics for generated persons reflect real-world census data.\n" - ] - }, - { - "cell_type": "markdown", - "id": "ed91fb59", - "metadata": {}, - "source": [ - "\n", - "#### ๐Ÿ’พ Install dependencies\n", - "\n", - "**IMPORTANT** ๐Ÿ‘‰ If you haven't already, follow the instructions in the [README](../../README.md) to install the necessary dependencies. Note you may need to restart your kernel after setting up the environment.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52263153", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo_microservices import NeMoMicroservices\n", - "from nemo_microservices.beta.data_designer import (\n", - " DataDesignerConfigBuilder,\n", - " DataDesignerClient,\n", - ")\n", - "from nemo_microservices.beta.data_designer.config import columns as C\n", - "from nemo_microservices.beta.data_designer.config import params as P" - ] - }, - { - "cell_type": "markdown", - "id": "94213d7c", - "metadata": {}, - "source": [ - "### โš™๏ธ Initialize the NeMo Data Designer Client\n", - "\n", - "- The data designer client is responsible for submitting generation requests to the Data Designer microservice.\n", - "- In this notebook, we connect to a local deployment of data designer. 
You can deploy your own instance of data designer by following the deployment instructions [here](https://docs.nvidia.com/nemo/microservices/latest/set-up/deploy-as-microservices/data-designer/docker-compose.html).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "834f3a96", - "metadata": {}, - "outputs": [], - "source": [ - "data_designer_client = DataDesignerClient(client=NeMoMicroservices(base_url=\"http://localhost:8080\"))" - ] - }, - { - "cell_type": "markdown", - "id": "fadf0f19", - "metadata": {}, - "source": [ - "### ๐Ÿ—๏ธ Initialize the Data Designer Config Builder\n", - "\n", - "- The Data Designer config defines the dataset schema and generation process.\n", - "\n", - "- The config builder provides an intuitive interface for building this configuration.\n", - "\n", - "- You must provide a list of model configs to the builder at initialization.\n", - "\n", - "- This list contains the models you can choose from (via the `model_alias` argument) during the generation process.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "a214afcd", - "metadata": {}, - "outputs": [], - "source": [ - "# We specify the endpoint of the model during deployment using the model_provider_registry.\n", - "model_id = \"nvidia/nvidia-nemotron-nano-9b-v2\"\n", - "model_alias = \"nemotron-nano-9b-v2\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dcc34745", - "metadata": {}, - "outputs": [], - "source": [ - "config_builder = DataDesignerConfigBuilder(\n", - " model_configs=[\n", - " P.ModelConfig(\n", - " alias=model_alias,\n", - " provider=\"nvidiabuild\",\n", - " model=model_id,\n", - " inference_parameters=P.InferenceParameters(\n", - " max_tokens=1024,\n", - " temperature=0.6,\n", - " top_p=0.9,\n", - " ),\n", - " is_reasoner=True\n", - " ),\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "bbcb3538", - "metadata": {}, - "source": [ - "## Setting up taxpayer and employer sampling" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "id": "149e2abf", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a samplers for an American taxpayer (employee), and employer.\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"taxpayer\",\n", - " type=P.SamplerType.PERSON,\n", - " params=P.PersonSamplerParams(\n", - " locale=\"en_US\",\n", - " age_range=[18, 75]\n", - " ),\n", - " )\n", - ")\n", - "\n", - "# While the employer isn't technically a \"person\", we'll use the person sampler for generating the employer address.\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"employer\",\n", - " type=P.SamplerType.PERSON,\n", - " params=P.PersonSamplerParams(\n", - " locale=\"en_US\",\n", - " ),\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "28397d74", - "metadata": {}, - "source": [ - "## Defining the fields\n", - "\n", - "We will focus on the following:\n", - "- Box 1 (Wages, tips, and other compensation)\n", - "- Box 2 (Federal income tax withheld)\n", - "- Box 3 (Social security wages)\n", - "- Box 4 (Social security tax withheld)\n", - "- Box 5 (Medicare wages and tips)\n", - "- Box 6 (Medicare tax withheld)\n", - "- Box 7 (Social security tips)\n", - "- Box a (Employee's social security number)\n", - "- Box c (Employer's name, address and zip code)\n", - "- Box e (Employee's fist name, initial, and last name)\n", - "- Box f (Employee's address and zip code)" - ] - }, - { - "cell_type": "markdown", - "id": "060f6c0f", - "metadata": {}, - "source": [ - "### Numerical fields\n", - "\n", - "Here, we'll define how to generate numerical samples for the currency fields of the W-2 (Boxes 1-7). We'll use the W-2 statistics from the IRS linked above to generate realistic samples." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be7e98e1", - "metadata": {}, - "outputs": [], - "source": [ - "### BOX 1 (TOTAL WAGES, TIPS, AND OTHER COMPENSATION) ###\n", - "\n", - "# From Page 6 of the IRS Statistics, we know that 276,388,660 / 277,981,454 W-2 forms had a non-zero value for Box 1 (99.4%).\n", - "# From Page 8 of the IRS Statistics, we know that the sum of this field across all forms was 9,920,000,000*$1000 = $9,920,000,000,000 dollars.\n", - "# Since there were 276,388,660 non-zero Box 1 values, the average value of Box 1 was $9,920,000,000,000 / 276,388,660 = $35,891.49.\n", - "# We will use a Bernoulli-Exponential mixture distribution to sample values for this field.\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"box_1_wages_tips_other_compensation\",\n", - " type=P.SamplerType.BERNOULLI_MIXTURE,\n", - " params=P.BernoulliMixtureSamplerParams(\n", - " p=0.994,\n", - " dist_name=\"expon\",\n", - " dist_params={\"scale\": 35891.49}\n", - " ),\n", - " convert_to=\"int\",\n", - " )\n", - ")\n", - "\n", - "### BOX 2 (FEDERAL INCOME TAX WITHHELD) ###\n", - "\n", - "# Note: The calculations below are a simplification based on the assumption that this is an individual's only W-2.\n", - "# In practice, the taxable income is based on all wages for individuals with multiple W-2s.\n", - "\n", - "# 2022 standard deduction\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"standard_deduction\",\n", - " expr=\"{% if taxpayer.marital_status == 'married_present' %}25900{% else %}12950{% endif %}\",\n", - " convert_to=\"float\",\n", - " ),\n", - ")\n", - "\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"taxable_income\",\n", - " expr=\"{{ [0, box_1_wages_tips_other_compensation - standard_deduction]|max }}\",\n", - " convert_to=\"float\",\n", - " )\n", - ")\n", - "\n", - "# We'll sum over the tax incurred at each 2022 tax bracket.\n", - "# For simplicity, 
we'll assume that the taxpayer is single here.\n", - "BRACKETS = [\n", - " {\"name\": \"bracket1\", \"rate\": 0.10, \"max\": 10275, \"min\": 0},\n", - " {\"name\": \"bracket2\", \"rate\": 0.12, \"max\": 41775, \"min\": 10275},\n", - " {\"name\": \"bracket3\", \"rate\": 0.22, \"max\": 89075, \"min\": 41775},\n", - " {\"name\": \"bracket4\", \"rate\": 0.24, \"max\": 170050, \"min\": 89075},\n", - " {\"name\": \"bracket5\", \"rate\": 0.32, \"max\": 215950, \"min\": 170050},\n", - " {\"name\": \"bracket6\", \"rate\": 0.35, \"max\": 539900, \"min\": 215950},\n", - " {\"name\": \"bracket7\", \"rate\": 0.37, \"max\": 10000000000000, \"min\": 539900},\n", - "]\n", - "for bracket in BRACKETS:\n", - " expression = f\"{bracket['rate']}*([[taxable_income,{bracket['max']}]|min - {bracket['min']}, 0] | max)\"\n", - " config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=bracket[\"name\"],\n", - " expr=\"{{ \" + expression + \" }}\",\n", - " convert_to=\"float\",\n", - " )\n", - " )\n", - "\n", - "# Sum the tax brackets to get the total withheld, on average\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"mean_tax_liability\",\n", - " expr=\"{{ bracket1 + bracket2 + bracket3 + bracket4 + bracket5 + bracket6 + bracket7 }}\",\n", - " convert_to=\"int\",\n", - " )\n", - ")\n", - "\n", - "# Add some noise to get the actual withholding\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"tax_liability_noise\",\n", - " type=P.SamplerType.GAUSSIAN,\n", - " params=P.GaussianSamplerParams(mean=1, stddev=0.1),\n", - " )\n", - ")\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_2_federal_income_tax_withheld\",\n", - " expr=\"{{ (mean_tax_liability * tax_liability_noise) | int }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX 3 (SOCIAL SECURITY WAGES) ###\n", - "\n", - "# From Page 8 of the IRS Statistics, we know that social security wages are, on average, 8,150,000,000/9,920,000,000 ~= 82.16% of 
total wages.\n", - "# We'll sample a ratio from a normal distribution with mean 0.8216 and standard deviation 0.2.\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"social_security_wages_ratio\",\n", - " type=P.SamplerType.GAUSSIAN,\n", - " params=P.GaussianSamplerParams(mean=0.8216, stddev=0.2),\n", - " convert_to=\"float\",\n", - " )\n", - ")\n", - "\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_3_social_security_wages\",\n", - " expr=\"{{ (box_1_wages_tips_other_compensation * social_security_wages_ratio) | int }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX 4 (SOCIAL SECURITY TAX WITHHELD) ###\n", - "\n", - "# In 2022, social security tax was withheld at a rate of 6.2% of social security wages, up to a maximum of $147,000.\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_4_social_security_tax_withheld\",\n", - " expr=\"{{ (([box_3_social_security_wages, 147000]|min) * 0.062) | int }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX 5 (MEDICARE WAGES AND TIPS) ###\n", - "\n", - "# From Page 8 of the IRS Statistics, we know that Medicare wages and tips are, on average, 10,300,000,000/9,920,000,000 ~= 103.8% of total wages.\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"medicare_wages_and_tips_ratio\",\n", - " type=P.SamplerType.GAUSSIAN,\n", - " params=P.GaussianSamplerParams(mean=1.038, stddev=0.2),\n", - " convert_to=\"float\",\n", - " )\n", - ")\n", - "\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_5_medicare_wages_and_tips\",\n", - " expr=\"{{ (box_1_wages_tips_other_compensation * medicare_wages_and_tips_ratio) | int }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX 6 (MEDICARE TAX WITHHELD) ###\n", - "\n", - "# The standard employee Medicare tax rate in 2022 was 1.45% on all Medicare wages.\n", - "# The Additional Medicare Tax rate in 2022 was 0.9% on all Medicare wages in excess of $200,000.\n", - 
"config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_6_medicare_tax_withheld\",\n", - " expr=\"{{ ((box_5_medicare_wages_and_tips * 0.0145) + (([box_5_medicare_wages_and_tips - 200000, 0]|max) * 0.009)) | int }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX 7 (SOCIAL SECURITY TIPS) ###\n", - "\n", - "# From Page 6 of the IRS Statistics, we know that only 12,620,946 / 277,981,454 W-2 forms had a non-zero value for Box 7 (4.54%).\n", - "# From Page 8 of the IRS Statistics, we know that the sum of this field across all forms was 55,897,014*$1000 = $55,897,014,000.\n", - "# Since there were 12,620,946 non-zero Box 7 values, the average value of Box 7 was $55,897,014,000 / 12,620,946 = $4428.91.\n", - "# We will use a Bernoulli-Exponential mixture distribution to sample values for this field.\n", - "config_builder.add_column(\n", - " C.SamplerColumn(\n", - " name=\"box_7_social_security_tips\",\n", - " type=P.SamplerType.BERNOULLI_MIXTURE,\n", - " params=P.BernoulliMixtureSamplerParams(\n", - " p=0.0454,\n", - " dist_name=\"expon\",\n", - " dist_params={\"scale\": 4428.91}\n", - " ),\n", - " convert_to=\"int\",\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f1cbd72b", - "metadata": {}, - "source": [ - "### Non-numerical fields\n", - "\n", - "The remaining fields contain information about the employee (taxpayer) and the employer. We'll use the person sampler in combination with an LLM to generate values here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf3ba45b", - "metadata": {}, - "outputs": [], - "source": [ - "### BOX A (EMPLOYEE'S SOCIAL SECURITY NUMBER) ###\n", - "\n", - "# We can use the ssn field of the person sampler to generate a valid SSN for the employee.\n", - "\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_a_employee_ssn\",\n", - " expr=\"{{ taxpayer.ssn }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX C (EMPLOYER'S NAME, ADDRESS AND ZIP CODE) ###\n", - "\n", - "# We want to generate a realistic company name.\n", - "# We'll start by generating a list of industries, expanded with magic.\n", - "config_builder.add_column(\n", - " C.LLMTextColumn(\n", - " name=\"employer_business\",\n", - " model_alias=model_alias,\n", - " system_prompt=(\"You are assisting a user generate synthetic W-2 forms.\"\n", - " \"You must generate a realistic industry category for the employer\"\n", - " \"eg: software, health insurance, shoe store, restaurant, plumbing\"),\n", - " prompt=(\"Generate the industry category for the employer. 
Ensure it is consistent with the employer location\"\n", - " \"City: {{ employer.city }}\\nState: {{ employer.state }}\"),\n", - " )\n", - ")\n", - "\n", - "# Next, we'll generate an actual name based on the type of business.\n", - "config_builder.add_column(\n", - " C.LLMTextColumn(\n", - " name=\"employer_name\",\n", - " model_alias=model_alias,\n", - " prompt=\"Generate an original name for a {{ employer_business }} business in {{ employer.city }}.\",\n", - " )\n", - ")\n", - "\n", - "# Finally, we'll combine the employer name with the address of the employer.\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_c_employer_name_address_zip\",\n", - " expr=\"{{ employer_name }}\\n{{ employer.street_number }} {{ employer.street_name }}\\n{{ employer.city }}, {{ employer.state }} {{ employer.postcode }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX E (EMPLOYEE'S FIRST NAME, INITIAL, AND LAST NAME) ###\n", - "\n", - "# We can extract the first name, initial, and last name from the person sampler.\n", - "\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_e_employee_first_name_initial_last_name\",\n", - " expr=\"{{ taxpayer.first_name }} {{ taxpayer.middle_name[:1] }} {{ taxpayer.last_name }}\",\n", - " )\n", - ")\n", - "\n", - "### BOX F (EMPLOYEE'S ADDRESS AND ZIP CODE) ###\n", - "\n", - "# Similarly, we can extract the employee's address and zip code from the person sampler.\n", - "\n", - "config_builder.add_column(\n", - " C.ExpressionColumn(\n", - " name=\"box_f_employee_address_zip\",\n", - " expr=\"{{ taxpayer.street_number }} {{ taxpayer.street_name }}\\n{{ taxpayer.city }}, {{ taxpayer.state }} {{ taxpayer.postcode }}\",\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "7800b823", - "metadata": {}, - "source": [ - "## Preview the dataset\n", - "\n", - "We'll define the actual columns we want to appear in the dataset and generate a small 10-row preview." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62432301", - "metadata": {}, - "outputs": [], - "source": [ - "# These are the columns we want in the final dataset, after dropping latent variables.\n", - "FINAL_COLUMNS = [\n", - " \"box_1_wages_tips_other_compensation\",\n", - " \"box_2_federal_income_tax_withheld\",\n", - " \"box_3_social_security_wages\",\n", - " \"box_4_social_security_tax_withheld\",\n", - " \"box_5_medicare_wages_and_tips\",\n", - " \"box_6_medicare_tax_withheld\",\n", - " \"box_7_social_security_tips\",\n", - " \"box_a_employee_ssn\",\n", - " \"box_c_employer_name_address_zip\",\n", - " \"box_e_employee_first_name_initial_last_name\",\n", - " \"box_f_employee_address_zip\",\n", - "]\n", - "\n", - "# Preview the results\n", - "preview = data_designer_client.preview(config_builder, verbose_logging=True)\n", - "preview.dataset[FINAL_COLUMNS]" - ] - }, - { - "cell_type": "markdown", - "id": "4925ab9d", - "metadata": {}, - "source": [ - "## Generating and Saving the Final Dataset\n", - "\n", - "Once we're happy with the preview, we can generate a larger dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21e660d4", - "metadata": {}, - "outputs": [], - "source": [ - "# Generate a final dataset\n", - "job_results = data_designer_client.create(config_builder, num_records=20, wait_until_done=False)\n", - "\n", - "job_results.wait_until_done()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a8b4dbe", - "metadata": {}, - "outputs": [], - "source": [ - "# Load the dataset into a pandas DataFrame\n", - "dataset = job_results.load_dataset()\n", - "\n", - "# Show the final dataset with only the W-2 relevant columns\n", - "final_dataset = dataset[FINAL_COLUMNS]\n", - "\n", - "print(f\"Generated dataset with {len(final_dataset)} records\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9eb0671", - "metadata": {}, - "outputs": [], - "source": [ - "# Create data directory if it doesn't exist\n", - "import os\n", - "os.makedirs(\"./data\", exist_ok=True)\n", - "\n", - "# Save the dataset to CSV\n", - "csv_filename = \"./data/synthetic-w2-dataset.csv\"\n", - "final_dataset.to_csv(csv_filename, index=False)\n", - "print(f\"Dataset saved to {csv_filename}\")\n", - "\n", - "# Show a sample of the final dataset\n", - "final_dataset.head()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "sdg_venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -======= "cells": [ { "cell_type": "markdown", @@ -1033,10 +445,10 @@ " LLMTextColumnConfig(\n", " name=\"employer_business\",\n", " model_alias=MODEL_ALIAS,\n", - " system_prompt=(\"You are assisting a user generate synthetic W-2 forms.\\n\"\n", - " \"You must generate a realistic industry category for the 
employer\\n\"\n", + " system_prompt=(\"You are assisting a user generate synthetic W-2 forms.\\n\"\n", + " \"You must generate a realistic industry category for the employer\\n\"\n", " \"eg: software, health insurance, shoe store, restaurant, plumbing /no_think\"),\n", - " prompt=(\"Generate the industry category for the employer. Ensure it is consistent with the employer location\\n\"\n", + " prompt=(\"Generate the industry category for the employer. Ensure it is consistent with the employer location\\n\"\n", " \"City: {{ employer.city }}\\nState: {{ employer.state }}\"),\n", " )\n", ")\n", @@ -1237,5 +649,4 @@ }, "nbformat": 4, "nbformat_minor": 5 ->>>>>>> 8b9be04 (refactored w2 dataset notebook for 25.10) }