In [10]:
# imports

import os
from dotenv import load_dotenv
import google.generativeai
from IPython.display import Markdown, display, update_display

In [11]:
# Load environment variables in a file called .env
# Print the key prefixes to help with any debugging

load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY')

In [12]:
# This is the set up code for Gemini
# Having problems with Google Gemini setup? Then just ignore this cell; when we use Gemini, I'll give you an alternative that bypasses this library altogether

google.generativeai.configure()

In [13]:
system_message = "You are an assistant that is great at telling jokes"
user_prompt = "Tell a light-hearted joke for an audience of Data Scientists"

In [14]:
prompts = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
  ]

In [22]:
# The API for Gemini has a slightly different structure.
# I've heard that on some PCs, this Gemini code causes the Kernel to crash.
# If that happens to you, please skip this cell and use the next cell instead - an alternative approach.

gemini = google.generativeai.GenerativeModel(
    model_name='gemini-1.5-flash',
    system_instruction=system_message
)
response = gemini.generate_content(user_prompt)
print(response.text)

Why was the data scientist sad?  Because he didn't get the results he'd hoped for... and his p-value was greater than 0.05!



# LangChain

In [15]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.pydantic_v1 import BaseModel
from langchain.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX


In [16]:
llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0,
        max_token=50,
        timeout=None,
        max_retries=2
        )

In [193]:
ai_msg = llm.invoke(prompts)
ai_msg

AIMessage(content='Why was the dataset always tired?\n\nBecause it kept running regressions all night! \n', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run-7f614158-b349-45b4-a095-aa5c41cfc47d-0', usage_metadata={'input_tokens': 23, 'output_tokens': 18, 'total_tokens': 41, 'input_token_details': {'cache_read': 0}})

## Define class object 

In [73]:
from pydantic import BaseModel, Field
class PotentialAccidents(BaseModel):
    patient_id : int = Field(..., description="6 digit numeric id of the patient")
    patient_name: str = Field(..., description="A full name, e.g., John Doe")
    diagonosis_code :str= Field(..., description="Diagnosis Code starts with character in (D,R,C,S) followed by 4 numbers, e.g., D2312")
    insurance_claim_amount :float= Field(..., description="Dollar amount of the claims. e.g $123")


In [158]:
def generate_prompt_from_pydantic(model: BaseModel):
    schema = model.schema()["properties"]
    schema_description = []

    for field_name, field_info in schema.items():
        field_type = field_info.get("type", "unknown")
        description = field_info.get("description", "")
        
        # Add constraints like minimum, maximum values
        constraints = []
        if "minimum" in field_info:
            constraints.append(f"minimum: {field_info['minimum']}")
        if "maximum" in field_info:
            constraints.append(f"maximum: {field_info['maximum']}")

        # Combine type, description, and constraints
        constraint_str = f" ({', '.join(constraints)})" if constraints else ""
        schema_description.append(
            f'"{field_name}": "{field_type}{constraint_str} ({description})"'
        )
 
    schema_str = ",\n    ".join(schema_description)
    return f"""You are a synthetic data generator. Generate list of data in the following JSON format:
            Output each JSON object on a new line seperated by a comma 
            {{
                {schema_str}
            }}
            Ensure the output strictly adheres to this format and includes realistic and diverse values for each field."""


In [196]:
generate_schema(PotentialAccidents)

'"patient_id": "integer (6 digit numeric id of the patient)",\n    "patient_name": "string (A full name, e.g., John Doe)",\n    "diagonosis_code": "string (Diagnosis Code starts with character in (D,R,C,S) followed by 4 numbers, e.g., D2312)",\n    "insurance_claim_amount": "number (Dollar amount of the claims. e.g $123)"'

## Generate Sample Data

In [160]:
examples = [{"example": """patient_id:123456, patient_name=John Mulcahy, diagonosis_code=D3451,insurance_claim_amount=$452 """},
            {"example": """patient_id:542345, patient_name=Jyoti Kaur, diagonosis_code=C4565,insurance_claim_amount=$123 """},
            {"example": """patient_id:898767, patient_name=July Macnore, diagonosis_code=R5645,insurance_claim_amount=$523 """}
            ]

## Craft Prompt Template

In [161]:
GOOGLE_TEMPLATE = PromptTemplate(input_variables =["example"],template ="{example}")

prompt_template= FewShotPromptTemplate(
    prefix= SYNTHETIC_FEW_SHOT_PREFIX,
    examples = examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX, 
    input_variables=["subject","extra"],
    example_prompt = GOOGLE_TEMPLATE
)


In [162]:
generator = SyntheticDataGenerator(template=prompt_template, llm=llm)
results = generator.generate(subject="Patients Data", 
                             extra=generate_prompt_from_pydantic(PotentialAccidents)
                             , runs=1)

In [168]:
print(results)

['{"patient_id": "789012", "patient_name": "Alice Johnson", "diagonosis_code": "D1234", "insurance_claim_amount": "$321"},\n{"patient_id": "345678", "patient_name": "Bob Smith", "diagonosis_code": "R5678", "insurance_claim_amount": "$654"},\n{"patient_id": "901234", "patient_name": "Eva Garcia", "diagonosis_code": "C9012", "insurance_claim_amount": "$987"},\n{"patient_id": "567890", "patient_name": "Michael Lee", "diagonosis_code": "S3456", "insurance_claim_amount": "$210"},\n{"patient_id": "123456", "patient_name": "Maria Rodriguez", "diagonosis_code": "D7890", "insurance_claim_amount": "$543"},\n{"patient_id": "789012", "patient_name": "David Williams", "diagonosis_code": "R1234", "insurance_claim_amount": "$876"},\n{"patient_id": "345678", "patient_name": "Sarah Brown", "diagonosis_code": "C5678", "insurance_claim_amount": "$109"},\n{"patient_id": "901234", "patient_name": "Kevin Davis", "diagonosis_code": "S9012", "insurance_claim_amount": "$432"},\n{"patient_id": "567890", "patien

In [164]:
list_of_json_objects = json.loads("["+results[0]+"]")

In [165]:
df = pd.DataFrame(list_of_json_objects)

In [166]:
df

Unnamed: 0,patient_id,patient_name,diagonosis_code,insurance_claim_amount
0,789012,Alice Johnson,D1234,$321
1,345678,Bob Smith,R5678,$654
2,901234,Eva Garcia,C9012,$987
3,567890,Michael Lee,S3456,$210
4,123456,Maria Rodriguez,D7890,$543
5,789012,David Williams,R1234,$876
6,345678,Sarah Brown,C5678,$109
7,901234,Kevin Davis,S9012,$432
8,567890,Ashley Miller,D3456,$765
9,123456,Christopher Wilson,R7890,$098


## Augment data from Pandas Data Frame to handle class imbalance

In [40]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  # No limit on column width
pd.set_option('display.max_rows', None)  # No limit on the number of rows displayed
pd.set_option('display.max_columns', None)  # No limit on the number of columns displayed

 

In [19]:
 df = pd.read_excel('AccidentsData.xlsx')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0                425 non-null    int64         
 1   Data                      425 non-null    datetime64[ns]
 2   Countries                 425 non-null    object        
 3   Local                     425 non-null    object        
 4   Industry Sector           425 non-null    object        
 5   Accident Level            425 non-null    object        
 6   Potential Accident Level  425 non-null    object        
 7   Genre                     425 non-null    object        
 8   Employee or Third Party   425 non-null    object        
 9   Critical Risk             425 non-null    object        
 10  Description               425 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 36.7+ KB


In [21]:
# Drop the 'Unnamed: 0' column
df.drop('Unnamed: 0', axis=1, inplace=True)

# Rename the columns
df.rename(columns={
    'Data':'Date',
    'Countries':'Country',
    'Local':'Location',
    'Industry Sector':'Industry_Sector',
    'Accident Level':'Accident_Level',
    'Potential Accident Level':'Potential_Accident_Level',
    'Genre':'Gender',
    'Employee or Third Party':'Employee_Type',
    'Critical Risk':'Critical_Risk'
}, inplace=True)

In [22]:
# Convert ordinal levels to numeric
df_encoded = df.copy()

ordinal_mapping = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6}

df_encoded['Accident_Level'] = df_encoded['Accident_Level'].map(ordinal_mapping)
df_encoded['Potential_Accident_Level'] = df_encoded['Potential_Accident_Level'].map(ordinal_mapping)
 

In [29]:
def generate_schema(model: BaseModel):
    schema = model.schema()["properties"]
    schema_description = []

    for field_name, field_info in schema.items():
        field_type = field_info.get("type", "unknown")
        description = field_info.get("description", "")
        
        # Add constraints like minimum, maximum values
        constraints = []
        if "minimum" in field_info:
            constraints.append(f"minimum: {field_info['minimum']}")
        if "maximum" in field_info:
            constraints.append(f"maximum: {field_info['maximum']}")

        # Combine type, description, and constraints
        constraint_str = f" ({', '.join(constraints)})" if constraints else ""
        schema_description.append(
            f'"{field_name}": "{field_type}{constraint_str} ({description})"'
        )
 
    schema_str = ",\n    ".join(schema_description)
    return  schema_str
           

In [91]:
from pydantic import BaseModel, Field
from datetime import datetime
import json

class industry_accident(BaseModel):     
    Date: datetime= Field(..., description="timestamp or time/date information. valid range Range: January 2016 to July 2017")
    Country :str= Field(..., description="which country the accident occurred (anonymised). Can have values as Country_01, Country_02,  Country_03 ")
    Location :str= Field(..., description="the city where the manufacturing plant is located (anonymised). can contain values from Local_01, Local_02 to Local_12")
    Industry_Sector :str= Field(..., description="which sector the plant belongs to. valid values : Mining, Metal, Others")
    Accident_Level:str= Field(..., description="from 1 to 5, it registers how severe was the accident (I means not severe but VI means very severe).")
    Potential_Accident_Level:str= Field(..., description="Depending on the Accident Level, the database also registers how severe the accident could have been (due to other factors involved in the accident)")
    Gender:str= Field(..., description="if the person is male of female.")
    Employee_Type:str= Field(..., description="if the injured person is an employee or a third party. Unique values: Third Party, Employee, Third Party (Remote)")
    Critical_Risk:str= Field(..., description="some description of the risk involved in the accident.")
    Description:str= Field(..., description="Detailed description of how the accident happened.")
    

In [102]:
def data_augmentation(df, llm, targetField, targetFieldValue, sampleSize,noOfRowsToAdd):
    # Step 2: Apply a `where` clause to filter rows
    filtered_df = df.query(f"{targetField} == {targetFieldValue}")  # Example: Filter rows where `type` is 'data'
    
    
    examples = [
        {
            "example": f"""Date:{row.Date}, Country={row.Country}, Location={row.Location},Industry_Sector={row.Industry_Sector}
            ,Accident_Level={row.Accident_Level},Potential_Accident_Level={row.Potential_Accident_Level},Gender={row.Gender}
            ,Employee_Type={row.Employee_Type}
            ,Critical_Risk={row.Critical_Risk},Description={row.Description}"""
        }
        for _, row in filtered_df.sample(n=min(len(filtered_df), sampleSize)).iterrows()
    ]

    SYNTHETIC_FEW_SHOT_PREFIX=f"""You are a industry expert and are helping to generate Synthetic data. 
                                    Generate list of data in the following JSON format:                                  
                                    {{schema}}
                                    Output each JSON object on a new line seperated by a comma. Do not iutput keyword "JSON" in the output
                                    Generate value of {targetField} as {targetFieldValue} only, 
                                    description field should be as verbose as the examples and should be of approximately the same length.
                                    """

    SYNTHETIC_FEW_SHOT_SUFFIX="Ensure the output strictly adheres to this format and includes realistic and diverse values for each field."
    #create the prompt template 
    prompt_template = PromptTemplate (input_variables  =["example"],template ="{example}")
    
    #Configure Few Shot Prompt Template
    prompt_template= FewShotPromptTemplate(
        prefix= SYNTHETIC_FEW_SHOT_PREFIX,
        examples = examples,
        suffix= SYNTHETIC_FEW_SHOT_SUFFIX,
        input_variables=["schema"],
        example_prompt = prompt_template
    )

    generator = SyntheticDataGenerator(template=prompt_template, llm=llm)
    results = generator.generate(subject="Mining and Metal Industry Accident related data", 
                             schema=generate_schema(industry_accident)
                             , runs=1)
    list_of_json_objects = json.loads("["+results[0]+"]")
    data = pd.DataFrame(list_of_json_objects)
    
    return data
    

In [115]:
def augment_for_class_imbalance(llm,df, targetField,sampleSize):
    df_augmented = df.copy()
    # Get class counts
    class_counts = df[targetField].value_counts()
    
    # Find the maximum count (target for balancing)
    max_count = class_counts.max()
    
    # Calculate the rows needed for each class to reach the max count
    rows_to_add = {cls: max_count - count for cls, count in class_counts.items()}
    for targetClass, count in rows_to_add.items():
        if count>0:
            counter = 0
            
            print(f"Adding {count} rows to {targetClass}") 
            while counter < count:
                df_new= data_augmentation(df, llm, targetField, targetClass, sampleSize, count)
                df_augmented = pd.concat([df_augmented, df_new], axis=0, ignore_index=True)
                counter = counter + df_new.shape[0]
                print(f"Added { df_new.shape[0]} rows to {targetClass}") 
             
    
    return df_augmented

 
 

In [116]:
df_augmented= augment_for_class_imbalance(llm,df_encoded, "Potential_Accident_Level", 5)

Adding 37 rows to 3
Added 8 rows to 3
Added 8 rows to 3
Added 6 rows to 3
Added 6 rows to 3
Added 8 rows to 3
Added 7 rows to 3
Adding 48 rows to 2
Added 8 rows to 2
Added 8 rows to 2
Added 9 rows to 2
Added 7 rows to 2
Added 7 rows to 2
Added 9 rows to 2
Adding 94 rows to 1
Added 6 rows to 1
Added 6 rows to 1
Added 6 rows to 1
Added 6 rows to 1
Added 7 rows to 1
Added 8 rows to 1
Added 6 rows to 1
Added 7 rows to 1
Added 9 rows to 1


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Added 5 rows to 1


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [88]:
df_encoded.shape

(425, 10)

In [112]:
df_augmented.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467 entries, 0 to 466
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Date                      467 non-null    object
 1   Country                   467 non-null    object
 2   Location                  467 non-null    object
 3   Industry_Sector           467 non-null    object
 4   Accident_Level            467 non-null    object
 5   Potential_Accident_Level  467 non-null    object
 6   Gender                    467 non-null    object
 7   Employee_Type             467 non-null    object
 8   Critical_Risk             467 non-null    object
 9   Description               467 non-null    object
dtypes: object(10)
memory usage: 36.6+ KB


In [113]:
df_augmented.shape

(467, 10)

In [114]:
df_augmented.tail()

Unnamed: 0,Date,Country,Location,Industry_Sector,Accident_Level,Potential_Accident_Level,Gender,Employee_Type,Critical_Risk,Description
462,2016-09-27T08:15:00Z,Country_02,Local_05,Metals,2,3,Female,Employee,Pinch points,"During routine maintenance on a conveyor belt, a worker's hand became caught between the belt and a roller. The resulting injury required first aid treatment for minor abrasions and bruising."
463,2017-03-01T16:45:00Z,Country_03,Local_08,Others,1,3,Male,Third Party,"Slips, trips, and falls",A delivery driver slipped on a wet patch of concrete near the loading dock. The driver fell and sustained a minor sprain to their ankle.
464,2016-11-18T11:00:00Z,Country_01,Local_01,Mining,4,3,Male,Employee,Machinery malfunction,A malfunction in a ventilation fan caused a build-up of methane gas in a section of the mine. The resulting explosion caused significant damage to the ventilation system and resulted in minor injuries to two workers who were in the vicinity.
465,2017-05-23T19:30:00Z,Country_02,Local_12,Metals,1,3,Female,Third Party (Remote),Exposure to hazardous materials,A contractor working on the roof of a processing plant was exposed to a small amount of a chemical sealant. The worker experienced minor skin irritation and was treated with a topical cream.
466,2016-07-05T06:00:00Z,Country_03,Local_02,Others,3,3,Male,Employee,Electrical hazards,"An electrician received an electrical shock while repairing a faulty circuit breaker. The worker was wearing appropriate safety gear, which minimized the severity of the shock, but they still required medical observation."


In [53]:
df_augmented = data_augmentation(df_encoded, llm, "Potential_Accident_Level", 5, 10)

In [54]:
filtered_df = df_encoded.query("Potential_Accident_Level == 5") 
filtered_df.sample(n=min(len(filtered_df), 5))["Description"]

154                                                                                                                                                                                                                                                                                                                                                                   At approximately 5:45 pm, the operator Paulo (operator of the filters) informed the autoclave operator via radio of a leak on the side of the scruber. The autoclave III feed was stopped by the control and officials Georli and Renato initiated the procedures for closing the autoclave transfer valve for flash TQs. Soon after, there was a break in the chicken, projecting pulp hot and reaching three employees who were inside the room near the equipment.
379                                                                                                                                                                                         

In [41]:
df_augmented["Description"]

0                    Worker was struck by a falling piece of metal while working on the assembly line.
1       Third-party contractor slipped on a wet surface near the mine entrance and sustained injuries.
2      Employee experienced minor skin irritation due to chemical exposure during cleaning procedures.
3    Remote third-party technician received an electric shock while working on a faulty control panel.
4                         Mining equipment malfunctioned, resulting in serious injury to the operator.
5                           Employee developed back pain due to repetitive lifting of heavy materials.
6                 Third-party delivery driver tripped on uneven pavement and sustained minor injuries.
7                        A gas leak led to an explosion in the mine, resulting in multiple fatalities.
8                    Fire broke out in a remote third-party warehouse due to faulty electrical wiring.
9                               Worker was injured when caught between tw

In [25]:
# Step 2: Apply a `where` clause to filter rows
filtered_df = df_encoded.query("Potential_Accident_Level == 5")  # Example: Filter rows where `type` is 'data'

# Step 3: Convert filtered rows to JSON strings
#fewShotExamples  =  "".join(filtered_df.sample(n=min(len(filtered_df), 5)).to_dict(orient="records"))
accident_examples = [
    {
        "example": f"""Date:{row.Date}, Country={row.Country}, Location={row.Location},Industry_Sector={row.Industry_Sector}
        ,Accident_Level={row.Accident_Level},Potential_Accident_Level={row.Potential_Accident_Level},Gender={row.Gender}
        ,Employee_Type={row.Employee_Type}
        ,Critical_Risk={row.Critical_Risk},Description={row.Description}"""
    }
    for _, row in filtered_df.sample(n=min(len(filtered_df), 5)).iterrows()
]
# Step 2: Loop to pick different samples each time
#for i in range(3):  # Run the loop 3 times
 #   random_rows = filtered_df.sample(n=min(len(filtered_df), 5))  # Pick up to 5 rows
  #  print(f"Sample {i + 1}:\n{random_rows}\n")

In [26]:
SYNTHETIC_FEW_SHOT_PREFIX=f"""You are a synthetic data generator. Generate list of data in the following JSON format:
            Output each JSON object on a new line seperated by a comma. Do not iutput keyword "JSON" in the output
            {{schema}}
            """

SYNTHETIC_FEW_SHOT_SUFFIX="Ensure the output strictly adheres to this format and includes realistic and diverse values for each field."

In [27]:
prompt_template = PromptTemplate (input_variables  =["example"],template ="{example}")
#PROMPT_TEMPLATE = PromptTemplate(
#input_variables=['Country', 'Location', 'Industry_Sector', 'Accident_Level', 'Potential_Accident_Level', 'Gender', 'Employee_Type', 'Critical_Risk', 'Description', 'Month'],
 #   template="'Country:{Country}\nLocation:{Location}\nIndustry_Sector:{Industry_Sector}\nAccident_Level:{Accident_Level}\nPotential_Accident_Level:{Potential_Accident_Level}\nGender:{Gender }\nEmployee_Type:{Employee_Type }\nCritical_Risk:{Critical_Risk}\nDescription:{Description}\nMonth:{Month}\"
#)

prompt_template= FewShotPromptTemplate(
    prefix= SYNTHETIC_FEW_SHOT_PREFIX,
    examples = accident_examples,
    suffix= SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["schema"],
    example_prompt = prompt_template
)

In [30]:
generator = SyntheticDataGenerator(template=prompt_template, llm=llm)
results = generator.generate(subject="Mining and Metal Industry Accident related data", 
                             schema=generate_schema(industry_accident)
                             , runs=1)

In [31]:
results[0]

'{"Date": "2016-05-12 00:00:00", "Countries": "Country_01", "Location": "Local_07", "Industry_Sector": "Mining", "Accident_Level": "III", "Potential_Accident_Level": "IV", "Gender": "Male", "Employee_Type": "Employee", "Critical_Risk": "Falling objects", "Description": "While bolting the roof, a loose rock fell and struck the worker\'s shoulder."},\n{"Date": "2016-11-28 00:00:00", "Countries": "Country_02", "Location": "Local_03", "Industry_Sector": "Metal", "Accident_Level": "I", "Potential_Accident_Level": "II", "Gender": "Female", "Employee_Type": "Employee", "Critical_Risk": "Sharp edges", "Description": "Worker sustained a minor cut to her hand while handling sheet metal."},\n{"Date": "2017-01-03 00:00:00", "Countries": "Country_03", "Location": "Local_10", "Industry_Sector": "Others", "Accident_Level": "II", "Potential_Accident_Level": "III", "Gender": "Male", "Employee_Type": "Third Party", "Critical_Risk": "Slippery surface", "Description": "Delivery driver slipped and fell whi