## Data Augmentation using few-shot learning

- Here we will try to target the imbalance in our data leveraging state of the art few shot learning technique with LLM

- We are using Langchain to achieve this. The main advantage of generating synthetic data with LLM over traditional techniques like SMOTE and ADASYN is we can customize the augmentation process without loosing the essence of the original data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/drive/MyDrive/alda project/processed_data.csv')

In [3]:
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,priors_count.1,event
0,1,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Recidivism,1,Low,14/08/2013,Risk of Violence,1,Low,14/08/2013,0,0
1,2,michael ryan,michael,ryan,31/12/2014,Male,06/02/1985,31,25 - 45,Caucasian,...,Risk of Recidivism,5,Medium,31/12/2014,Risk of Violence,2,Low,31/12/2014,0,0
2,3,kevon dixon,kevon,dixon,27/01/2013,Male,22/01/1982,34,25 - 45,African-American,...,Risk of Recidivism,3,Low,27/01/2013,Risk of Violence,1,Low,27/01/2013,0,1
3,4,ed philo,ed,philo,14/04/2013,Male,14/05/1991,24,Less than 25,African-American,...,Risk of Recidivism,4,Low,14/04/2013,Risk of Violence,3,Low,14/04/2013,4,0
4,5,marcu brown,marcu,brown,13/01/2013,Male,21/01/1993,23,Less than 25,African-American,...,Risk of Recidivism,8,High,13/01/2013,Risk of Violence,6,Medium,13/01/2013,1,0


In [4]:
X = df[["age", "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", "c_charge_degree","is_recid"]]  # Features



In [5]:
X.is_recid.value_counts()

is_recid
0    7528
1    3772
Name: count, dtype: int64

In [6]:
minority_data = X[X.is_recid == 1].head(4)

In [7]:
# Initialize an empty list to store the formatted examples
formatted_examples = []

# Iterate over each row in the DataFrame
for index, row in df.head(50).iterrows():
    # Construct the string for the current row
    example_string = f"age: {row['age']}, juv_fel_count: {row['juv_fel_count']}, juv_misd_count: {row['juv_misd_count']}, juv_other_count: {row['juv_other_count']}, priors_count: {row['priors_count']}, c_charge_degree: {row['c_charge_degree']}, is_recid: {row['is_recid']}"
    # Append the constructed string as a dictionary to the list
    formatted_examples.append({"example": f'''{example_string}'''})

# Display the formatted examples list



In [8]:
formatted_examples

[{'example': 'age: 69, juv_fel_count: 0, juv_misd_count: 0, juv_other_count: 0, priors_count: 0, c_charge_degree: (F3), is_recid: 0'},
 {'example': 'age: 31, juv_fel_count: 0, juv_misd_count: 0, juv_other_count: 0, priors_count: 0, c_charge_degree: nan, is_recid: 0'},
 {'example': 'age: 34, juv_fel_count: 0, juv_misd_count: 0, juv_other_count: 0, priors_count: 0, c_charge_degree: (F3), is_recid: 1'},
 {'example': 'age: 24, juv_fel_count: 0, juv_misd_count: 0, juv_other_count: 1, priors_count: 4, c_charge_degree: (F3), is_recid: 1'},
 {'example': 'age: 23, juv_fel_count: 0, juv_misd_count: 1, juv_other_count: 0, priors_count: 1, c_charge_degree: (F3), is_recid: 0'},
 {'example': 'age: 43, juv_fel_count: 0, juv_misd_count: 0, juv_other_count: 0, priors_count: 2, c_charge_degree: (F7), is_recid: 0'},
 {'example': 'age: 44, juv_fel_count: 0, juv_misd_count: 0, juv_other_count: 0, priors_count: 0, c_charge_degree: (M1), is_recid: 0'},
 {'example': 'age: 41, juv_fel_count: 0, juv_misd_count:

In [9]:
!pip install -U langchain langchain_experimental openai


Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_experimental
  Downloading langchain_experimental-0.0.57-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.17.1-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.3/268.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.32-py3-none-any.whl (1.9 MB)
[2K     [9

In [10]:
import os
os.environ["OPENAI_API_KEY"] = "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"

In [11]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

In [12]:
class CriminalRecord(BaseModel):
    age: int
    juv_fel_count: int
    juv_misd_count: int
    juv_other_count: int
    priors_count: int
    c_charge_degree: str
    is_recid: int

In [13]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=formatted_examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

In [14]:
synthetic_data_generator = create_openai_data_generator(
    output_schema=CriminalRecord,
    llm=ChatOpenAI(temperature=1),
    prompt=prompt_template,
)

  warn_deprecated(


In [None]:
n_samples = 100
synthetic_results = synthetic_data_generator.generate(
    subject="CriminalRecord",
    extra="closely observe all the data samples. Don't create duplicates, try to generate good synthetic data that closely resembles the example given.",
    runs=n_samples,
)

In [16]:
synthetic_results

[CriminalRecord(age=34, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=3, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=29, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=2, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=32, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=3, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=27, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=3, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=31, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=4, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=40, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=4, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=35, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, priors_count=5, c_charge_degree='(F3)', is_recid=1),
 CriminalRecord(age=41, juv_fel_count=0, juv_misd_count=0, juv_other_count=0, prior

In [17]:
import pandas as pd

# Create a list of dictionaries from the objects
synthetic_data = []
for item in synthetic_results:
    synthetic_data.append({
        'age': item.age,
        'juv_fel_count': item.juv_fel_count,
        'juv_misd_count': item.juv_misd_count,
        'juv_other_count': item.juv_other_count,
        'priors_count': item.priors_count,
        'c_charge_degree': item.c_charge_degree,
        'is_recid' : item.is_recid
    })

# Create a Pandas DataFrame from the list of dictionaries
synthetic_df = pd.DataFrame(synthetic_data)

# Display the DataFrame
print(type(synthetic_df))
synthetic_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,is_recid
0,34,0,0,0,3,(F3),1
1,29,0,0,0,2,(F3),1
2,32,0,0,0,3,(F3),1
3,27,0,0,0,3,(F3),1
4,31,0,0,0,4,(F3),1
...,...,...,...,...,...,...,...
95,27,0,0,0,2,F3,0
96,28,0,0,1,4,F3,1
97,31,0,0,1,5,F3,1
98,33,0,0,1,6,F3,1


In [None]:
synthesized_df = pd.concat([X, synthetic_df], ignore_index=True)