In [None]:
# | default_exp solution_gpt

# GPT Solution

> Ask GPT to perform the data cleaning <br>
<br>
- I think LLMs have the potential to be *incredibly* useful in the field of data cleaning / ETL <br>
- While some data transformations require thoughtful planning, many require basic knowledge of language <br>
    - Example: You need to clean survey results and parse people's locations, but the data is messed up!! <br>
        - Some people put cities like 'NYC', 'Windy City', & 'san fran' <br> 
    - Formatting those names isn't difficult, but the task can often be long and frustrating <br>
    - I believe problems like this can be solved by AI *right now* and may help improve efficiency <br>
- I'm going to demonstrate the usefulness of LLMs in data cleaning through the use of 2 libraries: <br>
    1. [openai](https://platform.openai.com/docs/api-reference) <br>
    2. [pandas_gpt](https://github.com/rvanasa/pandas-gpt) <br>

## Configuring OpenAI Credentials

In [None]:
# | export
import pandas as pd
import openai
import pandas_gpt
import json
import os

from dotenv import load_dotenv
from virtuous_interview.utils import contacts, contact_methods, gifts
from nbdev.showdoc import *

In [None]:
# | export
# Load environment variables via python-dotenv (presumably from a local .env
# file) so that OPENAI_API_KEY is available to os.getenv.
load_dotenv()

In [None]:
# | exports
# Take the API key from the environment instead of hard-coding it.
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# |export
class GptPrompt:
    """Reusable chat prompt sent to the OpenAI chat-completion API.

    `messages` is the fixed conversation prefix (system instruction plus any
    example turns). The data to process is appended as a final user turn on
    every call; the default model is gpt-3.5-turbo-16k.
    """

    def __init__(self, messages):
        self.messages = messages

    def add_data(self, data):
        """Return a copy of the stored messages with `data` added as a user turn."""
        # Slice-copy so the stored template is never mutated between calls.
        conversation = self.messages[:]
        user_turn = {"role": "user", "content": data}
        conversation.append(user_turn)
        return conversation

    def call_gpt(self, data, model="gpt-3.5-turbo-16k"):
        """Send the prompt followed by `data` to `model` and decode the reply.

        Returns the first choice's message content parsed with `json.loads`;
        a reply that is not valid JSON makes `json.loads` raise.
        """
        sampling_options = dict(
            temperature=0.1,
            max_tokens=11520,
            top_p=0.5,
            frequency_penalty=0,
            presence_penalty=0,
        )
        completion = openai.ChatCompletion.create(
            model=model,
            messages=self.add_data(data),
            **sampling_options,
        )
        reply_text = completion.choices[0]["message"]["content"]
        return json.loads(reply_text)

In [None]:
# |exporti
def to_csl(pd_series):
    """Build a comma-separated list of the distinct non-null values in a Series.

    Parameters
    ----------
    pd_series : pandas.Series
        Column whose values should be listed.

    Returns
    -------
    str
        The unique non-null values, in order of first appearance, joined with
        ",". An empty or all-null Series yields "".
    """
    distinct_values = pd_series.dropna().unique()
    # str.join only accepts strings; casting keeps numeric or mixed-type
    # columns from raising TypeError. String values pass through unchanged.
    return ",".join(map(str, distinct_values))

# OpenAI Solutions
> Solutions to a subset of the data modifications using OpenAI API :)

## CreditCardType
> Can only be Visa, Mastercard, AMEX, Discover<br>
<br>
- Solution: <br>
    - Asking Chat GPT to format the unique CreditCardTypes into the pre-approved list of values <br>

In [None]:
# |exports
# Few-shot chat prompt: a system instruction, one example user message, and
# the assistant reply expected for that example. The example reply lists only
# the items that need changing, each mapped to an accepted card type or to ""
# when nothing matches.
credit_card_prompt = [
    {
        "role": "system",
        "content": """
      You will be given a comma separated list of items. 
      Each item is supposed to be a unique credit card type taken from a column on a database table. 
      The only acceptable credit card types are [Visa, Mastercard, AMEX, Discover]
      Your job is to examine each item in the list to see if it matches one of the acceptable credit card types or not.
      For each item in the list that is not ALREADY in the list of acceptable credit card types you will need to provide which credit card type it matches with. 
      If a item doesn't match ANY of the acceptable credit card types [Visa, Mastercard, AMEX, Discover] then match it with an empty string ''
      Format your response in JSON
      """,
    },
    {
        "role": "user",
        "content": "Americn Ex,AMEX,Visa,Master car,Mastercard,Discover,Jazz",
    },
    {
        "role": "assistant",
        "content": """{
              "Americn Ex": "AMEX",
              "Master car": "Mastercard",
              "Jazz":""
          }
          """,
    },
]

In [None]:
# |exports
# Prompt wrapper that sends the credit-card few-shot messages ahead of the data.
CreditCardCleaner = GptPrompt(messages=credit_card_prompt)

In [None]:
# |exports
# Send only the distinct non-null card types to the model; the JSON reply is
# expected to be a {original value: corrected value} mapping.
values_to_replace = CreditCardCleaner.call_gpt(to_csl(gifts["CreditCardType"]))

In [None]:
# Inspect the mapping returned by the model before applying it.
values_to_replace

In [None]:
# |exports
# Apply the mapping in place on `gifts`; values that are not keys of the
# mapping are left unchanged by `replace`.
gifts["CreditCardType"] = gifts["CreditCardType"].replace(values_to_replace)

## GiftType
> Can only be Cash, Check, Credit, Other, or Reversing Transaction <br>
<br>
- Solution: <br>
    - Asking Chat GPT to format the unique GiftTypes into the pre-approved list of values <br>
    - Replace 'Reversing Transaction' using simple logic in Python <br>

In [None]:
# |exports
# Few-shot chat prompt for normalising payment methods: a system instruction,
# one example user message, and the assistant reply expected for it.
# The example reply must itself be valid JSON, because GptPrompt.call_gpt
# decodes the model's answer with json.loads and the model imitates this shape.
gift_type_prompt = [
    {
        "role": "system",
        "content": """
          You will be given a comma separated list of items.
          Each item is supposed to be a unique payment method taken from a column on a database table.
          The only acceptable payment methods are [Cash, Check, Credit, or Other]
          Your job is to examine each item in the list to see if it matches one of the acceptable payment methods or not. If it doesn't map to any of them, match it with Other
          For each item in the list that is not ALREADY in the list of acceptable payment method  you will need to provide which payment method  type it matches with
          Format your response in JSON
          """,
    },
    {"role": "user", "content": "$,cash,Credit,AMEX,Square"},
    {
        "role": "assistant",
        "content": """{
                  "$": "Cash",
                  "cash": "Cash",
                  "AMEX": "Credit",
                  "Square": "Other"
              }
              """,
    },
]

In [None]:
# |exports
# Prompt wrapper that sends the payment-method few-shot messages ahead of the data.
GiftTypeCleaner = GptPrompt(messages=gift_type_prompt)

In [None]:
# |exports
# Ask the model for a replacement mapping covering the distinct payment methods.
# Note: this rebinds `values_to_replace`, discarding the credit-card mapping.
values_to_replace = GiftTypeCleaner.call_gpt(to_csl(gifts["PaymentMethod"]))
# A blank payment method is treated as "Other".
values_to_replace[""] = "Other"

In [None]:
# Inspect the payment-method mapping before using it.
values_to_replace

In [None]:
# | exports
# Derive the gift type for every row: a negative AmountReceived is a
# "Reversing Transaction"; otherwise the payment method is translated through
# the model's mapping.
# The prompt asks the model to list only values that need changing, so values
# that are already valid (e.g. "Credit") are absent from the mapping. Fall back
# to the original value instead of indexing, which raised KeyError for them.
# NOTE(review): the resulting Series is only displayed, not stored on `gifts`.
gifts.apply(
    lambda row: (
        "Reversing Transaction"
        if row["AmountReceived"] < 0
        else values_to_replace.get(row["PaymentMethod"], row["PaymentMethod"])
    ),
    axis=1,
)

# Pandas GPT Solutions

## ContactType
> is required and can only be Household or Organization <br>
<br>
- Source Table: Contacts Table <br>
- Solution: <br>
    - Create procedure to add new column ContactType <br>

In [None]:
# |exports
# Work on a copy so the shared `contacts` frame is not modified by this step.
df = contacts.copy()
# `ask` is not a built-in DataFrame method; presumably it is provided by the
# pandas_gpt import and has an LLM carry out the request below — TODO confirm
# and review the returned frame before relying on it.
gpt_response = df.ask(
    "create a new column called ContactType. The value is required and can only be either Household or Organization. If CompanyName is '' assume it's a household"
)
# Preview only the columns relevant to the new ContactType value.
gpt_response[["Number", "CompanyName", "ContactType"]].head(5)

## Postal Code
> if address is present and is US, must be a valid zip code, either 12345 or 12345-1234 <br>
<br>
- Source Table: Contacts
- Solution:
    - Create procedure to remove any postal codes that don't match the approved format from the [USPS](https://pe.usps.com/archive/html/dmmarchive20030810/A010.htm)

In [None]:
# Show the Postal column of the source table before any cleaning.
contacts[["Postal"]]

In [None]:
# |exports
# Start again from a fresh copy of `contacts` so this step is independent of
# the previous request.
df = contacts.copy()
# The prompt asks for invalid zip codes to be blanked rather than for rows to
# be dropped; whether the result honours that should be checked on the output.
gpt_response = df.ask(
    "Clean the Postal Column. If address is present and is US, must be a valid zip code, either 12345 or 12345-1234. Don't delete rows with an invalid zip, just replace the invalid zip with ''"
)
# Show the Postal column of the returned frame.
gpt_response[["Postal"]]

## IsDeceased
> can only be TRUE or FALSE <br>
<br>
- Source Table: Contacts <br>
- Solution: <br>
    - Create procedure to update Deceased to TRUE/FALSE <br>

In [None]:
# Show the distinct raw Deceased values from the source table. The previous
# version read `gpt_response`, which at this point still holds the output of
# the Postal request rather than the untouched data.
contacts.Deceased.unique()

In [None]:
# |exports
# Start from a fresh copy of `contacts` so this step is independent of the
# earlier requests.
df = contacts.copy()
# The prompt previously consisted of two adjacent string literals, which
# concatenated to "empty strings  are False" and lost the '' token; spell the
# empty string as '' like the other prompts in this notebook do.
gpt_response = df.ask(
    "Can you convert the Deceased column to a boolean. Assume empty strings '' are False"
)
# Check which distinct values remain after the conversion.
gpt_response.Deceased.unique()

# Export

In [None]:
import nbdev

In [None]:
# Run nbdev's export for this notebook; presumably this writes the cells marked
# with export directives to the module named by `default_exp` — TODO confirm.
nbdev.nbdev_export("03_Gpt_Solution.ipynb")