In [None]:
#| default_exp solution_gpt

# GPT Solution

`Ask GPT to perform the data cleaning` <br>
<br>
- I think LLMs have the potential to be *incredibly* useful in the field of data cleaning / ETL <br>
- While some data transformations require thoughtful planning, many require only basic knowledge of language <br>
    - Example: You need to clean survey results and parse people's locations, but the data's messed up!! <br>
        - Some people put cities like 'NYC', 'Windy City', & 'san fran' <br> 
    - Formatting those names isn't difficult, but the task can often be long and frustrating <br>
    - I believe problems like this can be solved by AI *right now* and may help improve efficiency <br>
- I'm going to demonstrate the usefulness of LLMs in data cleaning through the use of 2 libraries: <br>
    1. [openai](https://platform.openai.com/docs/api-reference) <br>
    2. [pandas_gpt](https://github.com/rvanasa/pandas-gpt) <br>

## Configuring OpenAI Credentials

In [None]:
#| export
import pandas as pd
import openai
import pandas_gpt
import json

from dotenv import dotenv_values
from virtuous_interview.utils import contacts, contact_methods, gifts

In [None]:
#| export
# Configure the OpenAI client with the key returned by dotenv_values()
# (called without arguments, so presumably read from the project's .env file).
# The subscript raises KeyError when no OPENAI_API_KEY entry is present.
openai.api_key = dotenv_values()['OPENAI_API_KEY']

## GptPrompt
`Class for sending messages to OpenAI` <br>
<br>

Simple class to make reusing a prompt easier

In [None]:
#|export solution_gpt
class GptPrompt:
    """Reusable chat prompt: a fixed list of base messages plus one user message per call.

    The base messages (typically a system instruction and a worked example)
    are stored once; each call appends the data to clean as a final user
    message and sends the whole conversation to the chat completion endpoint.
    """

    def __init__(self, messages):
        """Store the base conversation.

        Args:
            messages: Iterable of chat message dicts with "role" and
                "content" keys.
        """
        # Copy the sequence so that later edits to the caller's list cannot
        # silently change the prompt this object sends.
        self.messages = list(messages)

    def add_data(self, data):
        """Return the base messages followed by `data` as a final user message.

        A new list is built on every call; the stored base messages are never
        modified.

        Args:
            data: Text placed in the "content" of the appended user message.

        Returns:
            A new list of message dicts.
        """
        return [*self.messages, {"role": "user", "content": data}]

    def call_gpt(self, data, model="gpt-3.5-turbo-16k-0613"):
        """Send the prompt plus `data` to the model and parse the reply as JSON.

        Args:
            data: Text to append as the final user message.
            model: Name of the chat model to query.

        Returns:
            The object decoded from the JSON text of the first choice.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
        """
        response = openai.ChatCompletion.create(
            model=model,
            messages=self.add_data(data),
            temperature=.1,
            max_tokens=11520,
            top_p=.5,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Only the first choice is used; its content is expected to be JSON.
        return json.loads(response.choices[0]['message']['content'])

### to_csl
`Formats pd series to a comma separated list of unique values`

In [None]:
#|export solution_gpt
def to_csl(pd_series):
    """Format the distinct non-null values of a pandas Series as a comma separated list.

    Values appear in the order returned by `Series.unique()` and are converted
    with `str`, so a non-string column (e.g. integers) does not make
    `str.join` raise a TypeError. Values that themselves contain a comma are
    not escaped.

    Args:
        pd_series: The pandas Series to summarize.

    Returns:
        A single string such as "Visa,AMEX"; an empty string when the series
        has no non-null values.
    """
    return ','.join(map(str, pd_series.dropna().unique()))

# OpenAI Solutions
`Solutions to a subset of the data modifications using OpenAI API :)`

## CreditCardType
`Can only be Visa, Mastercard, AMEX, Discover`<br>
<br>
- Solution: <br>
    - Asking Chat GPT to format the unique CreditCardTypes into the pre-approved list of values <br>

In [None]:
#|export solution_gpt
# Few-shot chat prompt for normalizing credit card types: system instructions
# followed by one worked example (a user input and the expected assistant reply).
# The text inside the triple-quoted strings is sent to the model verbatim.
credit_card_prompt =[ 
    {
      "role": "system",
      "content": """
      You will be given a comma separated list of items. 
      Each item is supposed to be a unique credit card type taken from a column on a database table. 
      The only acceptable credit card types are [Visa, Mastercard, AMEX, Discover]
      Your job is to examine each item in the list to see if it matches one of the acceptable credit card types or not.
      For each item in the list that is not ALREADY in the list of acceptable credit card types you will need to provide which credit card type it matches with. 
      If a item doesn't match ANY of the acceptable credit card types [Visa, Mastercard, AMEX, Discover] then match it with an empty string ''
      Format your response in JSON
      """
    },
    # Example input: a mix of valid, misspelled and unrelated values.
    {
      "role": "user",
      "content": "Americn Ex,AMEX,Visa,Master car,Mastercard,Discover,Jazz"
    },
    # Example reply: only the values that need changing, as a JSON object.
    {
      "role": "assistant",
      "content": """{
              "Americn Ex": "AMEX",
              "Master car": "Mastercard",
              "Jazz":""
          }
          """
    },
]

In [None]:
#|export solution_gpt
# Reusable prompt object built from the credit card few-shot messages.
CreditCardCleaner = GptPrompt(messages=credit_card_prompt)

In [None]:
#|export solution_gpt
# Ask the model for a {raw value: accepted type} mapping covering the distinct
# CreditCardType values found in `gifts`.
# NOTE(review): this cell carries an export directive, so the request is
# presumably also made whenever the generated module is imported — confirm.
values_to_replace = CreditCardCleaner.call_gpt(to_csl(gifts['CreditCardType']))

In [None]:
# Inspect the mapping returned for the credit card types.
values_to_replace

{'American Ex': 'AMEX', 'Master card': 'Mastercard', '': ''}

In [None]:
#|export solution_gpt
# Apply the mapping to the column; this overwrites CreditCardType on the
# `gifts` frame imported from virtuous_interview.utils.
gifts['CreditCardType'] = gifts['CreditCardType'].replace(values_to_replace)

## GiftType
`Can only be Cash, Check, Credit, Other, or Reversing Transaction` <br>
<br>
- Solution: <br>
    - Asking Chat GPT to format the unique GiftTypes into the pre-approved list of values <br>
    - Replace 'Reversing Transaction' using simple logic in Python <br>

In [None]:
#|export solution_gpt
# Few-shot chat prompt for normalizing payment methods: system instructions
# followed by one worked example (a user input and the expected assistant reply).
# The example reply must itself be valid JSON, because GptPrompt.call_gpt
# parses the model's answer with json.loads and the model imitates this example.
gift_type_prompt = [
        {
          "role": "system",
          "content": """
          You will be given a comma separated list of items. 
          Each item is supposed to be a unique payment method taken from a column on a database table. 
          The only acceptable payment methods are [Cash, Check, Credit, or Other]
          Your job is to examine each item in the list to see if it matches one of the acceptable payment methods or not. If it doesn't map to Cash, Check, or Credit then match it with Other
          For each item in the list that is not ALREADY in the list of acceptable payment method  you will need to provide which payment method  type it matches with
          Format your response in JSON
          """
        },
        # Example input: a mix of symbols, wrong casing and unrelated values.
        {
          "role": "user",
          "content": "$,cash,Credit,AMEX,Square"
        },
        # Example reply: only the values that need changing, as a JSON object.
        {
          "role": "assistant",
          "content": """{
                  "$": "Cash",
                  "cash": "Cash",
                  "AMEX": "Credit",
                  "Square": "Other"
              }
              """
        },
      ]

In [None]:
#|export solution_gpt
# Reusable prompt object built from the payment method few-shot messages.
GiftTypeCleaner = GptPrompt(messages=gift_type_prompt)

In [None]:
#|export solution_gpt
# Ask the model for a {raw value: accepted payment method} mapping covering the
# distinct PaymentMethod values found in `gifts`. This rebinds
# `values_to_replace`, so the credit card mapping is no longer available.
values_to_replace = GiftTypeCleaner.call_gpt(to_csl(gifts['PaymentMethod']))
# Empty-string payment methods are mapped to 'Other' explicitly instead of
# being left to the model.
values_to_replace[''] = 'Other'

In [None]:
# Inspect the mapping returned for the payment methods.
values_to_replace

{'PayPal': 'Other',
 'check': 'Check',
 'cash': 'Cash',
 'credit card': 'Credit',
 'money order': 'Other',
 '': 'Other'}

In [None]:
#| export solution_gpt
# Derive GiftType from PaymentMethod:
#   1. translate each value through the model's mapping;
#   2. keep values the mapping does not list (the prompt tells the model to
#      omit values that are already valid) instead of raising KeyError;
#   3. a negative AmountReceived always wins and marks a 'Reversing Transaction'.
# The result is stored on the frame so the computed column is not discarded.
gifts['GiftType'] = (
    gifts['PaymentMethod']
    .map(values_to_replace)
    .fillna(gifts['PaymentMethod'])
    .mask(gifts['AmountReceived'] < 0, 'Reversing Transaction')
)
gifts['GiftType']

0                     Other
1                     Check
2                      Cash
3                    Credit
4                      Cash
5                     Other
6                     Check
7                     Check
8                     Check
9                    Credit
10                   Credit
11                    Other
12                    Check
13    Reversing Transaction
14                    Other
15                    Other
16                     Cash
17                    Other
18                    Other
19                    Other
20                    Check
21                   Credit
22                     Cash
23                    Other
24                    Other
25                     Cash
26    Reversing Transaction
27                   Credit
28                     Cash
29                    Other
30                   Credit
dtype: object

# Pandas GPT Solutions

## ContactType
`is required and can only be Household or Organization` <br>
<br>
- Source Table: Contacts Table <br>
- Solution: <br>
    - Create procedure to add new column ContactType <br>

In [None]:
#|export solution_gpt
# Work on a copy of the contacts frame.
df = contacts.copy()
# Describe the ContactType rule in plain English and let pandas_gpt produce the
# result. NOTE(review): the answer comes from model-generated code, so it may
# differ between runs — verify the output before relying on it.
gpt_response = df.ask("create a new column called ContactType. The value is required and can only be either Household or Organization. If CompanyName is '' assume it's a household")
# Preview the columns relevant to the rule.
gpt_response[['Number', 'CompanyName', 'ContactType']].head(5)

    Number CompanyName FirstName  LastName   ContactType
0        1                  John       Doe     Household
1        2    ABC Inc.      Jane     Smith  Organization
2        3                  Mark   Johnson     Household
3        4   XYZ Corp.     Sarah     Brown  Organization
4        5                 David    Taylor     Household
5        6                 Emily  Anderson     Household
6        7               Michael    Wilson     Household
7        8                Hannah       Lee     Household
8        9                Andrew    Thomas     Household
9       10                Olivia     Clark     Household
10      11                Daniel    Wright     Household


Unnamed: 0,Number,CompanyName,ContactType
0,653377813-7,,Household
1,390551098-7,,Household
2,093004505-X,,Household
3,729707142-0,A Company Co.,Organization
4,488464926-5,,Household


## Postal Code
`if address is present and is US, must be a valid zip code, either 12345 or 12345-1234` <br>
<br>
- Source Table: Contacts
- Solution:
    - Create procedure to remove any postal codes that don't match the approved format from the [usps](https://pe.usps.com/archive/html/dmmarchive20030810/A010.htm)

In [None]:
# Show the raw Postal values before cleaning.
contacts[['Postal']]

Unnamed: 0,Postal
0,20535-871
1,89130
2,
3,8104
4,49560
5,837016
6,
7,30066
8,68164
9,


In [None]:
#|export solution_gpt
# Work on a copy of the contacts frame.
df = contacts.copy()
# Describe the postal code rule in plain English and let pandas_gpt produce the
# result. NOTE(review): the recorded run of this cell ended in a traceback
# raised from pandas_gpt's evaluation of the generated source, and the cell
# carries an export directive — confirm it succeeds before exporting.
gpt_response = df.ask("Clean the Postal Column. If address is present and is US, must be a valid zip code, either 12345 or 12345-1234. Don't delete rows with an invalid zip, just replace the invalid zip with ''")
# Show the cleaned column.
gpt_response[['Postal']]

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/virtuous/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/59/9vt5xw452yv5rsgx5hzqr70m0000gn/T/ipykernel_4492/972607772.py", line 3, in <module>
    gpt_response = df.ask("Clean the Postal Column. If address is present and is US, must be a valid zip code, either 12345 or 12345-1234. Don't delete rows with an invalid zip, just replace the invalid zip with ''")
  File "/opt/homebrew/Caskroom/miniforge/base/envs/virtuous/lib/python3.10/site-packages/pandas_gpt/__init__.py", line 123, in __call__
    return ask(goal, data, *args)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/virtuous/lib/python3.10/site-packages/pandas_gpt/__init__.py", line 97, in __call__
    return self._eval(source, *args)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/virtuous/lib/python3.10/site-packages/pandas_gp

## IsDeceased
`can only be TRUE or FALSE` <br>
<br>
- Source Table: Contacts <br>
- Solution: <br>
    - Create procedure to update Deceased to TRUE/FALSE <br>

In [None]:
# Show the raw Deceased values before conversion. Read them from `contacts`
# directly instead of from `gpt_response`, which at this point would be
# whatever an earlier, unrelated cell happened to leave behind.
contacts.Deceased.unique()

array(['', 'No', 'Yes'], dtype=object)

In [None]:
#|export solution_gpt
# Work on a copy of the contacts frame.
df = contacts.copy()
# The instruction is double-quoted so that the '' inside it reaches the model.
# In a single-quoted literal the two quotes end one string and start another,
# and implicit concatenation drops them from the prompt.
gpt_response = df.ask("Can you convert the Deceased column to a boolean. Assume empty strings '' are False")
# Confirm that only boolean values remain.
gpt_response.Deceased.unique()

array([False,  True])

# Export

In [None]:
#| hide
import nbdev

In [None]:
#| hide
# Run nbdev's export for this notebook (presumably writes the cells marked
# with an export directive to the module named by `default_exp`).
nbdev.nbdev_export('04_Gpt_Solution.ipynb')