# Python Solution
> Performing the data cleaning / ETL using Pandas <br>
<br>

- I'm going to demonstrate how pandas can be used to finish the rest of the ETL process

In [1]:
#| default_exp solution_pd

In [2]:
#| export
import pandas as pd
import numpy as np
import re

from virtuous_interview.utils import contacts, contact_methods, gifts

# Solution
> Pandas :)

## Private
> Does someone want to be private <br>
<br>
- Source Table: Contacts Table
- Solution:
    - Create procedure to add new column Private

Looking for notes

In [3]:
contacts.Remarks[~contacts.Remarks.isin([''])]

0                                 Is anonymous
6    Met in person on 5/9/2018 at Annual Event
8                      Electronic receipt only
Name: Remarks, dtype: object

In [4]:
# A contact is private only when the remark is exactly 'Is anonymous'.
contacts['Private'] = contacts.Remarks.map(lambda remark: remark == 'Is anonymous')

## ContactType
> is required and can only be Household or Organization <br>
<br>
- Source Table: Contacts Table
- Solution:
    - Create procedure to add new column ContactType

In [5]:
#| hide
contacts[['Number', 'CompanyName']]

Unnamed: 0,Number,CompanyName
0,653377813-7,
1,390551098-7,
2,093004505-X,
3,729707142-0,A Company Co.
4,488464926-5,
5,315297729-8,
6,848348568-0,
7,029456846-8,
8,687119652-8,
9,739131380-7,


In [6]:
#|export solution_pd
# A blank CompanyName marks a household; any other value marks an organization.
contacts['ContactType'] = contacts['CompanyName'].map(
    lambda company: 'Organization' if company != '' else 'Household'
)

In [7]:
#| hide
contacts[['Number', 'CompanyName', 'ContactType']].head(3)

Unnamed: 0,Number,CompanyName,ContactType
0,653377813-7,,Household
1,390551098-7,,Household
2,093004505-X,,Household


## Postal Code
> if address is present and is US, must be a valid zip code, either 12345 or 12345-1234 <br>
<br>
- Source Table: Contacts <br>
- Solution: <br>
    - Create a procedure to remove any postal codes that don't match the approved format from the [USPS](https://pe.usps.com/archive/html/dmmarchive20030810/A010.htm)

In [8]:
#| hide
contacts['Postal'].head(3)

0    20535-871
1        89130
2             
Name: Postal, dtype: object

In [9]:
#|export solution_pd
# US ZIP code: five digits, optionally followed by a hyphen and four more digits (ZIP+4).
postal_code_pattern = r'[0-9]{5}(?:-[0-9]{4})?$'

In [10]:
#|export solution_pd
# Keep only well-formed ZIP codes and blank out everything else.
# `re.fullmatch` is used instead of `re.match`: the pattern's trailing `$`
# also matches just before a final newline, so `re.match` would keep a value
# such as '12345\n'. Non-string values (e.g. NaN) are blanked rather than
# raising a TypeError inside the regex call.
contacts['Postal'] = contacts.Postal.apply(
    lambda x: x if isinstance(x, str) and re.fullmatch(postal_code_pattern, x) else ''
)

In [11]:
#| hide
contacts['Postal'].head(3)

0         
1    89130
2         
Name: Postal, dtype: object

## IsDeceased
> can only be TRUE or FALSE <br>
<br>
- Source Table: Contacts <br>
- Solution: <br>
    - Create procedure to update Deceased to TRUE/FALSE

In [12]:
#| hide
contacts.Deceased.head(3)

0    
1    
2    
Name: Deceased, dtype: object

In [13]:
#|export solution_pd
# Convert the raw 'Yes' flag to a boolean. Values that are already True are
# kept, so re-running this cell does not silently reset every contact to False
# (the column is overwritten in place, so a second run sees booleans, not 'Yes').
contacts['Deceased'] = contacts.Deceased.isin(['Yes', True])

In [14]:
#| hide
contacts.Deceased.head(3)

0    False
1    False
2    False
Name: Deceased, dtype: bool

## GiftType
> Can only be Cash, Check, Credit, Other, or Reversing Transaction <br>
<br>
- Source Table: Gifts <br>
- Solution: <br>
    - Identify Incorrect Gift Types <br>
    - Create procedure to replace invalid gift types <br>

In [15]:
#| hide
gifts[['AmountReceived', 'PaymentMethod']].tail(10)

Unnamed: 0,AmountReceived,PaymentMethod
21,4.21,credit card
22,9.28,cash
23,2.74,PayPal
24,9.0,money order
25,1.88,cash
26,-6.76,check
27,7.58,credit card
28,5.49,cash
29,8.93,money order
30,2.62,credit card


In [16]:
#|export solution_pd
def clean_payment_type(row):
    """Map one gift row to an allowed gift type.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'AmountReceived' and 'PaymentMethod'.

    Returns
    -------
    str
        One of 'Reversing Transaction', 'Credit', 'Check', 'Cash' or 'Other'.
    """
    # Normalise case and surrounding whitespace so that values such as
    # ' Cash ' are still recognised instead of falling through to 'Other'.
    original_payment_method = str(row['PaymentMethod']).strip().lower()

    # A negative amount is always a reversal, whatever the payment method says.
    if row['AmountReceived'] < 0:
        return 'Reversing Transaction'
    # Anything that starts with 'credit' (e.g. 'credit card') is a credit gift.
    if original_payment_method.startswith('credit'):
        return 'Credit'
    if original_payment_method in ('check', 'cash', 'reversing transaction'):
        return original_payment_method.title()
    return 'Other'

In [17]:
#|export solution_pd
# Row-wise (axis=1) because clean_payment_type reads both the 'AmountReceived'
# and the 'PaymentMethod' value of each gift.
gifts['PaymentMethod'] = gifts.apply(clean_payment_type, axis=1)

In [18]:
#| hide
gifts.PaymentMethod.tail(10)

21                   Credit
22                     Cash
23                    Other
24                    Other
25                     Cash
26    Reversing Transaction
27                   Credit
28                     Cash
29                    Other
30                   Credit
Name: PaymentMethod, dtype: object

## CreditCardType
> Can only be Visa, Mastercard, AMEX, Discover <br>
<br>
- Solution: <br>
    - Identify Incorrect Credit Types <br>
    - Create procedure to replace invalid credit types

In [19]:
#| hide
gifts.CreditCardType.unique()

array(['', 'American Ex', 'AMEX', 'Visa', 'Master card', 'Mastercard',
       'Discover'], dtype=object)

Creating functions to validate each of the credit card types

In [20]:
#| export
def validate_mastercard(string):
    """Return True when `string` contains 'mastercard' or 'master card', ignoring case."""
    found = re.search(r'master\s?card', string, flags=re.IGNORECASE)
    return found is not None

def validate_amex(string):
    """Return True when `string` contains 'amex', 'american express' or 'american ex', ignoring case."""
    found = re.search(r'amex|american\s?express|american\s?ex', string, flags=re.IGNORECASE)
    return found is not None

def validate_visa(string):
    """Return True when `string` contains 'visa', ignoring case."""
    found = re.search(r'visa', string, flags=re.IGNORECASE)
    return found is not None

def validate_discover(string):
    """Return True when `string` contains 'discover', ignoring case."""
    found = re.search(r'discover', string, flags=re.IGNORECASE)
    return found is not None

def validate_credit_card(string):
    """Normalise a free-text card type to 'Mastercard', 'AMEX', 'Visa' or 'Discover'.

    The checks run in that order and the first one that matches wins; an
    empty string is returned when none of them match.
    """
    checks = (
        (validate_mastercard, 'Mastercard'),
        (validate_amex, 'AMEX'),
        (validate_visa, 'Visa'),
        (validate_discover, 'Discover'),
    )
    for matches, canonical_name in checks:
        if matches(string):
            return canonical_name
    return ''

In [21]:
#| export
# Replace every raw card type with its canonical name; validate_credit_card
# returns '' for values it does not recognise.
gifts['CreditCardType'] = gifts.CreditCardType.apply(validate_credit_card)

# Creating Final Datasets

## Contacts
> Table of constituent contact information

Checking for any missing contacts

In [22]:
# Columns of the final contacts dataset, in output order.
columns = [
    # identifiers and primary individual
    'LegacyContactId',
    'LegacyIndividualId',
    'ContactName',
    'FirstName',
    'LastName',
    # secondary individual
    'SecondaryLegacyIndividualId',
    'SecondaryFirstName',
    'SecondaryLastName',
    # contact details and address
    'HomePhone',
    'HomeEmail',
    'Address1',
    'City',
    'State',
    'PostalCode',
    # flags and classification
    'IsPrivate',
    'IsDeceased',
    'ContactType',
]

In [23]:
#|export solution_pd
# Exported on purpose: later exported cells select the renamed columns
# ('LegacyContactId', 'HomePhone', 'HomeEmail'). Without this rename in the
# generated module those selections would raise a KeyError.
# Keys that are missing from the frame are ignored by `rename`.
contacts.rename(columns={
    'Number':'LegacyContactId',
    'Street': 'Address1',
    'Postal': 'PostalCode',
    'Phone': 'HomePhone',
    'EMail': 'HomeEmail',
    'Private': 'IsPrivate',
    'Deceased': 'IsDeceased',
}, inplace=True)

In [24]:
final_contacts = contacts.loc[:, columns]

In [25]:
final_contacts

Unnamed: 0,LegacyContactId,LegacyIndividualId,ContactName,FirstName,LastName,SecondaryLegacyIndividualId,SecondaryFirstName,SecondaryLastName,HomePhone,HomeEmail,Address1,City,State,PostalCode,IsPrivate,IsDeceased,ContactType
0,653377813-7,0,Karita & Kelvin Lumbers,Karita,Lumbers,1.0,Kelvin,Lumbers,,kklumbers@ yahoo.co,4 Bunting Parkway,Washington,DC,,True,False,Household
1,390551098-7,2,Helga Benech,Helga,Benech,,,,,ebenech1@goodreads.com,48684 Jenifer Way,Las Vegas,NV,89130.0,False,False,Household
2,093004505-X,3,Masha Butt Gow,Masha,Butt Gow,,,,577-374-96523,,353 Schmedeman Park,Indianapolis,IN,,False,False,Household
3,729707142-0,4,Cymbre Cross,Cymbre,Cross,,,,,,2055 Lakewood Parkway,Camden,NJ,,False,False,Organization
4,488464926-5,5,Hoyt Castille,Hoyt,Castille,,,,,fcastille4@timesonline.co.uk,37 8th Trail,Grand Rapids,MI,49560.0,False,False,Household
5,315297729-8,6,Benedict Oscar & Idell Mouncey,Benedict,Oscar,7.0,Idell,Mouncey,,,4225 Madison Ave,Boise,ID,,False,False,Household
6,848348568-0,8,Mannie Turpin,Mannie,Turpin,,,,702-844-9524,,,,NV,,False,True,Household
7,029456846-8,9,Romy Doley,Romy,Doley,,,,,jdoley6@telegraph.co.uk,608 Old Shore Alley,Marietta,GA,30066.0,False,False,Household
8,687119652-8,10,Ruggiero Makepeace,Ruggiero,Makepeace,,,,,cmakepeace7@1688.com,15 Sunbrook Center,Omaha,NE,68164.0,False,False,Household
9,739131380-7,11,Rosemaria & Rogelio Dimond,Rosemaria,Dimond,12.0,Rogelio,Dimond,,,,Juneau,AK,,False,False,Household


In [26]:
final_contacts.to_csv('data/final_contacts.csv', index=False)

## Contact Methods
> Table of constituent contact methods (phone, email, fax), one row per contact method

Merging Contact and Contact Methods Tables To get ALL the contact information

In [27]:
#|export solution_pd
# Independent copy of the identifier plus the phone/email held on the contact itself.
temp_contacts = contacts.loc[:, ['LegacyContactId', 'HomePhone', 'HomeEmail']].copy()

In [28]:
#|export solution_pd
# Align the key column name with the contacts table so the two can be merged on it.
temp_contact_methods = contact_methods.rename(
    {'DonorNumber': 'LegacyContactId'},
    axis='columns',
)

In [29]:
#|export solution_pd
# Left join keeps every contact, including those without a contact-method row;
# the result is ordered by contact id.
contacts_v = (
    temp_contacts
    .merge(temp_contact_methods, on='LegacyContactId', how='left')
    .sort_values(by='LegacyContactId')
)

Creating a function to get the valid Email and Phone for each row <br>
<br>
> Assuming that the phone / email from the contact_methods table takes precedence over the one in the contacts table

In [30]:
#|export solution_pd
def add_valid_contact(row):
    """Pick the phone and e-mail to keep for one merged contact row.

    The contact-methods values ('Phone', 'EMail') take precedence; the
    contacts-table values ('HomePhone', 'HomeEmail') are used as a fallback
    when the preferred value is missing.

    Parameters
    ----------
    row : mapping
        Must provide 'Phone', 'HomePhone', 'EMail' and 'HomeEmail'.

    Returns
    -------
    pandas.Series
        Two elements: the chosen phone, then the chosen e-mail.
    """
    def _first_present(preferred, fallback):
        # A left merge leaves NaN (not '') for contacts without a matching
        # contact-method row, so NaN/None must count as missing too.
        # The NaN check comes first because comparing pd.NA with '' is ambiguous.
        if pd.isna(preferred) or preferred == '':
            return fallback
        return preferred

    phone = _first_present(row['Phone'], row['HomePhone'])
    email = _first_present(row['EMail'], row['HomeEmail'])
    return pd.Series([phone, email])

In [31]:
#|export solution_pd
# add_valid_contact returns a two-element Series (phone, email) per row; the
# resulting two columns overwrite HomePhone and HomeEmail.
contacts_v[['HomePhone', 'HomeEmail']] = contacts_v.apply(add_valid_contact, axis=1)

In [32]:
#|export solution_pd
# Drop the merge helper columns; keep the id and the three contact-method columns.
contacts_v = contacts_v.loc[:, ['LegacyContactId', 'HomePhone', 'HomeEmail', 'Fax']].copy()

In [33]:
#| hide
# Inspection only: this preview must not be exported into the library module.
contacts_v.head(3)

Unnamed: 0,LegacyContactId,HomePhone,HomeEmail,Fax
8,029456846-8,,jdoley6@telegraph.co.uk,
2,093004505-X,818-323-9865,,818-156-7985
5,315297729-8,,,


Unpivoting (melting) the data, converting the HomePhone, HomeEmail, and Fax columns into individual Type : Value rows

In [34]:
#| export
# Wide -> long: one row per (contact, method type) with the value alongside.
contacts_v = contacts_v.melt(
    id_vars=['LegacyContactId'],
    value_vars=['HomePhone', 'HomeEmail', 'Fax'],
    var_name='Type',
    value_name='Value',
)


In [35]:
contacts_v.head(3)

Unnamed: 0,LegacyContactId,Type,Value
0,029456846-8,HomePhone,
1,093004505-X,HomePhone,818-323-9865
2,315297729-8,HomePhone,


Creating a function to safely check for NaN values

In [36]:
def is_nan(x):
    """Return whether `x` is NaN, treating values `np.isnan` cannot handle as not-NaN.

    `np.isnan` raises TypeError for inputs such as strings or None; those
    are reported as False instead of propagating the error.
    """
    try:
        result = np.isnan(x)
    except TypeError:
        result = False
    return result

In [37]:
# Rows whose value is blank or NaN carry no contact information and are dropped.
value_is_missing = (contacts_v.Value == '') | contacts_v.Value.apply(is_nan)
final_contact_methods = contacts_v.loc[~value_is_missing].reset_index(drop=True).copy()

In [38]:
final_contact_methods

Unnamed: 0,LegacyContactId,Type,Value
0,093004505-X,HomePhone,818-323-9865
1,653377813-7,HomePhone,832-442-4988
2,848348568-0,HomePhone,702-844-9524
3,029456846-8,HomeEmail,jdoley6@telegraph.co.uk
4,315297729-8,HomeEmail,dmouncey9@cnn.com
5,390551098-7,HomeEmail,ebenech1@goodreads.com
6,488464926-5,HomeEmail,fcastille4@timesonline.co.uk
7,653377813-7,HomeEmail,kklumbers@ yahoo.co
8,687119652-8,HomeEmail,cmakepeace7@1688.com
9,093004505-X,Fax,818-156-7985


In [39]:
final_contact_methods.to_csv('data/final_contact_methods.csv', index=False)

## Gifts
> Table with gift history

In [40]:
# Columns of the final gifts dataset, in output order.
columns = [
    'LegacyContactId',
    'LegacyGiftId',
    'GiftType',
    'GiftDate',
    'GiftAmount',
    'Notes',
    'CreditCardType',
    'Project1Code',
    'Project2Code',
    'LegacyPledgeID',
]

In [41]:
# Rename the source columns in place to the names used by the final gifts
# column list (LegacyContactId, LegacyGiftId, GiftType, GiftAmount).
gifts.rename(columns={
    'DonorNumber': 'LegacyContactId',
    'GiftId': 'LegacyGiftId',
    'PaymentMethod': 'GiftType',
    'AmountReceived': 'GiftAmount',
}, inplace=True)

In [42]:
gifts[columns]

Unnamed: 0,LegacyContactId,LegacyGiftId,GiftType,GiftDate,GiftAmount,Notes,CreditCardType,Project1Code,Project2Code,LegacyPledgeID
0,848348568-0,95196378,Other,2019-03-04,4.15,,,,,0
1,729707142-0,95196889,Check,2019-03-05,2.3648,,,ChildSponsorship,,1
2,687119652-8,95197689,Cash,2019-03-07,1.31,,,,,2
3,653377813-7,95198998,Credit,2019-03-10,2.04,In honor of Mannie Turpin,AMEX,,,3
4,390551098-7,95198999,Cash,2019-01-10,5.8,,,,,89752384
5,848348568-0,95296677,Other,2019-03-20,9.28,,,General,ReliefFund,5
6,029456846-8,95298831,Check,2019-03-24,5.0,ACH check #7687,,,,6
7,093004505-X,95298845,Check,2019-04-09,4.83,,,,,7
8,315297729-8,95298997,Check,2019-04-12,7.0,,,SchoolSupplies2019,,8
9,809975531-Y,0,Credit,2019-08-14,8.48,,AMEX,,,9


In [43]:
final_gifts = gifts[columns].copy()

In [44]:
final_gifts.shape

(31, 10)

In [45]:
final_gifts.to_csv('data/final_gifts.csv', index=False)

# Export

In [46]:
#| hide
import nbdev

In [47]:
#| hide
nbdev.nbdev_export('02_Pandas_Solution.ipynb')