# Setup
> Monday August 14th, 2023

In [None]:
#| default_exp utils

# Previewing The Initial Datasets
> Loading Datasets using Pandas

In [None]:
#| export
from nbdev.showdoc import *
import pandas as pd
import re

In [None]:
#| export
# Read the three raw CSV files. Missing cells are replaced with empty strings,
# so blanks can be tested with `== ''` throughout the rest of the notebook.
contact_methods = pd.read_csv('data/contact_methods.csv').fillna('')
contacts = pd.read_csv('data/contacts.csv').fillna('')
gifts = pd.read_csv('data/gifts.csv').fillna('')

In [None]:
contacts.head()

Unnamed: 0,Number,Company Name,First Name,Last Name,Street,City,State,Postal,Phone,E-mail,Remarks,Deceased?
0,653377813-7,,Karita & Kelvin,Lumbers,4 Bunting Parkway,Washington,DC,20535-871,kklumbers@ yahoo.co,,Is anonymous,
1,390551098-7,,Helga,Benech,48684 Jenifer Way,Las Vegas,NV,89130,,ebenech1@goodreads.com,,
2,093004505-X,,Masha,,353 Schmedeman Park,Indianapolis,IN,,,577-374-96523,,
3,729707142-0,A Company Co.,,,2055 Lakewood Parkway,Camden,NJ,8104,,,,No
4,488464926-5,,Hoyt,Castille,37 8th Trail,Grand Rapids,MI,49560,,fcastille4@timesonline.co.uk,,No


In [None]:
gifts.head()

Unnamed: 0,donor_number,gift_id,first_name,last_name,amount received,date,fund_id,credit card type,payment method,pledge_number,notes
0,848348568-0,95196378.0,Mannie,Turpin,$4.15,3/4/2019,,,PayPal,,
1,729707142-0,95196889.0,Cymbre,Cross,2.3648,3/5/2019,ChildSponsorship,,check,,
2,687119652-8,95197689.0,Ruggiero,Makepeace,$1.31,3/7/2019,,,cash,,
3,653377813-7,95198998.0,Karita,Lumbers,$2.04,3/10/2019,,American Ex,credit card,,In honor of Mannie Turpin
4,390551098-7,95198999.0,Helga,Benech,$5.80,2019/1/10,,,cash,89752384.0,


In [None]:
contact_methods.head()

Unnamed: 0,donor_number,Phone,E-mail,Fax
0,653377813-7,832-442-4988,,
1,390551098-7,,ebenech1@goodreads.com,
2,093004505-X,818-323-9865,,818-156-7985
3,729707142-0,,,
4,488464926-5,,fcastille4@timesonline.co.uk,


# Initial Transformations
> Performing certain transformations on load in order to reduce redundancy ... and some work for me :)

## Column Name Transformation
<br>

- Nothing worse than malformed column names, amiright? <br>
<br>

### Creating a Func to Convert String to CamelCase

In [None]:
#| exports
def to_camel_case(s):
    """Turn an arbitrary label into a capitalised, separator-free identifier.

    Each run of ASCII letters/digits in ``s`` counts as one word; every other
    character acts as a separator and is discarded. Words are passed through
    ``str.capitalize`` (first character upper-cased, remainder lower-cased)
    and concatenated, e.g. ``'amount received'`` -> ``'AmountReceived'``.
    """
    # Pull the alphanumeric words out directly instead of blanking the separators first.
    tokens = re.findall(r'[a-zA-Z0-9]+', s)

    pieces = []
    for token in tokens:
        pieces.append(token.capitalize())
    return ''.join(pieces)

### Creating a separate function to apply the string transformation to pandas columns

In [None]:
#| exports
def transform_cnames(df, func=to_camel_case):
    """Rename the columns of ``df`` in place by applying ``func`` to each name.

    Parameters
    ----------
    df : the DataFrame whose ``columns`` attribute is overwritten.
    func : callable applied to every column label (defaults to ``to_camel_case``).

    Returns
    -------
    None -- the frame is modified in place.
    """
    renamed_columns = df.columns.map(func)
    df.columns = renamed_columns

### Applying Transformation 

In [None]:
#| exports
# Rename the columns of all three frames in place.
# NOTE: the loop variable `df` stays bound to the last frame (`gifts`) once the loop ends.
for df in [contact_methods, contacts, gifts]:
    transform_cnames(df)

In [None]:
#| hide
contacts.columns

Index(['Number', 'CompanyName', 'FirstName', 'LastName', 'Street', 'City',
       'State', 'Postal', 'Phone', 'EMail', 'Remarks', 'Deceased'],
      dtype='object')

## Column Type Transformation
<br><br>

I identified 2 columns on the gift table that should be ints, replacing those values

In [None]:
#| exports
# Columns of the gifts table that are converted to integers below.
int_cols = ['GiftId', 'PledgeNumber']

In [None]:
#| exports
# Blank strings (produced by fillna('') when the CSV was loaded) are swapped
# for 0 so that the columns can be cast to int.
gifts[int_cols] = gifts[int_cols].replace({'':0}).astype(int)

Looks like AmountReceived should really be a float, so I'm removing any special characters (besides dashes and periods) and converting it to a float :)

In [None]:
#| exports
# Strip every character except ASCII letters, digits, '.' and '-' (this removes
# the '$' prefix), then convert what is left to float.
# NOTE(review): letters are kept by the pattern, so a value such as 'USD 4.15',
# or a blank amount, would make float() raise ValueError.
gifts['AmountReceived'] = gifts.AmountReceived.apply(lambda x: float(re.sub(r'[^a-zA-Z0-9\.-]', '', x)))

Previewing change

In [None]:
gifts.AmountReceived.head(5)

0    4.1500
1    2.3648
2    1.3100
3    2.0400
4    5.8000
Name: AmountReceived, dtype: float64

Replacing 0 (formerly null) values with a unique id

In [None]:
#| exports
# Rows whose id is still the 0 placeholder receive their own row index label as id.
# NOTE(review): index labels are small integers; confirm they cannot collide
# with genuine pledge or gift ids.
gifts.loc[ gifts.PledgeNumber == 0, 'PledgeNumber'] = gifts[gifts.PledgeNumber == 0].index
gifts.loc[ gifts.GiftId == 0, 'GiftId'] = gifts[gifts.GiftId == 0].index

In [None]:
#| exports
# Mark both id columns as legacy identifiers; `rename` returns a new frame that replaces `gifts`.
gifts = gifts.rename(columns={'PledgeNumber': 'LegacyPledgeID', 'GiftId': 'LegacyGiftId'})

## Misplaced Data Transformation
> Identify and clean any pieces of data in the wrong column

In [None]:
contacts[['Phone', 'EMail']].head(3)

Unnamed: 0,Phone,EMail
0,kklumbers@ yahoo.co,
1,,ebenech1@goodreads.com
2,,577-374-96523


Creating a function that identifies a string as an email (it contains `@`) or a phone number (it matches a regular expression)

In [None]:
#| exports
def classify_phone_email(value):
    """Guess whether ``value`` holds an e-mail address or a phone number.

    Returns ``"email"`` when the string contains ``@``; otherwise ``"phone"``
    when a ``ddd-ddd-dddd`` digit group occurs anywhere in it; otherwise
    ``None``. The ``@`` test takes precedence over the phone test.
    """
    phone_shape = re.compile(r'\d{3}-\d{3}-\d{4}')

    if "@" not in value:
        # The search is unanchored, so extra digits around the group still count as a phone.
        return "phone" if phone_shape.search(value) else None
    return "email"

### Applying Phone/Email Transformation

Looping over phone & email records.<br>
<br>
If a phone and/or email are identified in the wrong column, they will be swapped

In [None]:
#| exports
# Move values that landed in the wrong contact column.
# Both columns are classified from the row's ORIGINAL values before anything is
# written. A row whose phone and e-mail are swapped with each other therefore
# keeps both values; handling the two cases independently would blank the
# e-mail that had just been moved.
for index, row in contacts.iterrows():
    phone_classification = classify_phone_email(row['Phone'])
    email_classification = classify_phone_email(row['EMail'])

    phone_holds_email = phone_classification == "email"
    email_holds_phone = email_classification == "phone"

    if phone_holds_email and email_holds_phone:
        # Genuine swap: exchange the two values.
        contacts.at[index, 'Phone'] = row['EMail']
        contacts.at[index, 'EMail'] = row['Phone']
    elif phone_holds_email:
        # Only the phone column is wrong: move it over and clear the phone.
        contacts.at[index, 'EMail'] = row['Phone']
        contacts.at[index, 'Phone'] = ''
    elif email_holds_phone:
        # Only the e-mail column is wrong: move it over and clear the e-mail.
        contacts.at[index, 'Phone'] = row['EMail']
        contacts.at[index, 'EMail'] = ''

In [None]:
contacts[['Phone', 'EMail']].head(3)

Unnamed: 0,Phone,EMail
0,,kklumbers@ yahoo.co
1,,ebenech1@goodreads.com
2,577-374-96523,


## Consolidate Contacts Table
- Searching for missing users <br>
- Splitting / joining households <br>


### Checking for Missing records

Checking for any DonorNumbers that are NOT IN contacts Number...

In [None]:
(~gifts.DonorNumber.isin(contacts.Number.unique())).any()

True

<br>Found 1! <br>
<br>
Doing the same for contact_methods

In [None]:
(~contact_methods.DonorNumber.isin(contacts.Number.unique())).any()

False

<br>Didn't find any there <br><br>
Going to extract the missing records from the gifts table

In [None]:
#| exports
# Gift rows whose DonorNumber does not appear in contacts.Number.
donors_not_in_contacts = gifts.loc[~gifts.DonorNumber.isin(contacts.Number.unique()), :]

In [None]:
donors_not_in_contacts

Unnamed: 0,DonorNumber,LegacyGiftId,FirstName,LastName,AmountReceived,Date,FundId,CreditCardType,PaymentMethod,LegacyPledgeID,Notes
9,809975531-Y,9,Adeline,Shakespeare,8.48,8/14/2019,,AMEX,credit card,9,
27,809975531-Y,27,Adeline,Shakespeare,7.58,8/14/2019,"Color run, ChildSponsorship",Mastercard,credit card,27,


<br>Adding Shakespeare to our contacts!

In [None]:
#| exports
# One row per missing donor, using the contacts column name for the key.
missing_donor_contacts = (
    donors_not_in_contacts[['DonorNumber', 'FirstName', 'LastName']]
    .rename(columns={'DonorNumber': 'Number'})
    .drop_duplicates()
)

# Append the missing donors. `ignore_index=True` renumbers the rows so that the
# appended rows do not repeat index labels that contacts already uses.
# Columns the gifts table does not provide (address, phone, ...) are NaN on the
# appended rows.
contacts = pd.concat([contacts, missing_donor_contacts], ignore_index=True)

### Splitting rows with Multiple People

In [None]:
contacts.FirstName.head(1)

0    Karita & Kelvin
Name: FirstName, dtype: object

<br>Split the names on ' & ' or ' and ', then expand the resulting pieces into separate columns (primary and secondary first name)

In [None]:
#| exports
# Split "A & B" / "A and B" first names into one column per person.
split_first_names = contacts['FirstName'].str.split(' & | and ', expand=True)
# `expand=True` yields only as many columns as the longest split. Pad to at
# least two so the assignment below also works when no name holds a second
# person. A name with three or more people still raises on assignment rather
# than being silently truncated.
split_first_names = split_first_names.reindex(
    columns=range(max(2, split_first_names.shape[1]))
).fillna('')
contacts[['FirstName', 'SecondaryFirstName']] = split_first_names

Additionally, going to look for any records where there's a duplicated `Number` <br>
<br>
I'm assuming `Number` is a primary key for a household or organization

In [None]:
#| exports
# Every row after the first occurrence of a Number, as a list of dicts (one per row).
records_to_join = contacts.loc[contacts.Number.duplicated(), :].to_dict(orient='records')

 Removing duplicates from contacts before joining

In [None]:
#| exports
# Keep only the first row for each Number. `.copy()` gives `contacts` its own
# data, so later in-place assignments do not trigger SettingWithCopyWarning.
contacts = contacts.loc[~contacts.Number.duplicated(), :].copy()

Adding duplicates as secondary contacts

In [None]:
#| exports
# Make sure the target column exists even when there are no duplicates to
# merge; otherwise it would only be created as a side effect of the loop body.
if 'SecondaryLastName' not in contacts.columns:
    contacts['SecondaryLastName'] = ''

# Attach each duplicate row's name to the surviving row that shares its Number.
# NOTE(review): this overwrites any SecondaryFirstName already produced by the
# "A & B" split, and with several duplicates for one Number the last one wins.
for record in records_to_join:
    contacts.loc[contacts.Number.isin([record['Number']]), ['SecondaryFirstName', 'SecondaryLastName']] = [record['FirstName'], record['LastName']]

In [None]:
#| export
# Rows that received no secondary last name hold NaN; normalise them to ''.
contacts['SecondaryLastName'] = contacts.SecondaryLastName.fillna('')

Adding Secondary Last Name for the appropriate users

In [None]:
#| exports
# A secondary person with a first name but no last name takes the primary
# person's last name; every other row keeps its current value.
contacts['SecondaryLastName'] = contacts.apply(lambda x: x['LastName'] if x['SecondaryLastName'] == '' and x['SecondaryFirstName'] != '' else x['SecondaryLastName'], axis=1)

Initializing new columns

In [None]:
#| export
# Create both id columns, filled with None, before any ids are assigned.
contacts[['LegacyIndividualId', 'SecondaryLegacyIndividualId']] = None

In [None]:
#| export
# Renumber the rows 0..n-1 in place; the old index labels are discarded (drop=True).
contacts.reset_index(inplace=True, drop=True)

Adding id since none was provided

In [None]:
#| exports
# Hand out sequential ids: one per row for the primary person, plus one more
# when the row also has a secondary person (non-blank SecondaryFirstName).
# NOTE(review): the counter is named `id`, which shadows Python's builtin
# `id()` at module level.
id = 0
for index, row in contacts.iterrows():
    contacts.loc[index, 'LegacyIndividualId'] = id
    id += 1
    if row['SecondaryFirstName'] != '':
        contacts.loc[index, 'SecondaryLegacyIndividualId'] = id
        id += 1


In [None]:
#| export
# Replace every remaining missing value in contacts with an empty string, in place.
contacts.fillna('', inplace=True)

## Cleaning Names
>Cleaning Records with blank first or last names <br>

Checking for records where FirstName  and/or LastName is blank


In [None]:
#| exports
# Boolean mask: True for rows whose first name, last name, or both are blank.
blank_name_records = ((contacts.FirstName == '') | (contacts.LastName == ''))

Previewing the blank name records

In [None]:
contacts.loc[((contacts.FirstName == '') | (contacts.LastName == '')), :]

Unnamed: 0,Number,CompanyName,FirstName,LastName,Street,City,State,Postal,Phone,EMail,Remarks,Deceased,SecondaryFirstName,SecondaryLastName,LegacyIndividualId,SecondaryLegacyIndividualId
2,093004505-X,,Masha,,353 Schmedeman Park,Indianapolis,IN,,577-374-96523,,,,,,3,
3,729707142-0,A Company Co.,,,2055 Lakewood Parkway,Camden,NJ,8104.0,,,,No,,,4,
7,029456846-8,,,,608 Old Shore Alley,Marietta,GA,30066.0,,jdoley6@telegraph.co.uk,,,,,9,


<br>Before I delete the records I'm going to check if the names are present on the gift table <br>
<br>
I'm going to start by getting the unique Numbers that the records belong to

In [None]:
#| exports
# The Number values of the contacts flagged by the blank-name mask.
blank_name_numbers= contacts.loc[blank_name_records, 'Number']

In [None]:
gifts.loc[gifts.DonorNumber.isin(blank_name_numbers), ['DonorNumber', 'FirstName', 'LastName']].drop_duplicates().head(5)

Unnamed: 0,DonorNumber,FirstName,LastName
1,729707142-0,Cymbre,Cross
6,029456846-8,Romy,Doley
7,093004505-X,Masha,Butt Gow
19,729707142-0,Cymbre,


<br>The names are present on the gifts table!

In [None]:
#| exports
# Distinct (DonorNumber, FirstName, LastName) combinations found on gifts for those Numbers.
gift_name_records = gifts.loc[gifts.DonorNumber.isin(blank_name_numbers), ['DonorNumber', 'FirstName', 'LastName']].drop_duplicates()

Removing the gift records that are themselves missing a first or last name

In [None]:
#| exports
# Keep only the gift rows that supply both a first and a last name.
gift_name_records = gift_name_records.loc[((gift_name_records.FirstName != '') & (gift_name_records.LastName != '')), :]
gift_name_records

Unnamed: 0,DonorNumber,FirstName,LastName
1,729707142-0,Cymbre,Cross
6,029456846-8,Romy,Doley
7,093004505-X,Masha,Butt Gow


<br>Updating the records that previously had a blank first or last name

In [None]:
#| exports
# Copy each recovered name onto the contact row(s) with the matching Number.
# NOTE(review): if one donor has several distinct name combinations in
# gift_name_records, the last one iterated is the one that sticks.
for _, row in gift_name_records.iterrows():
    contacts.loc[contacts['Number'] == row['DonorNumber'], ['FirstName', 'LastName']] = [row['FirstName'], row['LastName']]

All the records have valid names now!

In [None]:
contacts.loc[blank_name_records, :]

Unnamed: 0,Number,CompanyName,FirstName,LastName,Street,City,State,Postal,Phone,EMail,Remarks,Deceased,SecondaryFirstName,SecondaryLastName,LegacyIndividualId,SecondaryLegacyIndividualId
2,093004505-X,,Masha,Butt Gow,353 Schmedeman Park,Indianapolis,IN,,577-374-96523,,,,,,3,
3,729707142-0,A Company Co.,Cymbre,Cross,2055 Lakewood Parkway,Camden,NJ,8104.0,,,,No,,,4,
7,029456846-8,,Romy,Doley,608 Old Shore Alley,Marietta,GA,30066.0,,jdoley6@telegraph.co.uk,,,,,9,


## Add Contact Name
>Adding Contact Name

In [None]:
contacts.head(3)

Unnamed: 0,Number,CompanyName,FirstName,LastName,Street,City,State,Postal,Phone,EMail,Remarks,Deceased,SecondaryFirstName,SecondaryLastName,LegacyIndividualId,SecondaryLegacyIndividualId
0,653377813-7,,Karita,Lumbers,4 Bunting Parkway,Washington,DC,20535-871,,kklumbers@ yahoo.co,Is anonymous,,Kelvin,Lumbers,0,1.0
1,390551098-7,,Helga,Benech,48684 Jenifer Way,Las Vegas,NV,89130,,ebenech1@goodreads.com,,,,,2,
2,093004505-X,,Masha,Butt Gow,353 Schmedeman Park,Indianapolis,IN,,577-374-96523,,,,,,3,


<br>Creating a function to update the contact name depending on the individuals on the household record

In [None]:
#| exports
def set_contact_name(row):
    """Build the display name for one contact (household) row.

    Parameters
    ----------
    row : mapping with string values under ``'FirstName'``, ``'LastName'``,
        ``'SecondaryFirstName'`` and ``'SecondaryLastName'`` (a DataFrame row
        or a plain dict).

    Returns
    -------
    str
        * no secondary person: ``"First Last"`` (blank parts are omitted, so
          no stray leading or trailing space is produced);
        * secondary person with the same last name: ``"First & Second Last"``;
        * secondary person with a different last name:
          ``"First Last & Second SecondLast"``.
    """
    first, last = row['FirstName'], row['LastName']
    secondary_first = row['SecondaryFirstName']
    secondary_last = row['SecondaryLastName']

    # Decide on the presence of a secondary person first. Comparing last names
    # alone would treat two blank last names as a shared surname and emit a
    # dangling " & " for a single-person record.
    if secondary_first == '':
        return ' '.join(part for part in (first, last) if part)

    if last == secondary_last:
        return first + ' & ' + secondary_first + ' ' + last

    return first + ' ' + last + ' & ' + secondary_first + ' ' + secondary_last

### Applying ContactName Transformation

In [None]:
#| exports
# Compute the display name row by row (axis=1 passes each row to the function).
contacts['ContactName'] = contacts.apply(set_contact_name, axis=1)

## ProjectCode Transformation
>Splitting Project Codes

In [None]:
#| exports
# Split the comma-separated fund list into one column per project code.
project_codes = gifts.FundId.str.split(', ', expand=True)
# `expand=True` yields only as many columns as the longest split. Pad to at
# least two so both project-code columns exist even when no gift lists a
# second fund. Extra columns (three or more funds) are kept, so that case is
# not silently truncated.
project_codes = project_codes.reindex(columns=range(max(2, project_codes.shape[1])))

In [None]:
project_codes.head(3)

Unnamed: 0,0,1
0,,
1,ChildSponsorship,
2,,


<br>

In [None]:
#| exports
# Store the split codes on gifts. The assignment expects project_codes to have exactly two columns.
gifts[['Project1Code', 'Project2Code']] = project_codes

In [None]:
#| exports
# FundId has been split into the project-code columns; continue with a new frame that omits it.
gifts = gifts.drop(columns=['FundId'])

## Date Transformation
>Formatting date

In [None]:
# Preview the raw dates on the gifts table itself. This previously used `df`,
# a loop variable left over from the column-renaming cell.
gifts['Date'].head(5)

0     3/4/2019
1     3/5/2019
2     3/7/2019
3    3/10/2019
4    2019/1/10
Name: Date, dtype: object

<br>Since Dates are in different formats I'm going to create a custom parser

In [None]:
#| export
from datetime import datetime

In [None]:
#| exports
def custom_parser(date_str):
    """Parse a date string written as month/day/year or year/month/day.

    The ``%m/%d/%Y`` layout is tried first; only if that raises ``ValueError``
    is ``%Y/%m/%d`` attempted. A string matching neither layout raises
    ``ValueError`` from the second attempt.

    Returns a ``datetime.datetime`` at midnight of the parsed date.
    """
    month_first_layout = '%m/%d/%Y'
    year_first_layout = '%Y/%m/%d'

    try:
        parsed = datetime.strptime(date_str, month_first_layout)
    except ValueError:
        parsed = datetime.strptime(date_str, year_first_layout)
    return parsed

### Transforming Dates

In [None]:
#| exports
# Parse every raw date string into a datetime, stored in a new GiftDate column.
gifts['GiftDate'] = gifts['Date'].apply(custom_parser)

Dropping old date columns

In [None]:
#| exports
# The raw string column is superseded by GiftDate; continue with a new frame that omits it.
gifts = gifts.drop(columns=['Date'])

## Email Transformation
>Validate emails

Previewing the `Email` column

In [None]:
contacts.EMail.head(1)

0    kklumbers@ yahoo.co
Name: EMail, dtype: object

<br>It looks like the first email 'kklumbers@ yahoo.co' is invalid and should be fixed <br>
<br>
I'm going to write a function to validate emails and fix common mistakes

In [None]:
#| exports
def valid_email(s):
    """Return True when ``s`` has the shape ``local@domain.tld``, else False.

    The local part may contain ASCII letters, digits and ``._%+-``; the domain
    may contain letters, digits, dots and hyphens; the final label must be at
    least two ASCII letters. Whitespace anywhere makes the address invalid.
    """
    email_shape = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return re.match(email_shape, s) is not None

<br> Creating a second function that attempts to fix common email mistakes like misspelled domains

In [None]:
#| exports
def fix_email(email):
    """Repair common typos in ``email`` and return it, or ``''`` if still invalid.

    Steps, in order:
    1. a falsy input (empty string, ``None``) yields ``''`` immediately;
    2. all whitespace is removed;
    3. a trailing ``.co``, ``.cmo`` or ``.con`` is rewritten to ``.com``;
    4. ``@gmial`` / ``@yahho`` are rewritten to ``@gmail`` / ``@yahoo``;
    5. the result is returned only if ``valid_email`` accepts it.

    NOTE: step 3 also rewrites addresses that genuinely end in ``.co``.
    """
    if not email:
        return ''

    cleaned = re.sub(r'\s', '', email)

    # Mistyped top-level domains, each mapped to the intended one.
    tld_typos = {'co': 'com', 'cmo': 'com', 'con': 'com'}
    for typo, intended in tld_typos.items():
        if cleaned.endswith('.' + typo):
            cleaned = cleaned[:len(cleaned) - len(typo)] + intended
            # The address now ends in '.com', so no other typo can match.
            break

    # Mistyped provider names directly after the '@'.
    domain_typos = {'gmial': 'gmail', 'yahho': 'yahoo'}
    for typo, intended in domain_typos.items():
        cleaned = cleaned.replace('@' + typo, '@' + intended)

    return cleaned if valid_email(cleaned) else ''


<br>Testing the email validator

In [None]:
contacts.EMail.apply(fix_email)

0              kklumbers@yahoo.com
1           ebenech1@goodreads.com
2                                 
3                                 
4     fcastille4@timesonline.co.uk
5                                 
6                                 
7          jdoley6@telegraph.co.uk
8             cmakepeace7@1688.com
9                                 
10                                
Name: EMail, dtype: object

### Applying Email Transformation

In [None]:
#| exports
# Replace every e-mail with its repaired form; addresses that stay invalid become ''.
contacts['EMail'] = contacts.EMail.apply(fix_email)

## Phone # Transformation
>Fix phone numbers

There appears to be an invalid phone number

In [None]:
contacts.loc[2, 'Phone']

'577-374-96523'

Writing a function to validate US phone numbers, assuming the phone numbers provided are from the US

In [None]:
#| exports
def validate_us_phone_number(phone_number):
    """Return ``phone_number`` unchanged if it matches an accepted layout, else ``''``.

    Accepted layouts (the whole string must match):
    ``+1 ddd-ddd-dddd`` (space optional), ``(ddd) ddd-dddd`` (space optional),
    ``ddd-ddd-dddd`` and the seven-digit ``ddd-dddd``.
    """
    accepted_layouts = (
        r'^\+1\s?\d{3}-\d{3}-\d{4}$',
        r'^\(\d{3}\)\s?\d{3}-\d{4}$',
        r'^\d{3}-\d{3}-\d{4}$',
        r'^\d{3}-\d{4}$',
    )

    is_recognised = any(re.match(layout, phone_number) for layout in accepted_layouts)
    return phone_number if is_recognised else ''

### Applying Phone # Transformation

In [None]:
#| exports
# Keep phone numbers that match an accepted layout; all others are blanked to ''.
contacts['Phone'] = contacts.Phone.apply(validate_us_phone_number)

In [None]:
#| exports
# Final pass: replace any missing value left in contacts with an empty string, in place.
contacts.fillna('', inplace=True)

# Results

In [None]:
contacts.head(5)

Unnamed: 0,Number,CompanyName,FirstName,LastName,Street,City,State,Postal,Phone,EMail,Remarks,Deceased,SecondaryFirstName,SecondaryLastName,LegacyIndividualId,SecondaryLegacyIndividualId,ContactName
0,653377813-7,,Karita,Lumbers,4 Bunting Parkway,Washington,DC,20535-871,,kklumbers@yahoo.com,Is anonymous,,Kelvin,Lumbers,0,1.0,Karita & Kelvin Lumbers
1,390551098-7,,Helga,Benech,48684 Jenifer Way,Las Vegas,NV,89130,,ebenech1@goodreads.com,,,,,2,,Helga Benech
2,093004505-X,,Masha,Butt Gow,353 Schmedeman Park,Indianapolis,IN,,,,,,,,3,,Masha Butt Gow
3,729707142-0,A Company Co.,Cymbre,Cross,2055 Lakewood Parkway,Camden,NJ,8104,,,,No,,,4,,Cymbre Cross
4,488464926-5,,Hoyt,Castille,37 8th Trail,Grand Rapids,MI,49560,,fcastille4@timesonline.co.uk,,No,,,5,,Hoyt Castille


In [None]:
gifts.head(5)

Unnamed: 0,DonorNumber,LegacyGiftId,FirstName,LastName,AmountReceived,CreditCardType,PaymentMethod,LegacyPledgeID,Notes,Project1Code,Project2Code,GiftDate
0,848348568-0,95196378,Mannie,Turpin,4.15,,PayPal,0,,,,2019-03-04
1,729707142-0,95196889,Cymbre,Cross,2.3648,,check,1,,ChildSponsorship,,2019-03-05
2,687119652-8,95197689,Ruggiero,Makepeace,1.31,,cash,2,,,,2019-03-07
3,653377813-7,95198998,Karita,Lumbers,2.04,American Ex,credit card,3,In honor of Mannie Turpin,,,2019-03-10
4,390551098-7,95198999,Helga,Benech,5.8,,cash,89752384,,,,2019-01-10


In [None]:
contacts.head(5)

Unnamed: 0,Number,CompanyName,FirstName,LastName,Street,City,State,Postal,Phone,EMail,Remarks,Deceased,SecondaryFirstName,SecondaryLastName,LegacyIndividualId,SecondaryLegacyIndividualId,ContactName
0,653377813-7,,Karita,Lumbers,4 Bunting Parkway,Washington,DC,20535-871,,kklumbers@yahoo.com,Is anonymous,,Kelvin,Lumbers,0,1.0,Karita & Kelvin Lumbers
1,390551098-7,,Helga,Benech,48684 Jenifer Way,Las Vegas,NV,89130,,ebenech1@goodreads.com,,,,,2,,Helga Benech
2,093004505-X,,Masha,Butt Gow,353 Schmedeman Park,Indianapolis,IN,,,,,,,,3,,Masha Butt Gow
3,729707142-0,A Company Co.,Cymbre,Cross,2055 Lakewood Parkway,Camden,NJ,8104,,,,No,,,4,,Cymbre Cross
4,488464926-5,,Hoyt,Castille,37 8th Trail,Grand Rapids,MI,49560,,fcastille4@timesonline.co.uk,,No,,,5,,Hoyt Castille


# Export

In [None]:
import nbdev

In [None]:
# Run nbdev's export on this notebook. Presumably this writes the cells tagged
# `#| export` / `#| exports` to the module named by `default_exp`; confirm
# against the nbdev documentation.
nbdev.nbdev_export('00_Setup.ipynb')

<br>