# Floral Data Processing

# **Imports**

In [1]:
#Numpy
import numpy as np

#Pandas
import pandas as pd

#Seaborn
import seaborn as sns

#matplotlib
import matplotlib.pyplot as plt
import plotly

import warnings
warnings.filterwarnings("ignore")

# **Loading Data**

In [2]:
GDS = pd.read_csv('GDS')

In [3]:
GDS.head()

Unnamed: 0,Order #,Transaction Type,Order Time,Order Date,Delivery Date_x,Sender,Recipient,Product Total,Delivery_x,Nontaxable Delivery,...,Billing State,Billing Zip,Subtotal,Delivery_y,Sale Tax,Tip,Discount_y,Grand Total_y,Tax Exempt,Payment Method_y
0,100017453,Sale,03:28:16PM Sat,"Dec 29, 2018","Dec 29, 2018",Candyce Williams Glaser,,$815.00,$0.00,$0.00,...,,,,,,,,,,
1,100017452,Sale,03:16:50PM Sat,"Dec 29, 2018","Dec 29, 2018",,,$249.90,$0.00,$0.00,...,,,,,,,,,,
2,100017451,Sale,02:58:53PM Sat,"Dec 29, 2018","Dec 29, 2018",,,$24.95,$0.00,$0.00,...,,,,,,,,,,
3,100017450,Sale,02:54:45PM Sat,"Dec 29, 2018","Dec 29, 2018",Tracie Hamilton,,$635.00,$0.00,$0.00,...,,,,,,,,,,
4,100017202,Sale,02:48:15PM Sat,"Dec 29, 2018","Dec 23, 2018",Candyce Williams Glaser,Candyce Williams Glaser,$702.00,$0.00,$25.00,...,,,,,,,,,,


# **Data Exploration**

In [4]:
GDS.isna().sum().sum()

1879164

In [5]:
GDS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52916 entries, 0 to 52915
Data columns (total 55 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Order #                52916 non-null  int64  
 1   Transaction Type       47467 non-null  object 
 2   Order Time             47467 non-null  object 
 3   Order Date             47467 non-null  object 
 4   Delivery Date_x        47458 non-null  object 
 5   Sender                 28135 non-null  object 
 6   Recipient              21933 non-null  object 
 7   Product Total          47467 non-null  object 
 8   Delivery_x             47467 non-null  object 
 9   Nontaxable Delivery    47467 non-null  object 
 10  Wire Out Fee           47467 non-null  object 
 11  Discount_x             47467 non-null  object 
 12  Gift Cards             47467 non-null  object 
 13  Tax                    47467 non-null  object 
 14  Tips                   47467 non-null  object 
 15  Gr

In [6]:
# Order status Value counts
Ors= GDS['Order Status'].value_counts()
print(Ors)
# decide what to do about canceled

Delivered              5022
Accepted                275
Canceled                104
Out for Delivery         35
Arrangement Created      11
Processing                2
Name: Order Status, dtype: int64


In [7]:
#Item number is irrelevant
IN= GDS['Item Number'].value_counts()
print(IN)

N/A|N/A                                119
N/A|N/A|N/A                             17
N/A|N/A|N/A|N/A|N/A|N/A|N/A|N/A|N/A      1
N/A|N/A|N/A|N/A|N/A                      1
Name: Item Number, dtype: int64


In [8]:
# Tax Exept has no valuable information
TE= GDS['Tax Exempt'].unique()
print(TE)

[          nan 1.5815383e+09]


In [9]:
# Irrelevant data
g1= GDS['Discount_y'].value_counts()
print(g1)

0.0    5449
Name: Discount_y, dtype: int64


# **Fill nans with 0**

In [10]:
Disy = GDS['Discount_y'].unique()
GDS['Discount_y'] = GDS['Discount_y'].fillna(0, inplace = True)
print(Disy)

[nan  0.]


In [11]:
Wire = GDS['Wire Out Fee'].unique()
#GDS['Wire Out Fee'] = GDS['Wire Out Fee'].fillna(0, inplace = True)
print(Wire)

['$0.00' '$11.95' '$14.95' '$24.95' '($24.95)' '$7.50' '$36.00' '$23.95'
 '$4.00' '$10.00' ' -$11.95' '$12.50' '$0.00 ' '$20.00' '$22.95' '$9.95'
 '($11.95)' '$11.95 ' '$14.95 ' '$22.95 ' '$22.50 ' '$6.00 ' '$20.00 '
 '$5.00 ' '$7.50 ' '$24.95 ' '$15.00 ' '$17.95 ' '$30.00 ' '$12.95 '
 '$25.00 ' '$45.00 ' '$28.95 ' '$7.69 ' '$29.95 ' '($14.95)' '$4.99 '
 '$10.00 ' '$18.95 ' '$8.00 ' '$5.95 ' '$19.95 ' '$18.00 ' '($18.95)'
 '$27.99 ' '$4.95 ' '$9.95 ' '$1.00 ' '$9.50 ' '$27.95 ' '$35.00 '
 '$9.99 ' nan]


In [12]:
Dis = GDS['Discount_x'].unique()
GDS['Discount_x'] = GDS['Discount_x'].fillna(0, inplace = True)
print(Dis)

['$0.00' ' -$124.96' ' -$54.85' ... '($95.38)' '($101.43)' nan]


In [13]:
TIP = GDS['Tips'].unique()
GDS['Tip'] = GDS['Tips'].fillna(0, inplace = True)
print(TIP)

['$0.00' '$0.00 ' '$15.00 ' '$7.00 ' '$17.00 ' nan]


In [14]:
UQ = GDS['Transaction Type'].unique()
print(UQ)
# figure out what to put for nans here

['Sale' 'Adjustment' 'Refund' nan]


In [15]:
OS = GDS['Order Source'].unique()
print(OS)
# Decide what to do about nan for Order source

[nan 'Premium Site' 'Marketplace' 'F2F' 'Yelp']


In [16]:
PM = GDS['Payment Method_x'].unique()
print(PM)

['Credit Card' 'Cash' 'Bloomnation' 'FSN'
 'External Processor (Not Floranext)' 'Check / Money order' 'BBROOKS'
 'Donation' 'Gift Certificate' 'Write Off' 'Wire Service' 'CFS' nan]


In [17]:
PM = GDS['Payment Method_y'].unique()
print(PM)

[nan 'Credit Card' 'F2f Payment' 'Bloom Yelpbiz']


In [18]:
SDC = GDS['Sender'].value_counts()
print(SDC)

Consolidated Funeral Services, Inc.    941
Anka Brazzell                          275
Candyce Williams Glaser                266
Wild Root Florist                      252
BLOOM NATION                           219
                                      ... 
Lynn Lowe                                1
HENRY G DAVIS                            1
Mary Atkins                              1
Julie Schmader                           1
Darcy Bomer                              1
Name: Sender, Length: 11032, dtype: int64


In [19]:
OC = GDS['Occasion'].value_counts()
print(OC)
#Look into getting rest of occasion value counts

Birthday                         1220
Sympathy and Funeral              835
Mother's Day                      502
Just Because                      405
Get Well                          398
Thank You                         295
Valentine's Day                   232
Anniversary                       222
Congrats                          193
Christmas                         101
Love and Romance                   78
New Baby                           67
I'm Sorry                          60
Easter and Passover                45
Thanksgiving                       28
Wedding                            22
Admin Professionals' Week          21
Prom                               12
Autumn                              9
Father's Day                        9
Summer                              8
Best Friends Day                    7
Spring                              6
New Year                            6
Graduation                          6
Holiday                             5
Women's Day 

In [20]:
#Nontaxable Delivery
Nd = GDS['Nontaxable Delivery'].value_counts()
print(Nd)

$0.00       21841
$12.00       4843
$15.00       4091
$0.00        4063
$17.95       2437
            ...  
$31.00          1
$53.24          1
$195.00         1
$10.46          1
$11.49          1
Name: Nontaxable Delivery, Length: 390, dtype: int64


In [21]:
DD1 = GDS['Delivery Date_x'].value_counts()
print(DD1)

8-May-21        236
7-May-22        215
14-Feb-23       213
11-May-19       203
13-May-23       199
               ... 
Dec 22, 2017      1
Aug 10, 2017      1
20-Mar-22         1
Sep 30, 2018      1
29-Sep-19         1
Name: Delivery Date_x, Length: 1740, dtype: int64


In [22]:
Dee = GDS['Delivery_x'].value_counts()
print(Dee)

$0.00     39918
$0.00      7549
Name: Delivery_x, dtype: int64


In [23]:
similar_dates = GDS[GDS['Delivery Date_x'].isin(GDS['Delivery Date_y'])]
print(similar_dates)

          Order # Transaction Type      Order Time Order Date Delivery Date_x  \
44068  1000056974             Sale  10:48:12AM Tue   2-May-23             NaN   
44124  1000056903             Sale  02:14:09PM Sat  29-Apr-23             NaN   
44130  1000056898             Sale  01:02:18PM Sat  29-Apr-23             NaN   
44522  1000056547             Sale  12:11:42PM Tue  18-Apr-23             NaN   
44712  1000056390             Sale  11:40:21AM Thu  13-Apr-23             NaN   
...           ...              ...             ...        ...             ...   
52911  2749452849              NaN             NaN        NaN             NaN   
52912  2553412309              NaN             NaN        NaN             NaN   
52913  3540158661              NaN             NaN        NaN             NaN   
52914  3591549621              NaN             NaN        NaN             NaN   
52915  7662866077              NaN             NaN        NaN             NaN   

            Sender Recipien

In [24]:
DAT = GDS['Delivery Address Type'].value_counts()
print(DAT)

Not Provided          5362
Residential             72
Business                 7
Funeral / Cemetery       3
Hospital                 2
Other                    2
Place of Worship         1
Name: Delivery Address Type, dtype: int64


# **Cleaning Unecessary Symbols**

In [25]:
#Taking out everything except numbers in the column
xat = GDS['Tax'].value_counts()
GDS['Tax'] = GDS['Tax'].str.replace('(', '').str.replace(')', '').str.replace('$', '')
print(xat)

$0.00       2688
$7.31       1886
$12.19      1854
$9.75       1823
$14.63      1421
            ... 
$12.24         1
($0.88)        1
$27.07         1
$123.83        1
$1.98          1
Name: Tax, Length: 4037, dtype: int64


In [26]:
#Taking out everything except numbers in the column
XAT = GDS['Tips'].value_counts()
GDS['Tips'] = GDS['Tips'].str.replace('(', '').str.replace(')', '').str.replace('$', '')

In [27]:
#Wire Out Fee
WOF = GDS['Wire Out Fee'].value_counts()
GDS['Wire Out Fee'] = GDS['Wire Out Fee'].str.replace('(', '').str.replace(')', '').str.replace('$', '').str.replace(',', '')

In [28]:
#Grand Total_x
GT = GDS['Grand Total_x'].value_counts()
GDS['Grand Total_x'] = GDS['Grand Total_x'].str.replace('(', '').str.replace(')', '').str.replace('$', '').str.replace(',', '')

In [29]:
#Nontaxable Delivery
NT = GDS['Nontaxable Delivery'].value_counts()
GDS['Nontaxable Delivery'] = GDS['Nontaxable Delivery'].str.replace('(', '').str.replace(')', '').str.replace('$', '').str.replace(',', '')

In [30]:
DAT = GDS['Product Total'].value_counts()
GDS['Product Total'] = GDS['Product Total'].str.replace('(', '').str.replace(')', '').str.replace('$', '').str.replace(',', '')

In [31]:
DAT = GDS['Gift Cards'].value_counts()
GDS['Gift Cards'] = GDS['Gift Cards'].str.replace('(', '').str.replace(')', '').str.replace('$', '')

In [32]:
columns_to_drop = ['Delivery Street 2','Billing Street 2', 'Delivery_x', 'Delivery_y', 'Discount_y', 'Discount_x', 'Card Signature', 
                   'Card Message', 'Tip','Customer Email', 'Customer Phone', 'Delivery Address Type', 'Tax Exempt', 'Grand Total_y', 
                   'Item Number', 'Billing Phone', 'Delivery Date_y', 'Payment Method_y', 'Sale Tax', 'Billing Name', 
                   'Billing Street', 'Billing City', 'Billing State', 'Billing Zip', 'Recipient Phone']
df = GDS.drop(columns_to_drop, axis= 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52916 entries, 0 to 52915
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Order #              52916 non-null  int64  
 1   Transaction Type     47467 non-null  object 
 2   Order Time           47467 non-null  object 
 3   Order Date           47467 non-null  object 
 4   Delivery Date_x      47458 non-null  object 
 5   Sender               28135 non-null  object 
 6   Recipient            21933 non-null  object 
 7   Product Total        47467 non-null  object 
 8   Nontaxable Delivery  47467 non-null  object 
 9   Wire Out Fee         47467 non-null  object 
 10  Gift Cards           47467 non-null  object 
 11  Tax                  47467 non-null  object 
 12  Tips                 47467 non-null  object 
 13  Grand Total_x        47467 non-null  object 
 14  Payment Method_x     47467 non-null  object 
 15  Order Type           47276 non-null 

# **Changing Data Types**

In [33]:
# Changing Data Types
df['Gift Cards'] = df['Gift Cards'].astype(float)
df['Tax'] = df['Tax'].astype(float)
df['Tips'] = df['Tips'].astype(float)
df['Grand Total_x'] = df['Grand Total_x'].astype(float)
df['Product Total'] = df['Product Total'].astype(float)
df['Wire Out Fee'] = df['Wire Out Fee'].astype(float)
df['Nontaxable Delivery'] = df['Nontaxable Delivery'].astype(float)
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Delivery Date_x'] = pd.to_datetime(df['Delivery Date_x'])
df['Order Time'] = pd.to_datetime(df['Order Time'])
df['Sale Date'] = pd.to_datetime(df['Sale Date'])

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52916 entries, 0 to 52915
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Order #              52916 non-null  int64         
 1   Transaction Type     47467 non-null  object        
 2   Order Time           47467 non-null  datetime64[ns]
 3   Order Date           47467 non-null  datetime64[ns]
 4   Delivery Date_x      47458 non-null  datetime64[ns]
 5   Sender               28135 non-null  object        
 6   Recipient            21933 non-null  object        
 7   Product Total        47467 non-null  float64       
 8   Nontaxable Delivery  47467 non-null  float64       
 9   Wire Out Fee         47467 non-null  float64       
 10  Gift Cards           47467 non-null  float64       
 11  Tax                  47467 non-null  float64       
 12  Tips                 47467 non-null  float64       
 13  Grand Total_x        47467 non-

In [35]:
df.isna().sum().sum()

749962