In [524]:
import pandas as pd
import numpy as np
import html5lib
import lxml
from bs4 import BeautifulSoup

# Merchant Data


In [525]:
df_merchant_data = pd.read_html('Enterprise Department/merchant_data.html')
df_merchant_data[0].head()

Unnamed: 0.1,Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,0,MERCHANT53971,2022-06-28 11:42:04,Whitby Group,813 North Isle bury,South Carolina,Boise,Brunei Darussalam,661.157.5528
1,1,MERCHANT56138,2020-08-27 20:35:19,YourMapper,54861 Springs view,Colorado,Cleveland,Sint Maarten (Dutch part),(046)415-8092
2,2,MERCHANT31852,2021-08-05 12:04:33,United Mayflower,730 East Islands side,Virginia,Atlanta,Mali,939.273.0312
3,3,MERCHANT63299,2021-08-15 19:00:03,TransparaGov,2619 Coves haven,Maine,Indianapolis,Chad,1-181-438-5899
4,4,MERCHANT16722,2023-02-28 20:35:34,Maponics,979 North Rue borough,Wyoming,Boise,Guatemala,494-319-8223


## Drop Unnamed column


In [526]:
df_merchant_data[0] = df_merchant_data[0].drop(columns=['Unnamed: 0'])

df_merchant_data[0].head()

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,MERCHANT53971,2022-06-28 11:42:04,Whitby Group,813 North Isle bury,South Carolina,Boise,Brunei Darussalam,661.157.5528
1,MERCHANT56138,2020-08-27 20:35:19,YourMapper,54861 Springs view,Colorado,Cleveland,Sint Maarten (Dutch part),(046)415-8092
2,MERCHANT31852,2021-08-05 12:04:33,United Mayflower,730 East Islands side,Virginia,Atlanta,Mali,939.273.0312
3,MERCHANT63299,2021-08-15 19:00:03,TransparaGov,2619 Coves haven,Maine,Indianapolis,Chad,1-181-438-5899
4,MERCHANT16722,2023-02-28 20:35:34,Maponics,979 North Rue borough,Wyoming,Boise,Guatemala,494-319-8223


## Fix the naming of name, street, state, city, and country


In [527]:
columns_to_title_case = ['name', 'street', 'state', 'city', 'country']

for column in columns_to_title_case:
    df_merchant_data[0][column] = df_merchant_data[0][column].str.title()

df_merchant_data[0].head()

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,MERCHANT53971,2022-06-28 11:42:04,Whitby Group,813 North Isle Bury,South Carolina,Boise,Brunei Darussalam,661.157.5528
1,MERCHANT56138,2020-08-27 20:35:19,Yourmapper,54861 Springs View,Colorado,Cleveland,Sint Maarten (Dutch Part),(046)415-8092
2,MERCHANT31852,2021-08-05 12:04:33,United Mayflower,730 East Islands Side,Virginia,Atlanta,Mali,939.273.0312
3,MERCHANT63299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,1-181-438-5899
4,MERCHANT16722,2023-02-28 20:35:34,Maponics,979 North Rue Borough,Wyoming,Boise,Guatemala,494-319-8223


## Check if the creation date values are valid


In [528]:
valid_creation_dates_mask = pd.to_datetime(
    df_merchant_data[0]['creation_date'], errors='coerce').notna()
invalid_creation_dates = df_merchant_data[0][~valid_creation_dates_mask]

print("Rows with invalid creation dates:")
print(invalid_creation_dates)

Rows with invalid creation dates:
Empty DataFrame
Columns: [merchant_id, creation_date, name, street, state, city, country, contact_number]
Index: []


## Fix the format of contact number


Remove special characters in contact number


In [529]:
df_merchant_data[0]['contact_number'] = df_merchant_data[0]['contact_number'].str.replace(
    r'\D', '', regex=True)

df_merchant_data[0].head()

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,MERCHANT53971,2022-06-28 11:42:04,Whitby Group,813 North Isle Bury,South Carolina,Boise,Brunei Darussalam,6611575528
1,MERCHANT56138,2020-08-27 20:35:19,Yourmapper,54861 Springs View,Colorado,Cleveland,Sint Maarten (Dutch Part),464158092
2,MERCHANT31852,2021-08-05 12:04:33,United Mayflower,730 East Islands Side,Virginia,Atlanta,Mali,9392730312
3,MERCHANT63299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,11814385899
4,MERCHANT16722,2023-02-28 20:35:34,Maponics,979 North Rue Borough,Wyoming,Boise,Guatemala,4943198223


Check if there are contact number values that are not 10 digits


In [530]:
df_merchant_data[0]['contact_number_length'] = df_merchant_data[0]['contact_number'].apply(
    len)
rows_with_invalid_contact_numbers = df_merchant_data[0][df_merchant_data[0]
                                                        ['contact_number_length'] != 10]

print("Rows with invalid contact numbers:")
rows_with_invalid_contact_numbers.head()

Rows with invalid contact numbers:


Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number,contact_number_length
3,MERCHANT63299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,11814385899,11
12,MERCHANT65041,2021-04-29 13:49:11,Credit Sesame,18743 Points Chester,Utah,Louisville/Jefferson,Jamaica,18694833237,11
15,MERCHANT16337,2020-02-20 15:53:14,The Advisory Board Company,8948 Garden Berg,California,Jersey,Nigeria,11846688836,11
18,MERCHANT12063,2020-01-22 10:27:39,"Govzilla, Inc.",4236 Heights Ton,Delaware,Jacksonville,Russian Federation,13428149030,11
21,MERCHANT9424,2022-12-29 23:17:17,"Qado Energy, Inc.",852 Isle Furt,Kansas,Winston-Salem,Timor-Leste,13321333264,11


Check if there are rows with contact number that is not 10 digits and does not start with 1


In [531]:
invalid_contact_numbers = df_merchant_data[0][
    (df_merchant_data[0]['contact_number_length'] != 10) &
    (~df_merchant_data[0]['contact_number'].str.startswith('1'))
]

print("Rows with contact numbers not 10 digits and not starting with 1:")
print(invalid_contact_numbers)

Rows with contact numbers not 10 digits and not starting with 1:
Empty DataFrame
Columns: [merchant_id, creation_date, name, street, state, city, country, contact_number, contact_number_length]
Index: []


Update values of contact number that are not 10 digits and start with 1 by removing the 1 in the beginning


In [532]:
mask = (df_merchant_data[0]['contact_number_length'] != 10) & (
    df_merchant_data[0]['contact_number'].str.startswith('1'))
df_merchant_data[0].loc[mask,
                        'contact_number'] = df_merchant_data[0].loc[mask, 'contact_number'].str[1:]

df_merchant_data[0]['contact_number_length'] = df_merchant_data[0]['contact_number'].apply(
    len)
rows_with_invalid_contact_numbers = df_merchant_data[0][df_merchant_data[0]
                                                        ['contact_number_length'] != 10]

print("Rows with invalid contact numbers:")
rows_with_invalid_contact_numbers.head()

Rows with invalid contact numbers:


Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number,contact_number_length


Drop contact number length column


In [533]:
df_merchant_data[0] = df_merchant_data[0].drop(
    columns=['contact_number_length'])
df_merchant_data[0].head()

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,MERCHANT53971,2022-06-28 11:42:04,Whitby Group,813 North Isle Bury,South Carolina,Boise,Brunei Darussalam,6611575528
1,MERCHANT56138,2020-08-27 20:35:19,Yourmapper,54861 Springs View,Colorado,Cleveland,Sint Maarten (Dutch Part),464158092
2,MERCHANT31852,2021-08-05 12:04:33,United Mayflower,730 East Islands Side,Virginia,Atlanta,Mali,9392730312
3,MERCHANT63299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,1814385899
4,MERCHANT16722,2023-02-28 20:35:34,Maponics,979 North Rue Borough,Wyoming,Boise,Guatemala,4943198223


Update the format of contact number


In [534]:
def format_contact_number(number):
    return f"({number[:3]}) {number[3:6]}-{number[6:]}"


df_merchant_data[0]['contact_number'] = df_merchant_data[0]['contact_number'].apply(
    format_contact_number)

df_merchant_data[0].head()

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,MERCHANT53971,2022-06-28 11:42:04,Whitby Group,813 North Isle Bury,South Carolina,Boise,Brunei Darussalam,(661) 157-5528
1,MERCHANT56138,2020-08-27 20:35:19,Yourmapper,54861 Springs View,Colorado,Cleveland,Sint Maarten (Dutch Part),(046) 415-8092
2,MERCHANT31852,2021-08-05 12:04:33,United Mayflower,730 East Islands Side,Virginia,Atlanta,Mali,(939) 273-0312
3,MERCHANT63299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,(181) 438-5899
4,MERCHANT16722,2023-02-28 20:35:34,Maponics,979 North Rue Borough,Wyoming,Boise,Guatemala,(494) 319-8223


## Check for duplicate merchant id


In [535]:
duplicate_merchant_data_id = df_merchant_data[0][df_merchant_data[0].duplicated(
    subset=['merchant_id'], keep=False)]
sorted_duplicate_merchant_data_id = duplicate_merchant_data_id.sort_values(by=[
                                                                           'merchant_id'])

sorted_duplicate_merchant_data_id

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
111,MERCHANT0224,2022-12-18 04:20:09,"Urban Mapping, Inc",71284 Port Point Burgh,Arizona,Winston-Salem,Italy,(412) 106-5024
3559,MERCHANT0224,2020-08-28 13:50:55,Funding Circle,709 Ports Fort,New Jersey,Chula Vista,India,(391) 460-2731
2443,MERCHANT0261,2023-11-14 10:03:51,Orlin Research,68998 East Stravenue Borough,Michigan,Madison,Paraguay,(274) 370-3666
863,MERCHANT0261,2023-11-28 11:50:56,Noveda Technologies,96856 Springs Stad,Virginia,Glendale,Czechia,(690) 864-2029
1972,MERCHANT0722,2023-11-01 16:46:18,Balefire Global,4989 East Cove Land,South Dakota,Lexington-Fayette,Brazil,(639) 134-4529
...,...,...,...,...,...,...,...,...
1159,MERCHANT8744,2022-12-17 12:28:00,Mcgraw Hill Financial,2500 Turnpike Side,Texas,Phoenix,Bermuda,(360) 662-3437
2538,MERCHANT9113,2020-11-13 19:48:13,Vitalchek,46725 Manor Land,South Carolina,Reno,Suriname,(544) 707-7197
2804,MERCHANT9113,2023-05-06 01:30:07,Suddath,1600 Motorway Land,South Dakota,San Diego,Botswana,(353) 385-0523
3848,MERCHANT9930,2020-06-06 21:53:07,Mint,4051 Lake Manor Stad,Illinois,Fort Worth,Saint Kitts And Nevis,(123) 876-5938


In [536]:
df_merchant_data[0]['merchant_id'] = df_merchant_data[0]['merchant_id'].str.replace(
    'MERCHANT', '').astype(int)
df_merchant_data[0]

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,53971,2022-06-28 11:42:04,Whitby Group,813 North Isle Bury,South Carolina,Boise,Brunei Darussalam,(661) 157-5528
1,56138,2020-08-27 20:35:19,Yourmapper,54861 Springs View,Colorado,Cleveland,Sint Maarten (Dutch Part),(046) 415-8092
2,31852,2021-08-05 12:04:33,United Mayflower,730 East Islands Side,Virginia,Atlanta,Mali,(939) 273-0312
3,63299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,(181) 438-5899
4,16722,2023-02-28 20:35:34,Maponics,979 North Rue Borough,Wyoming,Boise,Guatemala,(494) 319-8223
...,...,...,...,...,...,...,...,...
4995,54074,2021-10-16 18:18:52,Kld Research,5760 Rapids Mouth,Connecticut,Wichita,Holy See,(499) 792-6191
4996,41034,2020-04-25 23:45:42,Loopnet,532 Walks Bury,Iowa,Madison,Oman,(484) 217-8833
4997,55677,2020-07-25 13:12:36,Marlin & Associates,44600 Park Port,South Dakota,Wichita,Mauritania,(681) 783-5079
4998,44767,2021-09-28 21:25:24,Accenture,217 West Estates Stad,Michigan,San Francisco,Egypt,(628) 405-2478


In [537]:
df_merchant_data[0]['merchant_id'] = df_merchant_data[0].groupby('merchant_id').cumcount().add(
    1).astype(str).radd('MERCHANT') + df_merchant_data[0]['merchant_id'].astype(str)

In [538]:
duplicate_check = df_merchant_data[0].duplicated('merchant_id', keep=False)

if duplicate_check.any():
    print("There are still duplicate merchant IDs.")
else:
    print("There are no duplicate merchant IDs.")

There are no duplicate merchant IDs.


## Check for nulls


In [539]:
df_merchant_data[0].isnull().sum()

merchant_id       0
creation_date     0
name              0
street            0
state             0
city              0
country           0
contact_number    0
dtype: int64

In [540]:
df_merchant_data[0].head()

Unnamed: 0,merchant_id,creation_date,name,street,state,city,country,contact_number
0,MERCHANT153971,2022-06-28 11:42:04,Whitby Group,813 North Isle Bury,South Carolina,Boise,Brunei Darussalam,(661) 157-5528
1,MERCHANT156138,2020-08-27 20:35:19,Yourmapper,54861 Springs View,Colorado,Cleveland,Sint Maarten (Dutch Part),(046) 415-8092
2,MERCHANT131852,2021-08-05 12:04:33,United Mayflower,730 East Islands Side,Virginia,Atlanta,Mali,(939) 273-0312
3,MERCHANT163299,2021-08-15 19:00:03,Transparagov,2619 Coves Haven,Maine,Indianapolis,Chad,(181) 438-5899
4,MERCHANT116722,2023-02-28 20:35:34,Maponics,979 North Rue Borough,Wyoming,Boise,Guatemala,(494) 319-8223


## Convert to parquet


In [541]:
df_merchant_data[0].to_parquet(
    'Enterprise Department/merchant_data.parquet', index=False)

## Order with Merchant 1


In [542]:
df_merchant_1 = pd.read_parquet(
    'Enterprise Department/order_with_merchant_data1.parquet')


print(df_merchant_1)


print(df_merchant_1.columns.ravel())


df_merchant_1.isnull().sum()

                                   order_id    merchant_id      staff_id
0      eac330c0-457a-4faa-b15a-52a3c440e7f3  MERCHANT58557  STAFF0027757
1      e1beaf61-e687-4e70-bdd4-3ea338139a0b   MERCHANT0605  STAFF0039068
2      0612c246-57f1-40e8-9993-0f8d41992049  MERCHANT22282  STAFF0058495
3      a800f0d9-47d8-455b-b096-622e76156705  MERCHANT39307  STAFF0038632
4      b4c411de-2fd3-4806-91ae-165edc9baa12  MERCHANT26962  STAFF0035568
...                                     ...            ...           ...
99995  a72f9936-fea8-427c-869b-a9587e0dfebe  MERCHANT11270  STAFF0016364
99996  d55e57f3-0543-4725-8b2c-007958b7b858  MERCHANT54423  STAFF0043207
99997  758e9edc-b790-47c5-b785-426919339ef1  MERCHANT55289  STAFF0054414
99998  f662ffdf-32ae-4a5f-a326-de658db9eb5d  MERCHANT39525  STAFF0064079
99999  8ed751f5-80ae-454d-bac6-2e98c138cb65   MERCHANT7346  STAFF0013034

[100000 rows x 3 columns]
Index(['order_id', 'merchant_id', 'staff_id'], dtype='object')


order_id       0
merchant_id    0
staff_id       0
dtype: int64

## Check for duplicates


Check for duplicate order id


In [543]:
is_duplicate_order_id_df_merchant_1 = df_merchant_1['order_id'].duplicated(
).any()
is_duplicate_merchant_id_df_merchant_1 = df_merchant_1['merchant_id'].duplicated(
).any()
is_duplicate_staff_id_df_merchant_1 = df_merchant_1['staff_id'].duplicated(
).any()

if is_duplicate_order_id_df_merchant_1:
    print("There are duplicate order IDs.")
else:
    print("All order IDs are unique.")

if is_duplicate_merchant_id_df_merchant_1:
    print("There are duplicate merchant IDs.")
else:
    print("All merchant IDs are unique.")

if is_duplicate_staff_id_df_merchant_1:
    print("There are duplicate staff IDs.")
else:
    print("All staff IDs are unique.")

All order IDs are unique.
There are duplicate merchant IDs.
There are duplicate staff IDs.


Check for duplicate rows with same values of merchant id and staff id


In [544]:
duplicated_rows_df_merchant_1 = df_merchant_1[df_merchant_1.duplicated(
    subset=['merchant_id', 'staff_id'], keep=False)]
sorted_duplicated_rows_df_merchant_1 = duplicated_rows_df_merchant_1.sort_values(
    by=['merchant_id', 'staff_id'])

if not sorted_duplicated_rows_df_merchant_1.empty:
    print("Duplicate rows with the same merchant_id and staff_id:")
    print(sorted_duplicated_rows_df_merchant_1)
else:
    print("No duplicate rows with the same merchant_id and staff_id.")

Duplicate rows with the same merchant_id and staff_id:
                                   order_id   merchant_id      staff_id
35045  c7c3c805-61fc-4170-8688-04b6cd17e69d  MERCHANT0580  STAFF0014191
57300  d773eb3f-cbbd-4a65-8d27-ad4fbc2f51e1  MERCHANT0580  STAFF0014191
35065  32930bd1-ab70-4e61-96fc-742ca06eadea  MERCHANT0764  STAFF0019878
81964  383fec97-f1de-433d-95e2-a67eff301f4c  MERCHANT0764  STAFF0019878
6213   aab98e7d-3e53-44c6-90df-e0d54221164e  MERCHANT1049  STAFF0007004
...                                     ...           ...           ...
64266  6c9c1788-5e92-479e-85ec-289ed5970b5b  MERCHANT8891  STAFF0012294
48930  14a88877-497f-445e-9268-5d2c88e5227e  MERCHANT9083  STAFF0037538
85402  34d1453a-cd64-4bf2-8b11-269a880fd5e1  MERCHANT9083  STAFF0037538
11323  88905c78-15fb-4e19-83bc-a0982352e348  MERCHANT9890  STAFF0030411
38999  99ed2b2f-c3ac-4a06-8036-9ffc58d81aa9  MERCHANT9890  STAFF0030411

[452 rows x 3 columns]


## Convert to parquet


In [545]:
df_merchant_1.to_parquet(
    'Enterprise Department/order_merchant_data1.parquet', index=False)

## Orders with Merchant 2


In [546]:
df_merchant_2 = pd.read_parquet(
    'Enterprise Department/order_with_merchant_data2.parquet')


print(df_merchant_2)


print(df_merchant_2.columns.ravel())


df_merchant_2.isnull().sum()

                                    order_id    merchant_id      staff_id
100000  848f9cad-c4d3-4822-83e9-322ae73261c3   MERCHANT9789  STAFF0020354
100001  be97922b-20fb-4244-9e25-bac98b209668  MERCHANT43424  STAFF0061355
100002  dc905240-5fcd-45e4-8077-be831e1f0263  MERCHANT23900  STAFF0038795
100003  11cd8029-61f2-4bc8-a148-cb1a4afd8b57  MERCHANT58797  STAFF0023844
100004  69c9e6fc-326e-4f2d-98c6-5360b91cfb84  MERCHANT41587  STAFF0022537
...                                      ...            ...           ...
299995  86f5d57b-a787-43ed-8b2c-f4f9a52f5033  MERCHANT15934  STAFF0009370
299996  7a57393e-7c8f-4d9c-ba3c-4711e739fd6e   MERCHANT0688  STAFF0022143
299997  5c442281-170a-4344-8d0f-3c31f3f02b25  MERCHANT58009  STAFF0018140
299998  aecebd35-355e-4f87-acc7-bcca013dceed  MERCHANT33284  STAFF0023079
299999  a2498394-9e81-4416-b0ea-443e584e77cc   MERCHANT1615  STAFF0008378

[200000 rows x 3 columns]
Index(['order_id', 'merchant_id', 'staff_id'], dtype='object')


order_id       0
merchant_id    0
staff_id       0
dtype: int64

## Check for duplicates


In [547]:
is_duplicate_order_id_df_merchant_2 = df_merchant_2['order_id'].duplicated(
).any()
is_duplicate_merchant_id_df_merchant_2 = df_merchant_2['merchant_id'].duplicated(
).any()
is_duplicate_staff_id_df_merchant_2 = df_merchant_2['staff_id'].duplicated(
).any()

if is_duplicate_order_id_df_merchant_2:
    print("There are duplicate order IDs.")
else:
    print("All order IDs are unique.")

if is_duplicate_merchant_id_df_merchant_2:
    print("There are duplicate merchant IDs.")
else:
    print("All merchant IDs are unique.")

if is_duplicate_staff_id_df_merchant_2:
    print("There are duplicate staff IDs.")
else:
    print("All staff IDs are unique.")

All order IDs are unique.
There are duplicate merchant IDs.
There are duplicate staff IDs.


Check for duplicate rows with same merchant id and staff id


In [548]:
duplicated_rows_df_merchant_2 = df_merchant_2[df_merchant_2.duplicated(
    subset=['merchant_id', 'staff_id'], keep=False)]
sorted_duplicated_rows_df_merchant_2 = duplicated_rows_df_merchant_2.sort_values(
    by=['merchant_id', 'staff_id'])

if not sorted_duplicated_rows_df_merchant_2.empty:
    print("Duplicate rows with the same merchant_id and staff_id:")
    print(sorted_duplicated_rows_df_merchant_2)
else:
    print("No duplicate rows with the same merchant_id and staff_id.")

Duplicate rows with the same merchant_id and staff_id:
                                    order_id   merchant_id      staff_id
139968  0a9393ea-b6bd-4373-b003-20fa6ca8d0d8  MERCHANT0038  STAFF0023277
165713  a25d2a2c-fe23-4690-8f3c-1eaad15ab5e9  MERCHANT0038  STAFF0023277
127500  2943bf5d-7688-4753-9a5d-80e1b58b15f6  MERCHANT0082  STAFF0022587
220491  c53387a5-0346-4bd3-8954-6a7d85f132c6  MERCHANT0082  STAFF0022587
199954  0d34d1c1-f8b2-4107-8743-f37291926ff2  MERCHANT0247  STAFF0060103
...                                      ...           ...           ...
266664  22f64c18-6a03-4e3b-8ef7-f5058d09dddb  MERCHANT9806  STAFF0052801
217938  591708ed-17ac-41da-8902-04c49b9e7ccb  MERCHANT9930  STAFF0003703
236799  cbd237ce-5fcd-4099-a576-051d7d1a32e7  MERCHANT9930  STAFF0003703
130843  9c556823-97f0-4fe6-8102-bb6b18f160b2  MERCHANT9980  STAFF0060515
231993  46d02535-fe2e-45c6-b086-085b5027cf83  MERCHANT9980  STAFF0060515

[1866 rows x 3 columns]


## Convert to parquet


In [549]:
df_merchant_2.to_parquet(
    'Enterprise Department/order_merchant_data2.parquet', index=False)

## Order with Merchant 3


In [550]:
df_merchant_3 = pd.read_csv(
    'Enterprise Department/order_with_merchant_data3.csv')


print(df_merchant_3)


print(df_merchant_3.columns.ravel())


df_merchant_3.isnull().sum()

        Unnamed: 0                              order_id    merchant_id  \
0           300000  e8bafe72-d506-4437-b357-f38682353238  MERCHANT13004   
1           300001  eae5e846-c16b-4959-9f90-331c355951ff  MERCHANT17176   
2           300002  913f901b-d488-49be-bcaf-86fcd581a528   MERCHANT0855   
3           300003  8479d795-6c4d-41d0-a0c7-4ca9c3c3ca31  MERCHANT40576   
4           300004  2aaae5b3-c858-4d05-b92f-cd2085201c66  MERCHANT32289   
...            ...                                   ...            ...   
199995      499995  ba506489-1505-481d-b0b1-8cdbf04a515f  MERCHANT64706   
199996      499996  551a54f1-1a7f-4a1d-a7c7-62afb433b4f4  MERCHANT12240   
199997      499997  2b2e775f-d6cd-4659-9a88-b4950bc326c0  MERCHANT39339   
199998      499998  949eb760-8022-4f0b-9e6c-3ddb049a25fd   MERCHANT1546   
199999      499999  4d1c5613-a48e-4b18-b016-bb639a1d5490  MERCHANT10900   

            staff_id  
0       STAFF0009879  
1       STAFF0045676  
2       STAFF0031852  
3      

Unnamed: 0     0
order_id       0
merchant_id    0
staff_id       0
dtype: int64

## Check for duplicates


In [551]:
is_duplicate_order_id_df_merchant_3 = df_merchant_3['order_id'].duplicated(
).any()
is_duplicate_merchant_id_df_merchant_3 = df_merchant_3['merchant_id'].duplicated(
).any()
is_duplicate_staff_id_df_merchant_3 = df_merchant_3['staff_id'].duplicated(
).any()

if is_duplicate_order_id_df_merchant_3:
    print("There are duplicate order IDs.")
else:
    print("All order IDs are unique.")

if is_duplicate_merchant_id_df_merchant_3:
    print("There are duplicate merchant IDs.")
else:
    print("All merchant IDs are unique.")

if is_duplicate_staff_id_df_merchant_3:
    print("There are duplicate staff IDs.")
else:
    print("All staff IDs are unique.")

All order IDs are unique.
There are duplicate merchant IDs.
There are duplicate staff IDs.


Check for duplicate rows with same merchant IDs and staff IDs


In [552]:
duplicated_rows_df_merchant_3 = df_merchant_3[df_merchant_3.duplicated(
    subset=['merchant_id', 'staff_id'], keep=False)]
sorted_duplicated_rows_df_merchant_3 = duplicated_rows_df_merchant_3.sort_values(
    by=['merchant_id', 'staff_id'])

if not sorted_duplicated_rows_df_merchant_3.empty:
    print("Duplicate rows with the same merchant_id and staff_id:")
    print(sorted_duplicated_rows_df_merchant_3)
else:
    print("No duplicate rows with the same merchant_id and staff_id.")

Duplicate rows with the same merchant_id and staff_id:
        Unnamed: 0                              order_id   merchant_id  \
144429      444429  e2410e9d-c7ab-461d-a9aa-e2529bc7a2b2  MERCHANT0076   
172642      472642  73b32885-c894-4613-ab5d-d1f7f0c596ed  MERCHANT0076   
45807       345807  8ec9afcb-9072-4611-bacd-f7d5dfeabb0b  MERCHANT0261   
77156       377156  f29541fd-5c3e-4046-b30e-8354c1baf53b  MERCHANT0261   
101639      401639  94b04efb-1761-4ecd-aff0-bdfbb69e3109  MERCHANT0444   
...            ...                                   ...           ...   
155445      455445  4d6916cd-073b-478f-84ae-fc04b42a8541  MERCHANT9742   
9890        309890  db557420-a023-4904-9bde-78aca1b3f3fe  MERCHANT9821   
172333      472333  84e7bf84-d02a-41df-8890-82a02bdd6346  MERCHANT9821   
15071       315071  a89d925a-b8ca-4d31-94e1-08fa50f15606  MERCHANT9980   
57917       357917  b3ce8cbb-8542-46cc-8d78-16143b5a69f8  MERCHANT9980   

            staff_id  
144429  STAFF0006893  
172642  ST

## convert to parquet


In [553]:
df_merchant_3.to_parquet(
    'Enterprise Department/order_merchant_data3.parquet', index=False)

## Staff Data


In [554]:
df_staff_data = pd.read_html('Enterprise Department/staff_data.html')
df_staff_data[0].head()

Unnamed: 0.1,Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date
0,0,STAFF0009650,Randall Bergstrom,intermediate,376 Land chester,Texas,Omaha,Cook Islands,(138)548-8481,2020-09-04 02:33:28
1,1,STAFF0039964,Christian Hessel,intermediate,945 West Camp shire,New Mexico,San Diego,Pakistan,393-164-5574,2020-08-08 06:50:47
2,2,STAFF0044932,Edgardo Fadel,entry,997 Expressway town,Rhode Island,Corpus Christi,Albania,328.133.8850,2020-02-10 16:49:18
3,3,STAFF0015819,Jordi Gleichner,entry,720 Centers burgh,Virginia,Bakersfield,Timor-Leste,649.258.8115,2021-06-11 11:30:29
4,4,STAFF0036616,Price Hintz,intermediate,1720 North Skyway burgh,Alabama,Scottsdale,"Palestine, State of",977-698-2305,2020-08-10 10:37:55


In [555]:
df_staff_data[0] = df_staff_data[0].drop(columns=['Unnamed: 0'])

df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date
0,STAFF0009650,Randall Bergstrom,intermediate,376 Land chester,Texas,Omaha,Cook Islands,(138)548-8481,2020-09-04 02:33:28
1,STAFF0039964,Christian Hessel,intermediate,945 West Camp shire,New Mexico,San Diego,Pakistan,393-164-5574,2020-08-08 06:50:47
2,STAFF0044932,Edgardo Fadel,entry,997 Expressway town,Rhode Island,Corpus Christi,Albania,328.133.8850,2020-02-10 16:49:18
3,STAFF0015819,Jordi Gleichner,entry,720 Centers burgh,Virginia,Bakersfield,Timor-Leste,649.258.8115,2021-06-11 11:30:29
4,STAFF0036616,Price Hintz,intermediate,1720 North Skyway burgh,Alabama,Scottsdale,"Palestine, State of",977-698-2305,2020-08-10 10:37:55


## Check unique values of job level


In [556]:
for i, df in enumerate(df_staff_data):
    unique_job_levels = df['job_level'].unique()
    print(f"Unique Job Levels in DataFrame {i}: {unique_job_levels}")

Unique Job Levels in DataFrame 0: ['intermediate' 'entry' 'senior']


## Convert name, job level, street, state, city, and country to title case


In [557]:
columns_to_title_case = ['name', 'job_level',
                         'street', 'state', 'city', 'country']

for column in columns_to_title_case:
    df_staff_data[0][column] = df_staff_data[0][column].str.title()

df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date
0,STAFF0009650,Randall Bergstrom,Intermediate,376 Land Chester,Texas,Omaha,Cook Islands,(138)548-8481,2020-09-04 02:33:28
1,STAFF0039964,Christian Hessel,Intermediate,945 West Camp Shire,New Mexico,San Diego,Pakistan,393-164-5574,2020-08-08 06:50:47
2,STAFF0044932,Edgardo Fadel,Entry,997 Expressway Town,Rhode Island,Corpus Christi,Albania,328.133.8850,2020-02-10 16:49:18
3,STAFF0015819,Jordi Gleichner,Entry,720 Centers Burgh,Virginia,Bakersfield,Timor-Leste,649.258.8115,2021-06-11 11:30:29
4,STAFF0036616,Price Hintz,Intermediate,1720 North Skyway Burgh,Alabama,Scottsdale,"Palestine, State Of",977-698-2305,2020-08-10 10:37:55


## Fix contact number values


In [558]:
df_staff_data[0]['contact_number'] = df_staff_data[0]['contact_number'].str.replace(
    r'\D', '', regex=True)

df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date
0,STAFF0009650,Randall Bergstrom,Intermediate,376 Land Chester,Texas,Omaha,Cook Islands,1385488481,2020-09-04 02:33:28
1,STAFF0039964,Christian Hessel,Intermediate,945 West Camp Shire,New Mexico,San Diego,Pakistan,3931645574,2020-08-08 06:50:47
2,STAFF0044932,Edgardo Fadel,Entry,997 Expressway Town,Rhode Island,Corpus Christi,Albania,3281338850,2020-02-10 16:49:18
3,STAFF0015819,Jordi Gleichner,Entry,720 Centers Burgh,Virginia,Bakersfield,Timor-Leste,6492588115,2021-06-11 11:30:29
4,STAFF0036616,Price Hintz,Intermediate,1720 North Skyway Burgh,Alabama,Scottsdale,"Palestine, State Of",9776982305,2020-08-10 10:37:55


Check for contact number that is not 10 digits


In [559]:
df_staff_data[0]['contact_number_length'] = df_staff_data[0]['contact_number'].apply(
    len)
rows_with_invalid_contact_numbers = df_staff_data[0][df_staff_data[0]
                                                     ['contact_number_length'] != 10]

rows_with_invalid_contact_numbers.head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date,contact_number_length
9,STAFF0045925,Kiley Monahan,Intermediate,2222 Land Bury,South Dakota,Indianapolis,Mongolia,11585393751,2020-06-07 19:11:30,11
19,STAFF0056603,Sedrick Walter,Senior,8293 New Mall Borough,Nevada,Toledo,American Samoa,18532236187,2022-05-17 08:02:26,11
20,STAFF0048480,Willa Dicki,Entry,99372 East Dam Chester,Minnesota,Scottsdale,Djibouti,12722015653,2020-12-06 18:07:33,11
21,STAFF0062223,Citlalli Runolfsson,Intermediate,97572 Port Creek Town,Wisconsin,Norfolk,Cabo Verde,15837711895,2021-11-03 03:35:41,11
24,STAFF0014985,Jennings Johnston,Senior,9319 North Lake Bury,Missouri,Toledo,Nigeria,14559975770,2022-04-20 19:55:46,11


Check rows with contact number not 10 digits and start with 1


In [560]:
df_staff_data[0]['contact_number_length'] = df_staff_data[0]['contact_number'].apply(
    len)

invalid_contact_numbers = df_staff_data[0][
    (df_staff_data[0]['contact_number_length'] != 10) &
    (~df_staff_data[0]['contact_number'].str.startswith('1'))
]

invalid_contact_numbers

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date,contact_number_length


Update the values of contact number that are not 10 digits and start with 1


In [561]:
mask = (df_staff_data[0]['contact_number_length'] == 11) & (
    df_staff_data[0]['contact_number'].str.startswith('1'))
df_staff_data[0].loc[mask, 'contact_number'] = df_staff_data[0].loc[mask,
                                                                    'contact_number'].str[1:]

df_staff_data[0]['contact_number_length'] = df_staff_data[0]['contact_number'].apply(
    len)
rows_with_invalid_contact_numbers = df_staff_data[0][df_staff_data[0]
                                                     ['contact_number_length'] != 10]

rows_with_invalid_contact_numbers.head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date,contact_number_length


In [562]:
df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date,contact_number_length
0,STAFF0009650,Randall Bergstrom,Intermediate,376 Land Chester,Texas,Omaha,Cook Islands,1385488481,2020-09-04 02:33:28,10
1,STAFF0039964,Christian Hessel,Intermediate,945 West Camp Shire,New Mexico,San Diego,Pakistan,3931645574,2020-08-08 06:50:47,10
2,STAFF0044932,Edgardo Fadel,Entry,997 Expressway Town,Rhode Island,Corpus Christi,Albania,3281338850,2020-02-10 16:49:18,10
3,STAFF0015819,Jordi Gleichner,Entry,720 Centers Burgh,Virginia,Bakersfield,Timor-Leste,6492588115,2021-06-11 11:30:29,10
4,STAFF0036616,Price Hintz,Intermediate,1720 North Skyway Burgh,Alabama,Scottsdale,"Palestine, State Of",9776982305,2020-08-10 10:37:55,10


Check for invalid creation dates


In [563]:
valid_creation_dates_mask = pd.to_datetime(
    df_staff_data[0]['creation_date'], errors='coerce').notna()
invalid_creation_dates = df_staff_data[0][~valid_creation_dates_mask]

print("Rows with invalid creation dates:")
print(invalid_creation_dates)

Rows with invalid creation dates:
Empty DataFrame
Columns: [staff_id, name, job_level, street, state, city, country, contact_number, creation_date, contact_number_length]
Index: []


Check for dupplicate rows with different staff IDs


In [564]:
duplicate_rows = df_staff_data[0][df_staff_data[0].duplicated(
    subset='staff_id', keep=False)]
duplicate_rows_sorted = duplicate_rows.sort_values(by='staff_id')

duplicate_rows_sorted.head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date,contact_number_length
876,STAFF0000360,Edna Keebler,Entry,52633 South Freeway Mouth,Pennsylvania,Albuquerque,Martinique,4454991194,2022-04-18 14:24:50,10
2835,STAFF0000360,Abbey Strosin,Senior,310 Union Chester,Kansas,St. Louis,Kuwait,5744118925,2022-06-19 21:04:05,10
2424,STAFF0000450,Shirley Lakin,Entry,288 Courts Mouth,Delaware,Kansas,Nigeria,6909001899,2023-04-24 23:16:58,10
4920,STAFF0000450,Sierra Powlowski,Entry,24810 West Drive Port,Wisconsin,Reno,Nigeria,4416376029,2023-04-25 06:40:50,10
2099,STAFF0000746,Retha Kuhlman,Entry,6120 Port Turnpike Stad,Oregon,Miami,Botswana,5616076647,2021-08-08 23:11:06,10


In [565]:
df_staff_data[0]['staff_id'] = df_staff_data[0]['staff_id'].str.replace(
    'STAFF', '').astype(int)
df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date,contact_number_length
0,9650,Randall Bergstrom,Intermediate,376 Land Chester,Texas,Omaha,Cook Islands,1385488481,2020-09-04 02:33:28,10
1,39964,Christian Hessel,Intermediate,945 West Camp Shire,New Mexico,San Diego,Pakistan,3931645574,2020-08-08 06:50:47,10
2,44932,Edgardo Fadel,Entry,997 Expressway Town,Rhode Island,Corpus Christi,Albania,3281338850,2020-02-10 16:49:18,10
3,15819,Jordi Gleichner,Entry,720 Centers Burgh,Virginia,Bakersfield,Timor-Leste,6492588115,2021-06-11 11:30:29,10
4,36616,Price Hintz,Intermediate,1720 North Skyway Burgh,Alabama,Scottsdale,"Palestine, State Of",9776982305,2020-08-10 10:37:55,10


In [566]:
df_staff_data[0]['staff_id'] = df_staff_data[0].groupby('staff_id').cumcount().add(
    1).astype(str).radd('STAFF') + df_staff_data[0]['staff_id'].astype(str)

duplicate_check = df_staff_data[0].duplicated('staff_id', keep=False)

if duplicate_check.any():
    print("There are still duplicate staff IDs.")
else:
    print("There are no duplicate staff IDs.")

There are no duplicate staff IDs.


Remove contact number length column


In [567]:
df_staff_data[0] = df_staff_data[0].drop(columns=['contact_number_length'])
df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date
0,STAFF19650,Randall Bergstrom,Intermediate,376 Land Chester,Texas,Omaha,Cook Islands,1385488481,2020-09-04 02:33:28
1,STAFF139964,Christian Hessel,Intermediate,945 West Camp Shire,New Mexico,San Diego,Pakistan,3931645574,2020-08-08 06:50:47
2,STAFF144932,Edgardo Fadel,Entry,997 Expressway Town,Rhode Island,Corpus Christi,Albania,3281338850,2020-02-10 16:49:18
3,STAFF115819,Jordi Gleichner,Entry,720 Centers Burgh,Virginia,Bakersfield,Timor-Leste,6492588115,2021-06-11 11:30:29
4,STAFF136616,Price Hintz,Intermediate,1720 North Skyway Burgh,Alabama,Scottsdale,"Palestine, State Of",9776982305,2020-08-10 10:37:55


Update format of contact number


In [568]:
def format_contact_number(number):
    return f"({number[:3]}) {number[3:6]}-{number[6:]}"


df_staff_data[0]['contact_number'] = df_staff_data[0]['contact_number'].apply(
    format_contact_number)

df_staff_data[0].head()

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date
0,STAFF19650,Randall Bergstrom,Intermediate,376 Land Chester,Texas,Omaha,Cook Islands,(138) 548-8481,2020-09-04 02:33:28
1,STAFF139964,Christian Hessel,Intermediate,945 West Camp Shire,New Mexico,San Diego,Pakistan,(393) 164-5574,2020-08-08 06:50:47
2,STAFF144932,Edgardo Fadel,Entry,997 Expressway Town,Rhode Island,Corpus Christi,Albania,(328) 133-8850,2020-02-10 16:49:18
3,STAFF115819,Jordi Gleichner,Entry,720 Centers Burgh,Virginia,Bakersfield,Timor-Leste,(649) 258-8115,2021-06-11 11:30:29
4,STAFF136616,Price Hintz,Intermediate,1720 North Skyway Burgh,Alabama,Scottsdale,"Palestine, State Of",(977) 698-2305,2020-08-10 10:37:55


## Check for duplicate staff IDs


In [569]:
duplicate_staff_data_id = df_staff_data[0][df_staff_data[0].duplicated(
    subset=['staff_id'], keep=False)]

duplicate_staff_data_id

Unnamed: 0,staff_id,name,job_level,street,state,city,country,contact_number,creation_date


## Check for nulls


In [570]:
df_staff_data[0].isnull().sum()

staff_id          0
name              0
job_level         0
street            0
state             0
city              0
country           0
contact_number    0
creation_date     0
dtype: int64

## Convert to parquet


In [571]:
df_staff_data[0].to_parquet(
    'Enterprise Department/staff_data.parquet', index=False)