In [1]:
# import dependencies
import numpy as np
import pandas as pd

# import data
customers_df = pd.read_csv("../data/customers.csv")
vendors_df = pd.read_csv("../data/vendors.csv")
locations_df = pd.read_csv("../data/locations.csv")
orders_df = pd.read_csv("../data/orders.csv")


# data cleaning ###
# ------------- ###


# remove duplicate customer observations
customers_df = customers_df.drop_duplicates(subset=['akeed_customer_id'], keep='first')

# drop vendor columns specified in the assessment brief
vendors_df = vendors_df.drop([
    'sunday_from_time1', 'sunday_to_time1', 'sunday_from_time2', 'sunday_to_time2', 'monday_from_time1',
    'monday_to_time1', 'monday_from_time2', 'monday_to_time2', 'tuesday_from_time1', 'tuesday_to_time1',
    'tuesday_from_time2', 'tuesday_to_time2', 'wednesday_from_time1', 'wednesday_to_time1', 'wednesday_from_time2',
    'wednesday_to_time2', 'thursday_from_time1', 'thursday_to_time1', 'thursday_from_time2', 'thursday_to_time2',
    'friday_from_time1', 'friday_to_time1', 'friday_from_time2', 'friday_to_time2', 'saturday_from_time1',
    'saturday_to_time1', 'saturday_from_time2', 'saturday_to_time2'
], axis=1)

# drop orders with empty 'customer_id' fields
orders_df['customer_id'].replace('', np.nan, inplace=True)
orders_df = orders_df.dropna(subset=['customer_id'])

# drop orders with empty 'item_count' fields
orders_df['item_count'].replace('', np.nan, inplace=True)
orders_df = orders_df.dropna(subset=['item_count'])

# rename legacy ID columns
customers_df = customers_df.rename(columns={'akeed_customer_id': 'legacy_customer_id'})
vendors_df = vendors_df.rename(columns={'id': 'legacy_vendor_id'})
locations_df = locations_df.rename(columns={'customer_id': 'legacy_customer_id'})
orders_df = orders_df.rename(columns={
    'akeed_order_id': 'legacy_order_id',
    'customer_id': 'legacy_customer_id',
    'vendor_id': 'legacy_vendor_id'
})


# create a new 'customer_id' indexing system ###
# ------------------------------------------ ###


# use pandas index as the new 'customer_id'
customers_df['customer_id'] = customers_df.index

# create a dict of 'legacy_customer_id' and corresponding 'customer_id'
cids = customers_df.set_index('legacy_customer_id')['customer_id'].to_dict()

# copy new 'customer_id' values to 'locations_df' and 'orders_df'
locations_df['customer_id'] = locations_df['legacy_customer_id'].map(cids)
orders_df['customer_id'] = orders_df['legacy_customer_id'].map(cids)

# drop 'legacy_customer_id' from 'customers_df', 'locations_df', and 'orders_df'
customers_df = customers_df.drop(columns=['legacy_customer_id'], axis=1)
locations_df = locations_df.drop(columns=['legacy_customer_id'], axis=1)
orders_df = orders_df.drop(columns=['legacy_customer_id'], axis=1)

# drop locations and orders which don't correspond to a valid customer
locations_df = locations_df.dropna(subset=['customer_id'])
orders_df = orders_df.dropna(subset=['customer_id'])


# create a new 'vendor_id' indexing system ###
# ---------------------------------------- ###


# use pandas index as the new 'vendor_id'
vendors_df['vendor_id'] = vendors_df.index

# create a dict of 'legacy_vendor_id' and corresponding 'vendor_id'
vids = vendors_df.set_index('legacy_vendor_id')['vendor_id'].to_dict()

# copy new 'vendor_id' values to 'orders_df'
orders_df['vendor_id'] = orders_df['legacy_vendor_id'].map(vids)

# drop 'legacy_vendor_id' from 'vendors_df' and 'orders_df'
vendors_df = vendors_df.drop(columns=['legacy_vendor_id'], axis=1)
orders_df = orders_df.drop(columns=['legacy_vendor_id'], axis=1)


# create a new 'order_id' indexing system ###
# --------------------------------------- ###


# use pandas index as the new 'order_id'
orders_df['order_id'] = orders_df.index

# drop 'legacy_order_id' from 'orders_df'
orders_df = orders_df.drop(columns=['legacy_order_id'], axis=1)


# create a new 'location_id' indexing system ###
# ------------------------------------------ ###


# use pandas index as the new 'location_id'
locations_df['location_id'] = locations_df.index


# match the new 'location_id' in 'orders_df'
def update_location_id(cid, loc_num):
    return locations_df.loc[
        (locations_df['customer_id'] == cid) & (locations_df['location_number'] == loc_num)]['location_id'].values[0]


orders_df['location_id'] = [update_location_id(*a) for a in tuple(
    zip(orders_df['customer_id'], orders_df['LOCATION_NUMBER']))]  # this executes faster than .apply()


# drop 'location_number' from 'locations_df'
locations_df = locations_df.drop(columns=['location_number'])

# drop 'LOCATION_NUMBER' and 'LOCATION_TYPE' from 'orders_df'
orders_df = orders_df.drop(columns=['LOCATION_NUMBER', 'LOCATION_TYPE'])


# extract vendor category information ###
# ----------------------------------- ###


# extract 'vendor_id' and 'vendor_category_en' from 'vendors_df'
vendor_cats_df = vendors_df[['vendor_id', 'vendor_category_en']].copy().rename(
    columns={'vendor_category_en': 'category'}
)

# drop 'vendor_category_en' and 'vendor_category_id' from 'vendors_df'
vendors_df = vendors_df.drop(['vendor_category_en', 'vendor_category_id'], axis=1)


# extract vendor tags information ###
# ------------------------------- ###


# extract 'vendor_id' and 'vendor_tag_name' from 'vendors_df'
vendor_tags_df = vendors_df[['vendor_id', 'vendor_tag_name']].copy().rename(
    columns={'vendor_tag_name': 'tag'}
)

# split tags into a list and explode that list into a new row for each item
vendor_tags_df['tag'] = vendor_tags_df['tag'].str.split(',').tolist()
vendor_tags_df = vendor_tags_df.explode('tag').reset_index(drop=True)

# drop 'primary_tags', 'vendor_tag', and 'vendor_tag_name' from 'vendors_df'
vendors_df = vendors_df.drop(['primary_tags', 'vendor_tag', 'vendor_tag_name'], axis=1)


# extract customer favorite vendors information ###
# --------------------------------------------- ###


# extract 'customer_id' and 'vendor_id' from 'orders_df' where 'is_favorite' == 'Yes'
customer_fav_vendors_df = orders_df[orders_df['is_favorite'] == 'Yes'][
    ['customer_id', 'vendor_id']
].copy().reset_index(drop=True)

# drop 'is_favorite' from 'orders_df'
orders_df = orders_df.drop(['is_favorite'], axis=1)

In [2]:
orders_df.head()

Unnamed: 0,item_count,grand_total,payment_mode,promo_code,vendor_discount_amount,promo_code_discount_percentage,is_rated,vendor_rating,driver_rating,deliverydistance,...,ready_for_pickup_time,picked_up_time,delivered_time,delivery_date,created_at,CID X LOC_NUM X VENDOR,customer_id,vendor_id,order_id,location_id
0,1.0,7.6,2,,0.0,,No,,0.0,0.0,...,,,,2019-07-31 05:30:00,2019-08-01 05:30:16,92PEE24 X 0 X 105,4413.0,24,0,2764
1,1.0,8.7,1,,0.0,,No,,0.0,0.0,...,,,,2019-07-31 05:30:00,2019-08-01 05:31:10,QS68UD8 X 0 X 294,16594.0,65,1,59196
2,2.0,14.4,1,,0.0,,No,,0.0,0.0,...,,,,2019-07-31 05:30:00,2019-08-01 05:31:33,MB7VY5F X 0 X 83,14710.0,17,2,9783
3,1.0,7.1,1,,0.0,,No,,0.0,0.0,...,,,,2019-07-31 05:30:00,2019-08-01 05:34:54,KDJ951Y X 0 X 90,7574.0,21,3,4309
4,4.0,27.2,1,,0.0,,No,,0.0,0.0,...,,,,2019-07-31 05:30:00,2019-08-01 05:35:51,BAL0RVT X 0 X 83,2782.0,17,4,32864


In [8]:
# extract 'promo_code' and 'promo_code_discount_percentage' from 'orders_df'
promo_codes_df = orders_df[['promo_code', 'promo_code_discount_percentage']].copy().rename(
    columns={'promo_code': 'code', 'promo_code_discount_percentage': 'discount_percentage'}
)

# convert all codes to lowercase
promo_codes_df['code'] = promo_codes_df['code'].str.lower()


# promo_codes_df = orders_df[['promo_code', 'promo_code_discount_percentage']].copy().rename(
#     columns={'promo_code': 'code', 'promo_code_discount_percentage': 'discount_percentage'}
# ).dropna(subset=['code']).drop_duplicates(subset=['code'], keep='first')

for code in promo_codes_df['code']:
    print(code)

akeedPDO
akeedpdo
AkeedPDO
Akeed157081
burgerhood 
burgerhood
Akeed163024
akeedsohar
Burgerhood
Akeed162691
burgerHOOD
akeed166497
akeed166498
Akeed161962
akeed165202
Akeed166493
akeed166487
akeed010d
akeedC121
akeedA629
akeed3e13 
Akeed169332
Akeed169125
akeed010A
SF3551
akeed167729
Akeed170998
Akeed162378
SF736
graffitiburger
graffitiburger 
Graffitiburger
Akeed173628
sf5125
Sf9186
Akeed173827
Akeed175866
Akeed176666
Akeed176586
7akeed3e952
7akeed07579
7akeedAE983
7akeed0F959
7akeedEC8E5
7akeed92B90
7akeed633BF
7akeedBE48D
Freedelivery02
akeedPdo
7akeedE9D54
7akeed83FB7
Akeed177762
7akeedC2503
Akeed177532
7akeed0BFF2
7akeed42150
7akeed7f236
w27c841
w2AC790
w239257
w2E942E
w20A3D3
w2c525c
w2B179E
w2759AE
w212274
7akeedBAA5F
w22507A
w294B55
w232A45
w2CC3D3
w2F1270
7akeed56B95
Akeed179400
7akeed6D227
akeed179060
w276366
7akeed24A37
w2FBBD3
w248EA7
w2BAD75
akeed179978
Akeed1756500
w237E5C
FreeDelivery
welcome2akeed
7akeedDA8A7
welcome2akeed 
w2AAAC5
Akeed181036
freedelivery
 
w2C428F
w2F