# eBay ML competition 2021 - Preprocessing

In this notebook we preprocess the data from the 2021 eBay ML competition. Preprocessing includes date transformations, one-hot encoding, feature extraction and other methods. Because the data provided is not public, a new reduced dataset was created and is used here.

In [4]:
import os
import pandas as pd
import datetime as dt
import time
import numpy as np
import matplotlib.pyplot as plt
from uszipcode import SearchEngine
from uszipcode import Zipcode
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from pandas.tseries.offsets import CustomBusinessDay
from datetime import datetime, date, time

ImportError: cannot import name 'Zipcode' from 'uszipcode' (C:\Users\nicol\Anaconda3\envs\tf-gpu\lib\site-packages\uszipcode\__init__.py)

In [None]:
# Work from inside the dataset folder: the input file below and the CSV exports
# written by later cells all use paths relative to the working directory.
DATASET_DIR = 'eBay_ML_Challenge_Dataset_2021'

# Only change directory when we are not already there, so that re-running this
# cell does not fail looking for a nested folder of the same name.
if os.path.basename(os.getcwd()) != DATASET_DIR:
    os.chdir(DATASET_DIR)

# Takes 1 min to load if all rows are enabled.
# The context manager closes the file handle as soon as pandas has finished reading.
with open("train reduced fake.csv", encoding='utf-8') as tsv_file:
    df = pd.read_csv(tsv_file)

In [None]:
# Obtains state data for all the zipcodes
search = SearchEngine()

# Lookup results keyed by (type, value): zip codes repeat many times, and keying
# on the type as well keeps e.g. 97219 and 97219.0 from sharing an entry.
_state_by_zip = {}


def _lookup_state(zipc):
    """Return the state abbreviation for ``zipc``.

    Returns the string '-1' when the object returned by ``search.by_zipcode``
    has no ``state_abbr`` attribute (for example when it is ``None``), i.e. the
    zipcode was not found.
    """
    key = (type(zipc), zipc)
    if key not in _state_by_zip:
        zipcode = search.by_zipcode(zipc)
        # Catch zipcodes that were not found. Only the missing-attribute case is
        # handled; any other error is allowed to surface instead of being hidden.
        try:
            _state_by_zip[key] = zipcode.state_abbr
        except AttributeError:
            _state_by_zip[key] = '-1'
    return _state_by_zip[key]


buyer_state = [_lookup_state(zipc) for zipc in df['buyer_zip'].tolist()]
item_state = [_lookup_state(zipc) for zipc in df['item_zip'].tolist()]

In [None]:
# Attach the looked-up states to the dataframe and store them as strings.
for state_column, state_values in (('buyer_state', buyer_state),
                                   ('item_state', item_state)):
    df[state_column] = state_values
    df[state_column] = df[state_column].astype(str)

In [None]:
# Put every weight on a single unit, then remove the weight units column.
# NOTE(review): 2.20462 is the kilograms -> pounds factor, so rows with
# weight_units == 2 are presumably recorded in kg and the common unit is lbs
# (the previous comment said "Pounds to kg") - confirm against the dataset spec.
KG_TO_LB = 2.20462

is_unit_2 = df['weight_units'] == 2

# Write the converted values back into df itself. Converting a filtered copy
# (df[mask]) would leave df untouched and the conversion would be lost.
df['weight'] = df['weight'].mask(is_unit_2, df['weight'] * KG_TO_LB)

df.drop(columns = ['weight_units'], inplace = True)

In [None]:
# Delivery time in business days: weekends and US federal holidays are excluded.

# Payments whose hour (as written in payment_datetime) is at or after this value
# on a weekday are treated as made on the next business day.
PAYMENT_CUTOFF_HOUR = 14

dr = pd.date_range(start='2018-01-01', end='2020-12-31')
fed_hol = pd.DataFrame()
fed_hol['Date'] = dr

cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

fed_hol['Holiday'] = fed_hol['Date'].isin(holidays)
# Build the holiday array from the Date column only, so the boolean Holiday
# flag column is not converted to dates along with it. Computed once, outside
# the loop.
hld = fed_hol.loc[fed_hol['Holiday'], 'Date'].values.astype('datetime64[D]')

handling_time = []
total_time_bd = []

for payment_raw, acceptance_raw, delivery_raw in zip(df['payment_datetime'],
                                                     df['acceptance_scan_timestamp'],
                                                     df['delivery_date']):

    # The first 10 characters of a timestamp string are its YYYY-MM-DD date part.
    payment_day = payment_raw[0:10]

    # determine number of days for delivery. Use only business days to find this
    handling_time.append(np.busday_count(payment_day, acceptance_raw[0:10], holidays = hld))
    total_days = np.busday_count(payment_day, delivery_raw, holidays = hld)

    # Parse date AND time (first 19 characters, 'YYYY-MM-DD HH:MM:SS'). Parsing
    # only the date part would always give hour 0 and the cutoff below would
    # never apply.
    # NOTE(review): assumes a zero-padded ISO-like timestamp - confirm on the full data.
    payment_time = datetime.fromisoformat(payment_raw[0:19])

    # if payment is made past cutoff on a weekday, it is considered next business day
    if payment_time.hour >= PAYMENT_CUTOFF_HOUR and payment_time.weekday() < 5:
        total_days = total_days - 1

    total_time_bd.append(total_days)

df['delivery_time_buss_days'] = total_time_bd

In [None]:
# Turn the timestamp strings into timezone-aware (UTC) datetimes, then derive
# an integer "seconds" companion column for each of them.
timestamp_specs = (
    ('acceptance_scan_timestamp', 'acceptance_scan_timestamp seconds', '%Y-%m-%d %H:%M:%S'),
    ('payment_datetime', 'payment_datetime seconds', '%Y-%m-%d %H:%M:%S'),
    ('delivery_date', 'delivery_date seconds', '%Y-%m-%d'),
)

# First pass: parse all three columns in place.
for source_column, _, time_format in timestamp_specs:
    df[source_column] = pd.to_datetime(df[source_column], format = time_format, utc = True)

# Second pass: view each datetime as int64 (nanosecond resolution), scale down
# by 10**9 and truncate to an integer number of seconds.
for source_column, seconds_column, _ in timestamp_specs:
    nanoseconds = df[source_column].values.astype(np.int64)
    df[seconds_column] = (nanoseconds/10**9).astype(int)

# The "true handling time" columns (acceptance scan minus payment, also in
# seconds) are currently disabled and are not created here.

In [None]:
# Impute missing weights. Any weight that is not strictly positive is treated
# as missing and replaced by the median of the positive weights.
has_weight = df['weight'] > 0
known_weights = df.loc[has_weight, 'weight']

# Both statistics are computed; only the median is used for the substitution.
weight_mean = np.mean(known_weights)
weight_median = np.median(known_weights)

df['new_weight'] = np.where(has_weight, df['weight'], weight_median)

In [None]:
# Solving missing declared_handling_days values by substituting with the mean/median. Mean might be better in this case.

# Series.mean()/median() skip missing values, so these are the statistics of
# the rows that do have a declared handling time.
mean = df['declared_handling_days'].mean()
median = df['declared_handling_days'].median()

# Assign the filled result back to the column. Calling fillna(inplace=True) on
# df['...'] acts on a column selection (chained assignment), which is not
# guaranteed to modify df itself.
df['new_declared_handling_days'] = df['declared_handling_days'].fillna(mean)


In [None]:
# Give each numerical column an explicit numeric dtype (they come as strings),
# then export this version of the dataframe: it still holds every column,
# whereas the following cells keep only the desired ones.
column_types = {
    'shipment_method_id': int,
    'shipping_fee': float,
    'carrier_min_estimate': int,
    'carrier_max_estimate': int,
    'category_id': int,
    'item_price': float,
    'quantity': int,
    'new_weight': int,
    'new_declared_handling_days': int,
}

for column_name, target_type in column_types.items():
    df[column_name] = df[column_name].astype(target_type)

df.to_csv('Complete dataset 100 rows.csv', index = False)

In [None]:
# Remove the columns that are not needed any further, then export the result.
unneeded_columns = [
    'item_zip',
    'buyer_zip',
    'weight',
    'declared_handling_days',
    'record_number',
]
df.drop(columns = unneeded_columns, inplace = True)

df.to_csv('No useless columns dataset 100 rows.csv', index = False)

In [None]:
# Remove the original datetime columns, which now have updated versions.
superseded_columns = ['acceptance_scan_timestamp', 'payment_datetime', 'delivery_date']
df.drop(columns = superseded_columns, inplace = True)

In [None]:
# And now we do the One-Hot encodings.

def _one_hot_pm1(values, prefix):
    """One-hot encode ``values`` into integer indicator columns of +1 / -1.

    Column names are produced by ``pd.get_dummies`` with the given ``prefix``.
    A cell holds 1 where the row has that category and -1 otherwise.
    """
    indicators = pd.get_dummies(values, prefix = prefix).astype(int)
    # Map the usual 1/0 indicators to 1/-1.
    return indicators * 2 - 1


# Numeric id columns that are categorical and must be encoded as well.
cat_cols = ['shipment_method_id', 'category_id']

# Every object-dtype column is encoded first, followed by the numeric
# categorical ids (skipping any that were already picked up as object columns).
object_cols = [column for column in df.columns.tolist() if df[column].dtype == 'object']
columns_to_encode = object_cols + [column for column in cat_cols if column not in object_cols]

encoded_frames = []
for column in columns_to_encode:
    print(column)
    encoded_frames.append(_one_hot_pm1(df[column], column))

# Append all indicator columns in one step rather than re-joining (and copying)
# the growing dataframe once per encoded column.
df = pd.concat([df] + encoded_frames, axis = 1)

In [None]:
# Remove the source columns of the one-hot encodings, then export the result.
encoded_object_columns = ['b2c_c2c', 'package_size', 'buyer_state', 'item_state', 'seller_id']

# Two separate drops: first the object-typed columns, then the categorical ids.
for column_group in (encoded_object_columns, cat_cols):
    df.drop(columns = column_group, inplace = True)

df.to_csv('Only numerical dataset all rows.csv', index = False)

In [None]:
# Now we normalize the numerical data: each listed column is centred on its
# mean and divided by its standard deviation.
real_cols = ['shipping_fee',
 'carrier_min_estimate',
 'carrier_max_estimate',
 'item_price',
 'quantity',
 'acceptance_scan_timestamp seconds',
 'payment_datetime seconds',
 'new_weight',
 'new_declared_handling_days']

for column in real_cols:
    mean = np.mean(df[column])
    std = np.std(df[column])

    # A constant column has zero standard deviation; dividing by it would turn
    # the whole column into NaN. Dividing by 1 instead leaves it at all zeros.
    if std == 0:
        std = 1.0

    df[column] = (df[column] - mean)/std

df.to_csv('Only numerical dataset normalized 100 rows.csv', index = False)