In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

# Preprocessing 

### Merchants data preprocessing

In [10]:
# Read the merchants table, then turn whatever index the parquet file stored
# into an ordinary column so the frame is indexed 0..n-1.
merchants_df = pd.read_parquet('../data/tables/tbl_merchants.parquet')
merchants_df = merchants_df.reset_index()

In [48]:
# Show the raw `tags` string of the first merchant to illustrate the format
# that the parsing cells below have to handle.
merchants_df.head(1).at[0, 'tags']

'((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))'

In [24]:
# First separate the tags text into 3 separate features (each wrapped in () or [])
# NOTE: inside a character class `|` is a literal pipe, not alternation, so the
# bracket classes below list only the real delimiters: ( [ and ) ].
_TAGS_PATTERN = re.compile(
    r'[(\[]{2}(.*)[)\]],\s[(\[](.*)[)\]],\s[(\[](.*)[)\]]{2}'
)


def separate_tags(row):
    """Split ``row['tags']`` into its three bracketed components.

    The tags string is expected to look like
    ``((<text>), (<text>), (<text>))`` where every round bracket may also be a
    square bracket. The three captured texts are written back onto ``row`` as
    ``feature_1``, ``feature_2`` and ``feature_3``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide a string under the key ``'tags'`` and support item
        assignment.

    Returns
    -------
    The same ``row`` object, with the three feature keys added.

    Raises
    ------
    ValueError
        If the tags string does not contain three bracketed groups.
    """
    tags = row['tags']
    # search() keeps the original unanchored behaviour of findall()[0].
    match = _TAGS_PATTERN.search(tags)
    if match is None:
        raise ValueError(
            f"tags string does not contain three bracketed groups: {tags!r}"
        )
    row['feature_1'], row['feature_2'], row['feature_3'] = match.groups()
    return row
# Run the tag splitter over every merchant; axis='columns' hands each row to
# the function and rebuilds the frame from the rows it returns.
merchants_df = merchants_df.apply(separate_tags, axis='columns')

In [33]:
# Convert the take rate feature (feature_3) to float type
def get_take_rate(x):
    """Extract the numeric take rate from a ``'take rate: <number>'`` string.

    Parameters
    ----------
    x : str
        Text containing ``take rate: `` followed by a number, e.g.
        ``'take rate: 0.18'``. The label is matched case-insensitively and the
        number may be an integer or a decimal.

    Returns
    -------
    float
        The parsed take rate.

    Raises
    ------
    ValueError
        If no take rate can be found in ``x``.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # The fractional part is optional so whole-number rates also parse.
    match = re.search(r'take rate: (\d+(?:\.\d+)?)', x, flags=re.IGNORECASE)
    if match is None:
        raise ValueError(f"no take rate found in {x!r}")
    return float(match.group(1))

# Parse each "take rate: <number>" label in feature_3 into a numeric column.
merchants_df['take_rate'] = merchants_df['feature_3'].map(get_take_rate)

In [34]:
# Convert text tags to bag of words vectorized form
cv = CountVectorizer(stop_words='english', lowercase=True)
X = cv.fit_transform(merchants_df['feature_1'])

# Look the vocabulary up once and reuse it for the column labels on both sides.
feature_names = cv.get_feature_names_out()

# Assigning a DataFrame to columns aligns on index labels. Give the count
# frame the same index as merchants_df so the rows line up even if the frame
# is not indexed 0..n-1 (a mismatch would silently produce NaN counts).
merchants_df[feature_names] = pd.DataFrame.sparse.from_spmatrix(
    X, index=merchants_df.index, columns=feature_names
)


In [50]:
# Preview the first five merchants with the parsed and bag-of-words columns.
merchants_df.head(n=5)

Unnamed: 0,merchant_abn,name,tags,feature_1,feature_2,feature_3,take_rate,al,antique,appliance,...,supply,systems,telecom,television,tent,tool,toy,vehicle,watch,writing
0,10023283211,Felis Limited,"((furniture, home furnishings and equipment sh...","furniture, home furnishings and equipment shop...",e,take rate: 0.18,0.18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10142254217,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...","cable, satellite, and otHer pay television and...",b,take rate: 4.22,4.22,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,10165489824,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]...","jewelry, watch, clock, and silverware shops",b,take rate: 4.40,4.4,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,10187291046,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops], [b]...","wAtch, clock, and jewelry repair shops",b,take rate: 3.29,3.29,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,10192359162,Enim Condimentum PC,"([music shops - musical instruments, pianos, a...","music shops - musical instruments, pianos, and...",a,take rate: 6.33,6.33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
