In [68]:
import pandas as pd
import numpy as np
import json
from IPython.display import Markdown, display

### Read in CSV scrape

In [36]:
# The scrape file has no header row, so the three column labels are supplied here
column_names = ['imageURL', 'Data', 'Retailer']
sheet = pd.read_csv(
    'zalora_demo.csv',
    names=column_names,
    header=None,
    encoding='latin1',
)

In [37]:
# Preview the first rows of the raw scrape (imageURL, packed Data string, Retailer)
sheet.head()

Unnamed: 0,imageURL,Data,Retailer
0,https://dynamic.zacdn.com/akwaGc0-XB4PteDLcaJF...,"{""Product Name"":""Stella Top Olive - Ivory"",""Pr...",ZALORA
1,https://dynamic.zacdn.com/6JcZ5jF0xf6N6XqKQtBR...,"{""Product Name"":""Ponco Kotak"",""Product URL"":""h...",ZALORA
2,https://dynamic.zacdn.com/ugTbBRnMPp6qyWSI2aOC...,"{""Product Name"":""Flowy Printed Blouse"",""Produc...",ZALORA
3,https://dynamic.zacdn.com/0Z4edM1Tkj2WdctCEjQe...,"{""Product Name"":""Gingham Check Off-Shoulder Bl...",ZALORA
4,https://dynamic.zacdn.com/btw6uTBZ2CHofOczS2w_...,"{""Product Name"":""Knot Detail T-Shirt"",""Product...",ZALORA


## Unpack the Data column

In [64]:
import ast

# Expand every row's 'Data' literal into a dict of product fields, then carry the
# image URL and retailer over from the other two columns of the raw scrape.
d = []
for _, scraped in sheet.iterrows():
    record = ast.literal_eval(scraped['Data'])
    record.update(ImageURL=scraped['imageURL'], Retailer=scraped['Retailer'])
    d.append(record)

# One column per unpacked field; this replaces the raw three-column frame
sheet = pd.DataFrame(d)

### Load in tag list and mvp tag list

In [70]:
# Tags: a single-column, header-less CSV, lower-cased and collected into a set
tag_column = pd.read_csv('data/meta/tags.csv', header=None, names=['tag'])['tag']
tags = set(tag_column.str.lower())

# Tag-to-mvpTag dictionary (key value map) stored as JSON
with open('data/meta/mvp.json', 'r') as fp:
    mvp = json.load(fp)

### Define the Labeling function

In [72]:
def label(x, tags):
    """
    Labeling function. Takes in a string and searches for a match against a set of
    pre-defined tag words.

    Hyphens are treated as word separators and every word is lower-cased before
    the lookup, so `tags` is expected to contain lower-case words.

    inputs:
        x <str>: string of words. Anything that is not a string (e.g. NaN or
            None for a missing product name) is treated as having no match.
        tags <set>: unordered set of words

    outputs:
        word <str>: the first matching word, lower-cased. If no match, returns NaN
    """
    # A missing product name arrives as NaN/None; there is nothing to search.
    if not isinstance(x, str):
        return np.nan

    # Split on whitespace after turning hyphens into spaces ("T-Shirt" -> "T", "Shirt")
    for word in x.replace('-', ' ').split():
        candidate = word.lower()
        if candidate in tags:
            return candidate
    return np.nan

### Tag processing
This adds two columns to the `sheet` dataframe. The `tag` column uses the labeling function defined above: it goes through the words of each Product Name in order and returns the first word (lower-cased) that is listed in the "Tag" set of words. If no word in the name is in the Tag list, the tag is NaN.

The `mvp` column looks up the equivalent mvp tag for each tag found above; if the tag has no entry in the mapping (or no tag was found), the value is set to NaN.

In [75]:
# Tag each product from its name. Only the 'Product Name' column is needed, so apply
# over that column instead of row-wise over the whole frame.
sheet['tag'] = sheet['Product Name'].apply(lambda name: label(name, tags))

# Translate each tag to its mvp tag; tags with no entry in the mapping (including
# NaN, i.e. no tag found) stay NaN.
sheet['mvp'] = sheet['tag'].apply(lambda tag: mvp.get(tag, np.nan))

In [76]:
# Preview the result, including the newly added `tag` and `mvp` columns
sheet.head()

Unnamed: 0,Discount,Discounted Price,ImageURL,Out Stock,Price,Product Name,Product URL,Retailer,tag,mvp
0,0%,,https://dynamic.zacdn.com/akwaGc0-XB4PteDLcaJF...,False,"IDR 359,000",Stella Top Olive - Ivory,https://www.zalora.co.id/core-attire-stella-to...,ZALORA,top,tops
1,0%,,https://dynamic.zacdn.com/6JcZ5jF0xf6N6XqKQtBR...,False,"IDR 149,900",Ponco Kotak,https://www.zalora.co.id/luire-by-raden-sirait...,ZALORA,,
2,0%,,https://dynamic.zacdn.com/ugTbBRnMPp6qyWSI2aOC...,False,"IDR 699,000",Flowy Printed Blouse,https://www.zalora.co.id/mango-flowy-printed-b...,ZALORA,blouse,tops
3,0%,,https://dynamic.zacdn.com/0Z4edM1Tkj2WdctCEjQe...,False,"IDR 599,000",Gingham Check Off-Shoulder Blouse,https://www.zalora.co.id/mango-gingham-check-o...,ZALORA,blouse,tops
4,0%,,https://dynamic.zacdn.com/btw6uTBZ2CHofOczS2w_...,False,"IDR 449,000",Knot Detail T-Shirt,https://www.zalora.co.id/mango-knot-detail-t-s...,ZALORA,shirt,tops


## Compare with Leo's tags (DRAFT)
Let's create a quick table to track total rows, label count and miss count every time we process a scraped dataset.

In [333]:
# Monitor table: one row per processed scrape file, holding its total row count,
# the number of labeled rows and the number of missed rows.
df = pd.DataFrame({'Filename': 'test.csv', 'TotalRows': 0, 'Labeled': 0, 'Missed': 0}, index=[0])

# DataFrame has no `write_csv` method; `to_csv` is the CSV writer, and it needs a
# file path rather than a directory.
# NOTE(review): 'monitor.csv' is a placeholder file name - confirm the intended one.
df.to_csv('data/3_monitor/monitor.csv')

# Display the table as the cell output
df

Unnamed: 0,Filename,Labeled,Missed,TotalRows
0,test.csv,0,0,0


We're going to read in a spreadsheet of the same data that was tagged by Leo, then compare what was tagged there with what was tagged in the tagging function.

In [273]:
# Spreadsheet of the same data as tagged by Leo, used as the reference to compare against
checker = pd.read_csv(
    'data/P1/20180824-zalora_dress.csv',
    encoding='latin1',
)

In [274]:
# Where the tag is a string, convert it to lower case for consistency with the
# processed tags. Anything else (NaN for an empty cell, but also None or a stray
# number) becomes NaN instead of failing on `.lower()`.
checker['tag'] = checker['tag'].apply(lambda t: t.lower() if isinstance(t, str) else np.nan)

In [312]:
# Columns we're interested in comparing
columns = ['Image URL', 'Product Name', 'tag']

# The unpacking step stores the image link under 'ImageURL', while this comparison
# selects and merges on 'Image URL'. Align the name first so the selection does not
# raise a KeyError; the rename is a no-op if the column is already 'Image URL'.
processed = sheet.rename(columns={'ImageURL': 'Image URL'})[columns]

# Merge Leo's sheet with the Pandas processed sheet
compare = checker[columns].merge(
    processed,
    on=['Image URL', 'Product Name'],
    how='inner',
    suffixes=('_Leo', '_Panda'),
)

# Keep rows where the tags do not match. NaN != NaN evaluates to True, so rows
# where both tags are missing have to be excluded explicitly.
both_missing = compare['tag_Leo'].isnull() & compare['tag_Panda'].isnull()
tags_differ = compare['tag_Leo'] != compare['tag_Panda']
compare = compare.loc[tags_differ & ~both_missing]

display(Markdown('Discrpancies between Leo and Processing Routine'))

# Show one example row per distinct (Leo tag, Panda tag) pair
compare.drop_duplicates(subset=['tag_Leo', 'tag_Panda'])

Discrpancies between Leo and Processing Routine

Unnamed: 0,Image URL,Product Name,tag_Leo,tag_Panda
28,https://dynamic.zacdn.com/ym9sIzYwuhnZo28PaJNT...,Lace Panel Playsuit,suit,playsuit
31,https://dynamic.zacdn.com/QW2Sx_u5K_9NX7JSUOo6...,ONLY ONE Line Sweatshirt,shirt,sweatshirt
58,https://dynamic.zacdn.com/I1LAlfZMr8tmB4UaDkRk...,Sleepdress Batwing Saten-Purple,dress,sleepdress
74,https://dynamic.zacdn.com/DbMXjB6n0sf6kOq3KXiK...,Chiffon Tank Top With Overlay Front,top,tank
179,https://dynamic.zacdn.com/wcOQOpjXbz4vfImOWCR8...,Tulip Grace Kelly Panty - Grey,pant,panty
202,https://dynamic.zacdn.com/-FnXtSRQeEQ1nsSbH0vQ...,SJO's Khateris Pink Stripe Women's Sweatshirts,shirt,
237,https://dynamic.zacdn.com/TpxvxHAxO6flvrP9MhWS...,2Nd RED Shorts Jeans Fashion Ripped Denim Navy...,jeans,shorts
240,https://dynamic.zacdn.com/3CgAyWrZUoqK4AfGdWyA...,2Nd RED Denim Shorts Medium Short Pants Fungky...,pants,shorts
245,https://dynamic.zacdn.com/uSpT2wTQgiwamymcIWe6...,Isla Batik Shorts,shorts,batik
280,https://dynamic.zacdn.com/5KDD2fe8LjX14GlkPyjG...,Mid Length Cullotes,cullote,


In [106]:
# Rows of `sheet` whose tag differs from the tag in `checker` at the same index label.
# NOTE(review): this compares the two Series label-by-label instead of joining on the
# product keys, and a tag missing on both sides counts as a difference (NaN != NaN).
# Confirm both frames share the same index/row order before relying on it.
sheet[checker.tag != sheet.tag].head()

Unnamed: 0,Product Name,Actual Price (Rp),Discounted Price (Rp),Image URL,Product URL,Retailer Name,tag,mvp
28,Lace Panel Playsuit,689000,159000,https://dynamic.zacdn.com/ym9sIzYwuhnZo28PaJNT...,https://www.zalora.co.id/glamorous-lace-panel-...,ZALORA,Playsuit,tops
31,ONLY ONE Line Sweatshirt,539000,129000,https://dynamic.zacdn.com/QW2Sx_u5K_9NX7JSUOo6...,https://www.zalora.co.id/only-only-one-line-sw...,ZALORA,Sweatshirt,tops
32,ONLY ONE Line Sweatshirt,539000,129000,https://dynamic.zacdn.com/Fo1G82OeQovnj0qNWRvE...,https://www.zalora.co.id/only-only-one-line-sw...,ZALORA,Sweatshirt,tops
33,Wrap Playsuit,619000,149000,https://dynamic.zacdn.com/jJo5As3NGrZpsrV9Z9EP...,https://www.zalora.co.id/glamorous-wrap-playsu...,ZALORA,Playsuit,tops
58,Sleepdress Batwing Saten-Purple,399000,100000,https://dynamic.zacdn.com/I1LAlfZMr8tmB4UaDkRk...,https://www.zalora.co.id/cynthia-sleepdress-ba...,Cynthia,Sleepdress,underwear
