In [2]:
# import required lib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
import nltk
import calendar
from os import path
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the two order CSV files from the perf_test_orderdata folder
orders_all = pd.read_csv(filepath_or_buffer="perf_test_orderdata/orders_all.csv")
orders_times = pd.read_csv(filepath_or_buffer="perf_test_orderdata/orders_times.csv")

In [4]:
# Left-join orders_times onto orders_all using the admin_reference column as the key
merge = orders_all.merge(orders_times, on='admin_reference', how='left')

In [5]:
# Around 4000 products do not have any properties.
# Read the product properties data that was collected from the API
properties = pd.read_csv(filepath_or_buffer="properties_values_new.csv")

In [6]:
# Remove the features that consist mostly of null values
orders = merge.drop(
    columns=[
        'completed_at_x',
        'customer_company',
        'bill_state_name',
        'ship_state_name',
        'ship_company',
        'subsite_store',
        'campaign_code',
        'bill_company',
    ]
)

In [7]:
# Campaign variant: the same columns are removed, except that campaign_code is kept
campaign_code = merge.drop(
    columns=[
        'completed_at_x',
        'customer_company',
        'bill_state_name',
        'ship_state_name',
        'ship_company',
        'subsite_store',
        'bill_company',
    ]
)

In [8]:
#prepare the data
def prep_data(data):
    """Derive date/time columns from ``completed_at_y`` and drop incomplete rows.

    The caller's DataFrame is never modified: every step builds a new frame
    via ``assign`` instead of writing columns into ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column ``completed_at_y``. The splits below assume
        values shaped like ``"YYYY-MM-DD HH:MM"`` (date, one space, time).

    Returns
    -------
    pandas.DataFrame
        New frame in which rows containing any null value have been removed
        and the following columns have been added: ``Date`` (datetime),
        ``Time``, ``Year``, ``Day``, ``Hour``, ``Minute`` (strings),
        ``Month`` (abbreviated month name) and ``DayofWeek`` (day name).
    """
    # Date and time are split on the first space
    date_time = data["completed_at_y"].str.split(" ", n=1, expand=True)
    # Year, month and day are split on the dashes of the date part
    year_month_day = date_time[0].str.split("-", n=2, expand=True)

    # assign() returns a new frame, so the input is left untouched.
    # Null rows are dropped here, before the integer conversion of Month,
    # which cannot handle missing values.
    data = data.assign(
        Date=date_time[0],
        Time=date_time[1],
        Year=year_month_day[0],
        Month=year_month_day[1],
        Day=year_month_day[2],
    ).dropna()

    # Hour and minute are split on the first colon of the time part
    hour_minute = data["Time"].str.split(":", n=1, expand=True)
    parsed_dates = pd.to_datetime(data["Date"])

    # Month and Date already exist and keep their column position;
    # Hour, Minute and DayofWeek are appended in this order.
    return data.assign(
        # convert month number to abbreviated month name
        Month=data["Month"].astype(int).apply(lambda x: calendar.month_abbr[x]),
        Hour=hour_minute[0],
        Minute=hour_minute[1],
        Date=parsed_dates,
        DayofWeek=parsed_dates.dt.day_name(),
    )

In [9]:
# Run the preparation step on both frames and rebind each name to its result
orders, campaign_code = prep_data(orders), prep_data(campaign_code)

In [10]:
# Add the summed quantity per sku as a new column on every order row
# (the equivalent sum over 'total' is kept below but left disabled)
#orders['total_sum'] = orders['total'].groupby(orders['sku']).transform('sum')
orders['quantity_sum'] = orders.groupby('sku')['quantity'].transform('sum')
# Keep the first row of each sku, then remove the listed columns
product_sum = (
    orders
    .drop_duplicates(subset=['sku'])
    .drop(
        columns=[
            'admin_reference', 'state', 'payment_state', 'shipment_state',
            'total', 'currency',
            'bill_city', 'bill_zipcode', 'bill_country_iso_name',
            'ship_city', 'ship_zipcode', 'ship_country_iso_name',
            'quantity', 'completed_at_y',
            'Date', 'Time', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DayofWeek',
        ]
    )
)

In [11]:
# Keep only the rows whose property_name is not the empty-property marker '0'
properties = properties.loc[properties.property_name != '0']
# Name the two columns
properties.columns = ['sku', 'property_name']

In [12]:
# Left-join the product sum data onto the properties by sku, then discard rows with any null
product_properties = properties.merge(product_sum, on='sku', how='left').dropna()
product_properties.head(2)

Unnamed: 0,sku,property_name,product_name,quantity_sum
0,COC520,"['visibility', 'information_provider', 'last_s...",Monster Lewis Hamilton 50 cl x 24 st,167.0
1,SÄL427025,"['package_size', 'nutrient_comparison_type', '...",SMAKIS APELSIN KRAV BRICK 25CL - 27 st,82.0


In [13]:
# Read the product property values collected from the API (rebinds `properties`)
properties = pd.read_csv(filepath_or_buffer="properties_values.csv")
properties.head(2)

Unnamed: 0,sku_id,property_name,value
0,SÄL427025,package_size,27.0
1,SÄL427025,nutrient_comparison_type,100 g


In [15]:
# Pivot the property rows into one row per sku_id and one column per property_name.
# aggfunc is given as the string 'sum' instead of np.sum: pandas resolves np.sum to
# its own 'sum' aggregation anyway and has deprecated passing the NumPy callable,
# so the result is unchanged while the FutureWarning is avoided.
table = pd.pivot_table(properties, values='value', index=['sku_id'],
                    columns=['property_name'], aggfunc='sum')
table.head()

property_name,Etikettnamn,allergen_contains,allergen_may_contain,bast_fore_datum,brand_name,child_item_gtin,child_item_quantity,comparison_,comparison_drained_weight,comparison_per_kilogram,...,product_name,statistics_code,subproduct_ean,target_market,trade_item_size,validoo_country_of_origin,vegansk,veganskt,visibility,width
sku_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-KR,,,,,,,,,,,...,,,,,,,,,W,
2-KR,,,,,,,,,,,...,,,,,,,,,W,
5052197023664,,,,,,,,,,,...,,,,,,,,,W,
5201184846230,,,,,,,,,,,...,,,,,,,,,W,
5201184854310,,,,,Party Pack,,,,,,...,Godispåse Frozen 6-p,95030070.0,,752.0,6.0,,,,W,180.0


In [48]:
# Extract a single property column into its own frame and
# move the index (level 0) into a regular column
one_property = table[['Etikettnamn']].reset_index(level=0)
# Set the column names of the new frame
one_property.columns = ['sku', 'Etikettnamn']
# Clean the data: NaN values become 0
one_property = one_property.fillna(0)
one_property.head(2)

Unnamed: 0,sku,Etikettnamn
0,1-KR,0
1,2-KR,0


In [49]:
# Left-join the orders onto the single-property frame using sku as the key
join = one_property.merge(orders, on='sku', how='left')
join.head(2)

Unnamed: 0,sku,Etikettnamn,admin_reference,state,payment_state,shipment_state,total,currency,bill_city,bill_zipcode,...,completed_at_y,Date,Time,Year,Month,Day,Hour,Minute,DayofWeek,quantity_sum
0,1-KR,0,O155288849,complete,paid,shipped,1673.0,SEK,Vinslöv,28834,...,2018-09-27 06:03,2018-09-27,06:03,2018,Sep,27,6,3,Thursday,1954.0
1,1-KR,0,O155288849,complete,paid,shipped,1673.0,SEK,Vinslöv,28834,...,2018-09-27 06:03,2018-09-27,06:03,2018,Sep,27,6,3,Thursday,1954.0


In [55]:
# Show the joined frame with the listed columns removed
# (the result is only displayed; `join` itself is not changed)
join.drop(
    columns=[
        'sku', 'admin_reference', 'state', 'payment_state', 'shipment_state',
        'currency', 'bill_city', 'bill_zipcode', 'ship_city', 'ship_zipcode',
        'ship_country_iso_name', 'completed_at_y',
        'Date', 'Time', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DayofWeek',
    ]
)

Unnamed: 0,Etikettnamn,total,bill_country_iso_name,product_name,quantity,quantity_sum
0,0,1673.0,SE,SVENSK PANT 1 KRONA 1st,2.0,1954.0
1,0,1673.0,SE,SVENSK PANT 1 KRONA 1st,1.0,1954.0
2,0,232.0,SE,SVENSK PANT 1 KRONA 1st,1.0,1954.0
3,0,232.0,SE,SVENSK PANT 1 KRONA 1st,1.0,1954.0
4,0,227.6,SE,SVENSK PANT 1 KRONA 1st,1.0,1954.0
...,...,...,...,...,...,...
221464,0,771.0,SE,ZOBR NO HANGOVER PILL,1.0,80.0
221465,0,147.0,SE,ZOBR NO HANGOVER PILL,2.0,80.0
221466,0,441.0,SE,ZOBR NO HANGOVER PILL,8.0,80.0
221467,0,196.0,SE,ZOBR NO HANGOVER PILL,3.0,80.0


1. Tried using separate property values by joining them with the orders data. (The data is not distributed well enough to apply machine learning techniques.)
2. Different property values are not sufficient.

Hence, I am going back to write an API similar to the previous recommendation and build a recommendation from it.