# Which properties has the more sales?

In [130]:
# import required lib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar 
import plotly.express as px
import string
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")

# Load data from all the sources

In [131]:
#Read the data
orders_all = pd.read_csv("perf_test_orderdata/orders_all.csv")
orders_times = pd.read_csv("perf_test_orderdata/orders_times.csv")

In [132]:
#join the column to connect the two csv data
merge = pd.merge(orders_all,orders_times,how='left',on='admin_reference')

In [133]:
#load product data collected from API
properties = pd.read_csv("properties.csv")

# Prepare Data

In [134]:
#drop the features which has most null values
orders = merge.drop(['completed_at_x','customer_company','bill_state_name','ship_state_name','ship_company','subsite_store','campaign_code','bill_company'],axis=1)

In [135]:
#for campaign 
campaign_code = merge.drop(['completed_at_x','customer_company','bill_state_name','ship_state_name','ship_company','subsite_store','bill_company'],axis=1)

In [136]:
#prepasre the dat 
def prep_data(data):
    #Date and time is splitted
    new = data["completed_at_y"].str.split(" ", n = 1, expand = True) 
    data['Date'] = new[0]
    data['Time'] = new[1]
    #Day, month and year is splitted
    new = data["Date"].str.split("-", n = 2, expand = True) 
    data['Year'] = new[0]
    data['Month'] = new[1]
    data['Day'] = new[2]
    #drop null values
    data = data.dropna()
    #convert month number to month name
    data['Month'] = data['Month'].astype(int).apply(lambda x: calendar.month_abbr[x])
    #Date and time is splitted
    new = data["Time"].str.split(":", n = 1, expand = True) 
    data['Hour'] = new[0]
    data['Minute'] = new[1]

    data['Date'] = pd.to_datetime(data['Date'])  # Step 1
    data['DayofWeek'] =data['Date'].dt.day_name()  # Step 2
    return data

In [137]:
orders = prep_data(orders)
campaign_code = prep_data(campaign_code)

In [138]:
#Drop duplicates.
properties = properties.drop_duplicates(subset='p_name', keep="first")
properties.columns = ['product_name', 'properties']

In [139]:
properties.properties.head(2)

0     {'ingredients': '<p>Vit choklad (70 %): (Socke...
10    {'ingredients': '<b>ingredienser</b>: <b>kolsy...
Name: properties, dtype: object

In [140]:
properties['properties']

0       {'ingredients': '<p>Vit choklad (70 %): (Socke...
10      {'ingredients': '<b>ingredienser</b>: <b>kolsy...
11      {'ingredients': 'Vatten, apelsinjuice från kon...
12      {'ingredients': 'Glukos-fruktossirap, socker, ...
13      {'ingredients': '<p>Socker, glukossirap, <stro...
                              ...                        
2782    {'name': 'Godispåse Halloween 5-pack', 'search...
2783    {'name': 'Gott och Blandat Original - 2 kg', '...
2793    {'search': '', 'ingredients': '<p><span><span>...
2794    {'ingredients': '<p><strong>Ingredienser:</str...
2795    {'ingredients': '<b>ingredienser/ainekset/ingr...
Name: properties, Length: 1723, dtype: object

In [141]:
#Left strip and remove the side headings.
# properties['properties'] = properties['properties'].map(lambda x: x.lstrip('{\'ingredients\':'))
# properties['properties'] = properties['properties'].map(lambda x: x.lstrip('{\'name\':'))
# properties['properties'] = properties['properties'].map(lambda x: x.lstrip('{\'search\':'))
# clean punchuation
properties['properties'] = properties['properties'].map(lambda x: "".join([i.lower() for i in x if i not in string.punctuation]))

In [142]:
properties['properties']

0       ingredients pvit choklad 70  socker kakaosmör ...
10      ingredients bingredienserb bkolsyrat vattenb b...
11      ingredients vatten apelsinjuice från koncentra...
12      ingredients glukosfruktossirap socker fullhärd...
13      ingredients psocker glukossirap strongvetemjöl...
                              ...                        
2782    name godispåse halloween 5pack search  descrip...
2783    name gott och blandat original  2 kg search pg...
2793    search  ingredients pspanspanspanglukossirap s...
2794    ingredients pstrongingredienserstrongnbspsocke...
2795    ingredients bingredienseraineksetingredientsb ...
Name: properties, Length: 1723, dtype: object

In [143]:
#Tokenize
properties['properties'] = properties['properties'].map(lambda x: nltk.tokenize.word_tokenize(x))

In [144]:
properties['properties']

0       [ingredients, pvit, choklad, 70, socker, kakao...
10      [ingredients, bingredienserb, bkolsyrat, vatte...
11      [ingredients, vatten, apelsinjuice, från, konc...
12      [ingredients, glukosfruktossirap, socker, full...
13      [ingredients, psocker, glukossirap, strongvete...
                              ...                        
2782    [name, godispåse, halloween, 5pack, search, de...
2783    [name, gott, och, blandat, original, 2, kg, se...
2793    [search, ingredients, pspanspanspanglukossirap...
2794    [ingredients, pstrongingredienserstrongnbspsoc...
2795    [ingredients, bingredienseraineksetingredients...
Name: properties, Length: 1723, dtype: object

In [145]:
#remove stop words
stopwords = nltk.corpus.stopwords.words('swedish')
properties['properties'] = properties['properties'].map(lambda x: [i for i in x if i not in stopwords])
stopwords = nltk.corpus.stopwords.words('english')
properties['properties'] = properties['properties'].map(lambda x: [i for i in x if i not in stopwords])

In [146]:
properties['properties']

0       [ingredients, pvit, choklad, 70, socker, kakao...
10      [ingredients, bingredienserb, bkolsyrat, vatte...
11      [ingredients, vatten, apelsinjuice, koncentrat...
12      [ingredients, glukosfruktossirap, socker, full...
13      [ingredients, psocker, glukossirap, strongvete...
                              ...                        
2782    [name, godispåse, halloween, 5pack, search, de...
2783    [name, gott, blandat, original, 2, kg, search,...
2793    [search, ingredients, pspanspanspanglukossirap...
2794    [ingredients, pstrongingredienserstrongnbspsoc...
2795    [ingredients, bingredienseraineksetingredients...
Name: properties, Length: 1723, dtype: object

In [147]:
#removed numbers
properties['properties'] = properties['properties'].map(lambda s:[x for x in s if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())])

In [148]:
properties['properties']

0       [ingredients, pvit, choklad, socker, kakaosmör...
10      [ingredients, bingredienserb, bkolsyrat, vatte...
11      [ingredients, vatten, apelsinjuice, koncentrat...
12      [ingredients, glukosfruktossirap, socker, full...
13      [ingredients, psocker, glukossirap, strongvete...
                              ...                        
2782    [name, godispåse, halloween, 5pack, search, de...
2783    [name, gott, blandat, original, kg, search, pg...
2793    [search, ingredients, pspanspanspanglukossirap...
2794    [ingredients, pstrongingredienserstrongnbspsoc...
2795    [ingredients, bingredienseraineksetingredients...
Name: properties, Length: 1723, dtype: object

In [149]:
#remove some specific stop words
stopwords = ['ingredients', 'name','search','kg','5pack','pstrongingredienserstrongnbspsocker','bingredienseraineksetingredientsb']
properties['properties'] = properties['properties'].map(lambda s: [word for word in s if word not in stopwords])

In [150]:
properties['properties']

0       [pvit, choklad, socker, kakaosmör, bhelmjölksp...
10      [bingredienserb, bkolsyrat, vattenb, bsukrosb,...
11      [vatten, apelsinjuice, koncentrat, socker, syr...
12      [glukosfruktossirap, socker, fullhärdad, kokos...
13      [psocker, glukossirap, strongvetemjölstrong, p...
                              ...                        
2782    [godispåse, halloween, description, h3godispås...
2783    [gott, blandat, original, pgott, å, blandat, g...
2793    [pspanspanspanglukossirap, socker, gelatin, fu...
2794                                       [äppelsyraprn]
2795    [glukossirapglukoosisiirappiglucose, syrup, so...
Name: properties, Length: 1723, dtype: object

## Uniques values check and add to stopwords.
find a way to strip inside specific value like 'strong'

In [None]:
# Merge orders and product properties
orders['properties'] = ''
orders = pd.merge(orders,properties,on='product_name',how='left')
#assign the required data
data = orders.drop([ 'state', 'payment_state', 'shipment_state',
       'currency', 'bill_city', 'bill_zipcode', 'bill_country_iso_name',
       'ship_city', 'ship_zipcode', 'ship_country_iso_name',
       'quantity', 'sku', 'completed_at_y', 'Date', 'Time', 'Year', 'Month',
       'Day', 'Hour', 'Minute', 'DayofWeek', 'properties_x'],axis=1)
#drop null values
data = data.dropna()

In [None]:
data.groupby(by=['product_name','properties_y']).sum()