# SALES ANALYSIS

In [1]:
# Start with loading all necessary libraries 
import pandas as pd
import numpy as np
import glob
import os
import re

# Viz libraries 
import plotly.express as px  
import plotly.graph_objects as go 
from plotly.subplots import make_subplots

### Merging 12 months data in single CSV file

In [2]:
os.chdir(r'.../Sales_Data')


extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])

In [3]:
# Brief check of data

df = combined_csv
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,295665,Macbook Pro Laptop,1,1700.0,12/30/19 00:01,"136 Church St, New York City, NY 10001"
1,295666,LG Washing Machine,1,600.0,12/29/19 07:03,"562 2nd St, New York City, NY 10001"
2,295667,USB-C Charging Cable,1,11.95,12/12/19 18:21,"277 Main St, New York City, NY 10001"
3,295668,27in FHD Monitor,1,149.99,12/22/19 15:13,"410 6th St, San Francisco, CA 94016"
4,295669,USB-C Charging Cable,1,11.95,12/18/19 12:38,"43 Hill St, Atlanta, GA 30301"


#### TREATING NULL VALUES

In [4]:
# IDENTIFY NULL VALUES
# Option 1

def num_missing(x):
  return sum(x.isnull())

#Applying per column:
print ("Missing values per column:")
print (df.apply(num_missing, axis=0)) #axis=0 defines that function is to be applied on each column

Missing values per column:
Order ID            545
Product             545
Quantity Ordered    545
Price Each          545
Order Date          545
Purchase Address    545
dtype: int64


In [5]:
# IDENTIFY NULL VALUES
# Option 2

df.apply(lambda x: sum(x.isnull()), axis = 0)

Order ID            545
Product             545
Quantity Ordered    545
Price Each          545
Order Date          545
Purchase Address    545
dtype: int64

In [6]:
# REMOVE (DROP) NULL VALUES

df.dropna(how='all', inplace = True)
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,295665,Macbook Pro Laptop,1,1700.0,12/30/19 00:01,"136 Church St, New York City, NY 10001"
1,295666,LG Washing Machine,1,600.0,12/29/19 07:03,"562 2nd St, New York City, NY 10001"
2,295667,USB-C Charging Cable,1,11.95,12/12/19 18:21,"277 Main St, New York City, NY 10001"
3,295668,27in FHD Monitor,1,149.99,12/22/19 15:13,"410 6th St, San Francisco, CA 94016"
4,295669,USB-C Charging Cable,1,11.95,12/18/19 12:38,"43 Hill St, Atlanta, GA 30301"


In [7]:
# Validate NULL VALUES removal

df.apply(lambda x: sum(x.isnull()), axis = 0)

Order ID            0
Product             0
Quantity Ordered    0
Price Each          0
Order Date          0
Purchase Address    0
dtype: int64

In [8]:
df.describe()
# We can see that CONCAT did not remove column headers from the rows

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
count,186305,186305,186305,186305.0,186305,186305
unique,178438,20,10,24.0,142396,140788
top,Order ID,USB-C Charging Cable,1,11.95,Order Date,Purchase Address
freq,355,21903,168552,21903.0,355,355


### Remove header from the row values

In [9]:
temp_df = df.loc[df['Order Date'].str.contains("Order Date", case=False)]
temp_df

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
254,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
705,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
1101,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
2875,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
3708,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
...,...,...,...,...,...,...
10443,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
10784,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
10813,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
11047,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [10]:
df = df.drop(df.loc[df['Order Date'].str.contains("Order Date", case=False)].index)
df.reset_index()

Unnamed: 0,index,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,0,295665,Macbook Pro Laptop,1,1700,12/30/19 00:01,"136 Church St, New York City, NY 10001"
1,1,295666,LG Washing Machine,1,600.0,12/29/19 07:03,"562 2nd St, New York City, NY 10001"
2,2,295667,USB-C Charging Cable,1,11.95,12/12/19 18:21,"277 Main St, New York City, NY 10001"
3,3,295668,27in FHD Monitor,1,149.99,12/22/19 15:13,"410 6th St, San Francisco, CA 94016"
4,4,295669,USB-C Charging Cable,1,11.95,12/18/19 12:38,"43 Hill St, Atlanta, GA 30301"
...,...,...,...,...,...,...,...
182730,13617,222905,AAA Batteries (4-pack),1,2.99,06/07/19 19:02,"795 Pine St, Boston, MA 02215"
182731,13618,222906,27in FHD Monitor,1,149.99,06/01/19 19:29,"495 North St, New York City, NY 10001"
182732,13619,222907,USB-C Charging Cable,1,11.95,06/22/19 18:57,"319 Ridge St, San Francisco, CA 94016"
182733,13620,222908,USB-C Charging Cable,1,11.95,06/26/19 18:35,"916 Main St, San Francisco, CA 94016"


In [11]:
check = df.loc[df['Order Date'].str.contains("Order Date", case=False)]
check

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


### Check data types before further processing

In [12]:
df.dtypes

Order ID            object
Product             object
Quantity Ordered    object
Price Each          object
Order Date          object
Purchase Address    object
dtype: object

#### Convert object (string) data to correct formats

In [13]:
df["Quantity Ordered"] = pd.to_numeric(df["Quantity Ordered"])
df["Price Each"] = pd.to_numeric(df["Price Each"])
df["Order Date"] = pd.to_datetime(df["Order Date"])

In [14]:
df.describe()

Unnamed: 0,Quantity Ordered,Price Each
count,182735.0,182735.0
mean,1.124333,184.315392
std,0.442937,332.684344
min,1.0,2.99
25%,1.0,11.95
50%,1.0,14.95
75%,1.0,150.0
max,9.0,1700.0


### Taks 1: What was the best month for sales?

In [15]:
# First I have to create month values

df['Month'] = df['Order Date'].dt.month
df['Quarter'] = df['Order Date'].dt.quarter

df['Revenue'] = round(df['Quantity Ordered']*df['Price Each'],2)

df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month,Quarter,Revenue
0,295665,Macbook Pro Laptop,1,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,4,1700.0
1,295666,LG Washing Machine,1,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,4,600.0
2,295667,USB-C Charging Cable,1,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,4,11.95
3,295668,27in FHD Monitor,1,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,4,149.99
4,295669,USB-C Charging Cable,1,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,4,11.95


In [17]:
task1 = df.groupby(['Month','Quarter']).sum(numeric_only=True)
task1 = task1.reset_index()
task1

Unnamed: 0,Month,Quarter,Quantity Ordered,Price Each,Revenue
0,1,1,10670,1776155.79,1786511.29
1,2,1,13162,2145208.87,2158127.48
2,3,1,16697,2740393.12,2755969.4
3,4,2,20226,3313522.35,3336376.42
4,5,2,18344,3084756.09,3101881.04
5,6,2,14964,2508863.49,2524464.99
6,7,3,15781,2572764.75,2587444.91
7,8,3,13185,2178183.81,2191698.31
8,9,3,12827,2038019.59,2050361.26
9,10,4,22356,3658884.86,3679254.16


In [18]:
# Visualization with Plotly
# Set styling parameters

colors = {'background':'darkgrey','text':'black'}

data = go.Bar(x=task1['Month']
              ,y = task1['Revenue']
              ,marker = dict(
                color = task1['Revenue']
                ,showscale = True
                ,colorscale = 'Sunsetdark'
                ,line=dict(color=colors['background'])  
              )
              ,hovertemplate='Month: %{x}<br>Revenue: $%{y:.2s} <extra></extra>'
              
             )

layout = go.Layout(title = 'Revenue by Month',
                  xaxis = dict(
                                title = 'Month'
                                # ,categoryorder = 'total descending'
                               ,tickmode='linear'
                              #  ,linecolor = colors['background']  
                              )
                  ,yaxis = dict(
                    title = 'Revenue'
                    ,showgrid = False
                    ,tickformat='$.1s'
                    ,zeroline = False

                  )
                  # ,template='plotly_white'
                  ,plot_bgcolor = colors['background']
                  ,paper_bgcolor = colors['background']
                   
                  )
fig_1 = go.Figure(data,layout)
fig_1.update_traces(marker=dict(line=dict(color=colors['background'])))

fig_1.show()

### Task 2: What city had the biggest sales?

In [19]:
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month,Quarter,Revenue
0,295665,Macbook Pro Laptop,1,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,4,1700.0
1,295666,LG Washing Machine,1,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,4,600.0
2,295667,USB-C Charging Cable,1,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,4,11.95
3,295668,27in FHD Monitor,1,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,4,149.99
4,295669,USB-C Charging Cable,1,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,4,11.95


#### Extract city name from Address

In [20]:
# As it often happens with geographies - there can be the same city name within multiple countries (or states in our case)
# That is why I extract both city and a state to avoid incorect assignments

def get_city(address):
    return address.split(',')[1]

def get_state(address):
    return address.split(',')[2].split(' ')[1]
df['City'] = df['Purchase Address'].apply(lambda x: get_city(x) + ' (' + get_state(x) + ')')

In [21]:
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month,Quarter,Revenue,City
0,295665,Macbook Pro Laptop,1,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,4,1700.0,New York City (NY)
1,295666,LG Washing Machine,1,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,4,600.0,New York City (NY)
2,295667,USB-C Charging Cable,1,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,4,11.95,New York City (NY)
3,295668,27in FHD Monitor,1,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,4,149.99,San Francisco (CA)
4,295669,USB-C Charging Cable,1,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,4,11.95,Atlanta (GA)


In [24]:
task2 = df.groupby('City')[['Quantity Ordered','Price Each','Revenue']].sum().sort_values('Revenue',ascending = True)
task2 = task2.reset_index()
task2['Best Peforming City'] = task2['Revenue'].max(numeric_only=True)
task2['vs Leader'] = task2['Best Peforming City'] - task2['Revenue']

task2

Unnamed: 0,City,Quantity Ordered,Price Each,Revenue,Best Peforming City,vs Leader
0,Portland (ME),2696,441701.73,444110.53,8124120.94,7680010.41
1,Austin (TX),10933,1777231.68,1786745.52,8124120.94,6337375.42
2,Portland (OR),11110,1822878.38,1832538.71,8124120.94,6291582.23
3,Seattle (WA),16281,2678871.35,2693048.6,8124120.94,5431072.34
4,Dallas (TX),16453,2702619.12,2717793.72,8124120.94,5406327.22
5,Atlanta (GA),16304,2726247.35,2741642.05,8124120.94,5382478.89
6,Boston (MA),22123,3580075.0,3604080.86,8124120.94,4520040.08
7,New York City (NY),27470,4553287.23,4581658.91,8124120.94,3542462.03
8,Los Angeles (CA),32722,5323915.79,5354039.93,8124120.94,2770081.01
9,San Francisco (CA),49363,8074045.54,8124120.94,8124120.94,0.0


In [25]:
# Visualization with Plotly
# Set styling parameters

colors = {'background':'darkgrey','text':'black'}

data = go.Bar(x=task2['Revenue']
              ,y = task2['City']
              ,orientation='h'
              ,marker = dict(
                color = task2['Revenue']
                ,showscale = True
                ,colorscale = 'Sunsetdark'
                ,line=dict(color=colors['background'])  
              )
              ,customdata=task2['vs Leader'] 
              ,hovertemplate="City: %{y}<br>Revenue: $%{x:.2s}<br>Difference to leader: %{customdata:.2s} <extra></extra>"
              
             )


layout = go.Layout(title = 'Revenue by City',
                  xaxis = dict(
                              title = 'Revenue'
                              ,showgrid = False
                              ,zeroline = False
                              # ,tickmode='linear'
                              # ,linecolor = colors['background']  
                              )
                  ,yaxis = dict(
                    title = 'City'
                    ,showgrid = False
                    # ,tickformat='$.1s'
                    ,zeroline = False
                    ,categoryorder = 'total ascending'

                  )
                  # ,template='plotly_white'
                  ,plot_bgcolor = colors['background']
                  ,paper_bgcolor = colors['background']
                   
                  )
fig_2 = go.Figure(data,layout)
fig_2.show()

### Task 3: What products are most frequently sold together?

In [26]:
# step 1 - Identify repeatable (duplicated) Order IDs, to see orders that contain multiple products

task3 = df[df['Order ID'].duplicated(keep = False)]
task3.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month,Quarter,Revenue,City
16,295681,Google Phone,1,600.0,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,600.0,Boston (MA)
17,295681,USB-C Charging Cable,1,11.95,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,11.95,Boston (MA)
18,295681,Bose SoundSport Headphones,1,99.99,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,99.99,Boston (MA)
19,295681,Wired Headphones,1,11.99,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,11.99,Boston (MA)
36,295698,Vareebadd Phone,1,400.0,2019-12-13 14:32:00,"175 1st St, New York City, NY 10001",12,4,400.0,New York City (NY)


In [28]:
# step 2 - Join products from multiple rows into single column

task3['Grouped'] = task3.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
task3.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month,Quarter,Revenue,City,Grouped
16,295681,Google Phone,1,600.0,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,600.0,Boston (MA),"Google Phone,USB-C Charging Cable,Bose SoundSp..."
17,295681,USB-C Charging Cable,1,11.95,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,11.95,Boston (MA),"Google Phone,USB-C Charging Cable,Bose SoundSp..."
18,295681,Bose SoundSport Headphones,1,99.99,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,99.99,Boston (MA),"Google Phone,USB-C Charging Cable,Bose SoundSp..."
19,295681,Wired Headphones,1,11.99,2019-12-25 12:37:00,"79 Elm St, Boston, MA 02215",12,4,11.99,Boston (MA),"Google Phone,USB-C Charging Cable,Bose SoundSp..."
36,295698,Vareebadd Phone,1,400.0,2019-12-13 14:32:00,"175 1st St, New York City, NY 10001",12,4,400.0,New York City (NY),"Vareebadd Phone,USB-C Charging Cable"


In [29]:
# step 3 - Create a new structure by removing duplicate Order IDs

task3 = task3[['Order ID','Grouped']].drop_duplicates()
task3.head()

Unnamed: 0,Order ID,Grouped
16,295681,"Google Phone,USB-C Charging Cable,Bose SoundSp..."
36,295698,"Vareebadd Phone,USB-C Charging Cable"
42,295703,"AA Batteries (4-pack),Bose SoundSport Headphones"
66,295726,"iPhone,Lightning Charging Cable"
76,295735,"iPhone,Apple Airpods Headphones,Wired Headphones"


In [31]:
# step 4 - count repeatable items in GROUPED

from itertools import combinations
from collections import Counter

count = Counter()
# Combinations define a set of items that are sold together
for row in task3['Grouped']:
    row_list = row.split(',')
    count.update(Counter(combinations(row_list,2)))

most_common = count.most_common(10)

result = pd.DataFrame(most_common,columns = ['Bundled Products','Frequency'])
result

Unnamed: 0,Bundled Products,Frequency
0,"(iPhone, Lightning Charging Cable)",973
1,"(Google Phone, USB-C Charging Cable)",954
2,"(iPhone, Wired Headphones)",437
3,"(Google Phone, Wired Headphones)",401
4,"(Vareebadd Phone, USB-C Charging Cable)",348
5,"(iPhone, Apple Airpods Headphones)",343
6,"(Google Phone, Bose SoundSport Headphones)",219
7,"(USB-C Charging Cable, Wired Headphones)",156
8,"(Vareebadd Phone, Wired Headphones)",141
9,"(Lightning Charging Cable, Wired Headphones)",92
