In [1]:
import pandas as pd

### Using `pd.concat()` to stack one DataFrame on top of another, aligned on shared column names

In [2]:
# Small sample dataset: three columns holding three integers each
data = dict(
    Col1=[1, 2, 3],
    Col2=[4, 5, 6],
    Col3=[7, 8, 9],
)

In [3]:
# Build two separate DataFrames with identical contents from the "data" dict above

table_1, table_2 = pd.DataFrame(data), pd.DataFrame(data)

### The DataFrames are concatenated, one on top of the other
 - The first 3 rows coming from `table_1`
 - The second 3 rows coming from `table_2`

In [4]:
# axis=0 (the default) stacks the frames row-wise; each frame keeps its own index labels
concat_data = pd.concat(
    [table_1, table_2],
    axis=0,
)
print(concat_data)

   Col1  Col2  Col3
0     1     4     7
1     2     5     8
2     3     6     9
0     1     4     7
1     2     5     8
2     3     6     9


#### Notice that the index repeats itself
 - Use `reset_index()` to get unique index values
 - Setting `drop=True` discards the old index instead of inserting it into your DataFrame as a new column

In [5]:
# Renumber the rows 0..n-1; drop=True throws the old (repeating) labels away
concat_data = concat_data.reset_index(drop=True)
print(concat_data)

   Col1  Col2  Col3
0     1     4     7
1     2     5     8
2     3     6     9
3     1     4     7
4     2     5     8
5     3     6     9


## Learning to merge with pandas

In [6]:
# Load the product workbook (path is relative to the notebook's working directory)
product_data = pd.read_excel(io='../data/WA_Product_Data.xlsx')

In [7]:
# Load the sales workbook (path is relative to the notebook's working directory)
sales_data = pd.read_excel(io='../data/WA_Sales_Data.xlsx')

#### Start by getting a feel for the data

In [21]:
# Note: the row at index 2 carries Sale_ID 900000 where 3 would be expected

product_data.head(n=5)

Unnamed: 0,Sale_ID,Retailer_country,Order_method_type,Retailer_type,Product_line,Product_type,Product
0,1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set
1,2,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame
2,900000,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome
3,4,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2
4,5,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite


In [9]:
# Summarize product_data: row count, column names, non-null counts and dtypes
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88475 entries, 0 to 88474
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sale_ID            88475 non-null  int64 
 1   Retailer_country   88475 non-null  object
 2   Order_method_type  88475 non-null  object
 3   Retailer_type      88475 non-null  object
 4   Product_line       88475 non-null  object
 5   Product_type       88475 non-null  object
 6   Product            88475 non-null  object
dtypes: int64(1), object(6)
memory usage: 4.7+ MB


In [20]:
# Note: Sale_ID jumps from 1 straight to 3 -- the row at index 1 holds 3, not 2

sales_data.head(n=5)

Unnamed: 0,Sale_ID,Year,Quarter,Revenue,Quantity
0,1,2012,Q1 2012,59628.66,489
1,3,2012,Q1 2012,89940.48,147
2,4,2012,Q1 2012,165883.41,303
3,5,2012,Q1 2012,119822.2,1415
4,6,2012,Q1 2012,87728.96,352


In [11]:
# Summarize sales_data: row count, column names, non-null counts and dtypes
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88164 entries, 0 to 88163
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sale_ID   88164 non-null  int64  
 1   Year      88164 non-null  int64  
 2   Quarter   88164 non-null  object 
 3   Revenue   88164 non-null  float64
 4   Quantity  88164 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 3.4+ MB


#### Performing a join between the two dataframes
- cannot merge on fields that are different types (example - can't merge a string with an int)
- check the type for the `Sale_ID` in each table to be sure they are the same


In [22]:
# Print the Python type of the first Sale_ID in each table (product first, then sales).
# [0] is a label lookup; it works here because both frames use the default 0..n-1 index.
for frame in (product_data, sales_data):
    print(type(frame['Sale_ID'][0]))

<class 'numpy.int64'>
<class 'numpy.int64'>


pandas syntax:
`pd.merge(left_dataframe, right_dataframe, how=<type of join>, on=<column(s) to merge on>)`


In [23]:
# Left join on Sale_ID: every product_data row is kept; products with no
# matching sale get NaN in the columns that come from sales_data
combined_data = product_data.merge(sales_data, how='left', on='Sale_ID')

In [24]:
# Preview the first five merged rows
combined_data.head(n=5)

Unnamed: 0,Sale_ID,Retailer_country,Order_method_type,Retailer_type,Product_line,Product_type,Product,Year,Quarter,Revenue,Quantity
0,1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012.0,Q1 2012,59628.66,489.0
1,2,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,,,,
2,900000,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,,,,
3,4,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012.0,Q1 2012,165883.41,303.0
4,5,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012.0,Q1 2012,119822.2,1415.0


#### Why do we have NaN values in rows 1 and 2 for our sales columns (Year, Quarter, Revenue, Quantity)?
- Was that information in the original `sales_data`?

In [25]:
# Look for Sale_ID 2 in the original sales table; an empty result means it was never there
sales_data[sales_data['Sale_ID'].eq(2)]

Unnamed: 0,Sale_ID,Year,Quarter,Revenue,Quantity


In [26]:
# Count the missing values in every column of the merged table

combined_data.isna().sum()

Sale_ID                0
Retailer_country       0
Order_method_type      0
Retailer_type          0
Product_line           0
Product_type           0
Product                0
Year                 312
Quarter              312
Revenue              312
Quantity             312
dtype: int64

### Use groupby to easily look at subsets of data

- we want to count the types of product by types of retailer
- selecting a single column from the grouped object (here we use `Sale_ID`, which has no missing values) gives us one field to count per group

In [34]:
# Group by retailer type and product type, then keep only Sale_ID as the column to aggregate
combined_data_gb = combined_data.groupby(
    by=['Retailer_type', 'Product_type']
)['Sale_ID']
combined_data_gb

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fafc922b5b0>

In [35]:
# Count the non-null Sale_ID values in each (Retailer_type, Product_type) group.
# This rebinds combined_data_gb from the grouped object to the resulting Series.
combined_data_gb = combined_data_gb.count()
combined_data_gb

Retailer_type     Product_type        
Department Store  Binoculars               808
                  Climbing Accessories     101
                  Cooking Gear            1467
                  Eyewear                 3125
                  First Aid                636
                                          ... 
Warehouse Store   Sleeping Bags            384
                  Sunscreen                565
                  Tents                    631
                  Watches                   63
                  Woods                      8
Name: Sale_ID, Length: 132, dtype: int64

In [36]:
# Turn the two index levels back into ordinary columns (drop=False keeps them as columns)
combined_data_gb = combined_data_gb.reset_index(drop=False)
combined_data_gb

Unnamed: 0,Retailer_type,Product_type,Sale_ID
0,Department Store,Binoculars,808
1,Department Store,Climbing Accessories,101
2,Department Store,Cooking Gear,1467
3,Department Store,Eyewear,3125
4,Department Store,First Aid,636
...,...,...,...
127,Warehouse Store,Sleeping Bags,384
128,Warehouse Store,Sunscreen,565
129,Warehouse Store,Tents,631
130,Warehouse Store,Watches,63


In [37]:
# Give the count column a descriptive name.
# The frame already holds exactly the three columns we want, so no positional
# slice is needed; slicing by position could silently drop the count column if
# an extra leading column ever appeared upstream.
combined_data_gb = combined_data_gb.rename(columns={'Sale_ID': 'Sales_count'})
combined_data_gb

Unnamed: 0,Retailer_type,Product_type,Sales_count
0,Department Store,Binoculars,808
1,Department Store,Climbing Accessories,101
2,Department Store,Cooking Gear,1467
3,Department Store,Eyewear,3125
4,Department Store,First Aid,636
...,...,...,...
127,Warehouse Store,Sleeping Bags,384
128,Warehouse Store,Sunscreen,565
129,Warehouse Store,Tents,631
130,Warehouse Store,Watches,63


### Where are different product types most often sold?

In [38]:
# Binoculars barely sell through Direct Marketing or Equipment Rental Stores -- those efforts could perhaps be cut.
combined_data_gb.loc[combined_data_gb['Product_type'].eq('Binoculars')]

Unnamed: 0,Retailer_type,Product_type,Sales_count
0,Department Store,Binoculars,808
21,Direct Marketing,Binoculars,16
37,Equipment Rental Store,Binoculars,15
58,Eyewear Store,Binoculars,533
64,Golf Shop,Binoculars,333
77,Outdoors Shop,Binoculars,894
94,Sports Store,Binoculars,1043
115,Warehouse Store,Binoculars,220


In [39]:
# First Aid sells poorly at golf stores -- maybe reduce the stock held there.
combined_data_gb.loc[combined_data_gb['Product_type'].eq('First Aid')]

Unnamed: 0,Retailer_type,Product_type,Sales_count
4,Department Store,First Aid,636
24,Direct Marketing,First Aid,469
41,Equipment Rental Store,First Aid,62
66,Golf Shop,First Aid,58
81,Outdoors Shop,First Aid,420
98,Sports Store,First Aid,572
118,Warehouse Store,First Aid,569


In [40]:
# Outdoors Shops account for nearly all Climbing Accessories sales -- focus the majority of efforts there.
combined_data_gb.loc[combined_data_gb['Product_type'].eq('Climbing Accessories')]

Unnamed: 0,Retailer_type,Product_type,Sales_count
1,Department Store,Climbing Accessories,101
38,Equipment Rental Store,Climbing Accessories,134
78,Outdoors Shop,Climbing Accessories,2439
95,Sports Store,Climbing Accessories,1
