In [None]:
import pandas as pd

### Using `pd.concat()` to place one DataFrame on top of another based on shared column names

In [None]:
# Sample data: three named columns, each holding three integers

data = dict(
    Col1=[1, 2, 3],
    Col2=[4, 5, 6],
    Col3=[7, 8, 9],
)

In [None]:
# Build two separate DataFrames from the same "data" dictionary, so both
# share identical column names and values.

table_1 = pd.DataFrame(data=data)
table_2 = pd.DataFrame(data=data)

### The DataFrames are concatenated, one on top of the other
 - The first 3 rows come from `table_1`
 - The last 3 rows come from `table_2`

In [None]:
# Stack table_2 underneath table_1; columns are matched by name and every
# row keeps the index label it had in its source frame.
concat_data = pd.concat([table_1, table_2])

# A bare last expression gets the notebook's rich table rendering, which
# print() would flatten to plain text.
concat_data

#### Notice that the index repeats itself
 - Use `reset_index()` to get unique index values
 - Setting `drop = True` discards the old index instead of inserting it into your dataframe as a new column

In [None]:
# Replace the repeated index labels with a fresh 0..n-1 range; drop=True
# discards the old labels rather than keeping them as an extra column.
concat_data = concat_data.reset_index(drop=True)

# Bare expression so the notebook renders the frame as a rich table.
concat_data

## Learning to merge with pandas

In [None]:
# Load the product workbook; the path is relative to the notebook's working directory
product_data = pd.read_excel(io='../data/WA_Product_Data.xlsx')

In [None]:
# Load the sales workbook; the path is relative to the notebook's working directory
sales_data = pd.read_excel(io='../data/WA_Sales_Data.xlsx')

#### Start by getting a feel for the data

In [None]:
# Preview the first five product rows.
# Note: the Sale_ID at index position 2 is 900000 rather than 3.

product_data.head(n=5)

In [None]:
# Summarize the product table: column names, non-null counts and dtypes
product_data.info()

In [None]:
# Preview the first five sales rows.
# Note: the Sale_ID at index position 1 is 3 rather than 2.

sales_data.head(n=5)

In [None]:
# Summarize the sales table: column names, non-null counts and dtypes
sales_data.info()

#### Performing a join between the two dataframes
- cannot merge on fields that are different types (example - can't merge a string with an int)
- check the type for the `Sale_ID` in each table to be sure they are the same


In [None]:
# Compare the column-level dtype of the merge key in both tables. The dtype
# describes the whole column, whereas the Python type of a single element
# can hide mixed values and looking it up by label 0 fails when the index
# has no such label.
print(product_data['Sale_ID'].dtype)
print(sales_data['Sale_ID'].dtype)

pandas syntax:
`pd.merge(left_dataframe, right_dataframe, how=<type of join>, on=<column to merge on>)`


In [None]:
# Left join: keep every product row and attach the sales columns wherever
# a sales row carries the same Sale_ID.
combined_data = product_data.merge(sales_data, how='left', on='Sale_ID')

In [None]:
# Preview the first five rows of the merged table
combined_data.head(n=5)

####  Why do we have NaN values in row 2 for our sales columns (Year, Quarter, Revenue, Quantity)? 
- Was that information in the original `sales_data`?

In [None]:
# Look for a Sale_ID of 2 in the original sales table
sales_data.loc[sales_data['Sale_ID'].eq(2)]

In [None]:
# Count the missing values in each column of the merged table

combined_data.isna().sum()

### Use groupby to easily look at subsets of data

- we want to count the types of product by types of retailer
- selecting a unique field from the grouped object (here we use `Sale_ID`) gives us a single field to count

In [None]:
# Group the merged rows by retailer type and product type, then keep only
# the Sale_ID column so there is a single field to count per group.
combined_data_gb = combined_data.groupby(by=['Retailer_type', 'Product_type'])['Sale_ID']
combined_data_gb

In [None]:
# Count the non-null Sale_ID values within each (retailer type, product type) group
combined_data_gb = combined_data_gb.count()
combined_data_gb

In [None]:
# Move the group keys out of the index and into ordinary columns.
# NOTE: run this cell once per pass; resetting an already-flat frame again
# inserts the old integer index as an extra 'index' column.
combined_data_gb = combined_data_gb.reset_index(drop=False)
combined_data_gb

In [None]:
# List the column labels to confirm what the flattened result contains
combined_data_gb.columns

In [None]:
# Rename the count column to say what it holds. An 'index' column exists only
# when reset_index() has been applied to the already-flattened frame (e.g. the
# reset cell was run twice); errors='ignore' drops it if present and otherwise
# does nothing, so this cell also works on a clean top-to-bottom run and can
# be re-run safely.
combined_data_gb = (
    combined_data_gb
    .drop(columns=['index'], errors='ignore')
    .rename(columns={'Sale_ID': 'Sales_count'})
)
combined_data_gb

### Where are different product types most often sold?

In [None]:
# Sales counts per retailer type for the 'Binoculars' product type
combined_data_gb.loc[combined_data_gb['Product_type'].eq('Binoculars')]
# Finding: Binoculars don't seem to do well with Direct Marketing or
# Equipment Rental Stores, so those efforts could perhaps be cut.

In [None]:
# Sales counts per retailer type for the 'First Aid' product type
combined_data_gb.loc[combined_data_gb['Product_type'].eq('First Aid')]
# Finding: consider reducing First Aid stock at golf stores.

In [None]:
# Sales counts per retailer type for the 'Climbing Accessories' product type
combined_data_gb.loc[combined_data_gb['Product_type'].eq('Climbing Accessories')]
# Finding: focus the majority of Climbing Accessories efforts on Outdoors Shops.