In [1]:
## Please copy 'WA_Product_Data.xlsx' and 'WA_Sales_Data.xlsx' into the same folder as your police_call data.
## Both files should be located in analytics_jumpstart/data.

In [2]:
import pandas as pd

## Using pd.concat to stack one dataframe on top of another, aligned on shared column names

In [3]:
## Sample values keyed by column name; each list becomes one dataframe column
data = dict(
    Col1=[1, 2, 3],
    Col2=[4, 5, 6],
    Col3=[7, 8, 9],
)

In [4]:
## Build two identical dataframes from the "data" dictionary defined above

table_1, table_2 = pd.DataFrame(data), pd.DataFrame(data)

In [5]:
## Stack the two dataframes vertically (axis = 0):
## the first 3 rows of the result come from table_1
## the last 3 rows of the result come from table_2

concat_data = pd.concat([table_1, table_2], axis = 0)
print(concat_data)

## Notice that the index labels restart at 0 for the second dataframe

   Col1  Col2  Col3
0     1     4     7
1     2     5     8
2     3     6     9
0     1     4     7
1     2     5     8
2     3     6     9


In [6]:
## Reset the index so the row labels run 0-5 instead of repeating 0-2.
## drop = True discards the old index rather than inserting it as a new column

concat_data = concat_data.reset_index(drop = True)
print(concat_data)

   Col1  Col2  Col3
0     1     4     7
1     2     5     8
2     3     6     9
3     1     4     7
4     2     5     8
5     3     6     9


## Learning to merge with pandas

In [7]:
## Read the product workbook from the ../data folder into a dataframe
product_data = pd.read_excel('../data/WA_Product_Data.xlsx')

In [8]:
## Read the sales workbook from the ../data folder into a dataframe
sales_data = pd.read_excel('../data/WA_Sales_Data.xlsx')

### Start getting a feel for the data

In [9]:
## Preview the first five rows of the product data.
## Notice that the Sale_ID in index position 2 is '900000' instead of 3

product_data.head()

Unnamed: 0,Sale_ID,Retailer_country,Order_method_type,Retailer_type,Product_line,Product_type,Product
0,1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set
1,2,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame
2,900000,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome
3,4,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2
4,5,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite


In [10]:
## Summarize product_data: number of rows, plus the dtype and non-null count of each column
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88475 entries, 0 to 88474
Data columns (total 7 columns):
Sale_ID              88475 non-null int64
Retailer_country     88475 non-null object
Order_method_type    88475 non-null object
Retailer_type        88475 non-null object
Product_line         88475 non-null object
Product_type         88475 non-null object
Product              88475 non-null object
dtypes: int64(1), object(6)
memory usage: 4.7+ MB


In [11]:
## Preview the first five rows of the sales data.
## Notice that the Sale_ID in index position 1 is '3' instead of 2

sales_data.head()

Unnamed: 0,Sale_ID,Year,Quarter,Revenue,Quantity
0,1,2012,Q1 2012,59628.66,489
1,3,2012,Q1 2012,89940.48,147
2,4,2012,Q1 2012,165883.41,303
3,5,2012,Q1 2012,119822.2,1415
4,6,2012,Q1 2012,87728.96,352


In [12]:
## Summarize sales_data: number of rows, plus the dtype and non-null count of each column
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88164 entries, 0 to 88163
Data columns (total 5 columns):
Sale_ID     88164 non-null int64
Year        88164 non-null int64
Quarter     88164 non-null object
Revenue     88164 non-null float64
Quantity    88164 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 3.4+ MB


### Performing a join between the two dataframes

In [16]:
## Merge won't work if you join on fields that are different types (ex - can't merge a string with an int)
## .iloc[0] reads the first value by position, so the check does not depend on the index labels

print(type(product_data['Sale_ID'].iloc[0]))
print(type(sales_data['Sale_ID'].iloc[0]))

<class 'numpy.int64'>
<class 'numpy.int64'>


In [17]:
## pandas syntax - pd.merge(left dataframe, right dataframe, how to merge, column to merge on)
## A left join keeps every row of the left dataframe; right-hand columns are NaN where no Sale_ID matches

combined_data = pd.merge(
    left = product_data,
    right = sales_data,
    how = 'left',
    on = 'Sale_ID',
)

In [18]:
## Preview the first five rows of the merged data: product columns followed by sales columns
combined_data.head()

Unnamed: 0,Sale_ID,Retailer_country,Order_method_type,Retailer_type,Product_line,Product_type,Product,Year,Quarter,Revenue,Quantity
0,1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012.0,Q1 2012,59628.66,489.0
1,2,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,,,,
2,900000,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,,,,
3,4,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012.0,Q1 2012,165883.41,303.0
4,5,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012.0,Q1 2012,119822.2,1415.0


In [19]:
## Why do we have NaN values for Sale_ID 2 (index position 1) in our sales columns (Year, Quarter, Revenue, Quantity)?
## Look for that Sale_ID in sales_data - an empty result means there is no sales record to join

sales_data.query('Sale_ID == 2')

Unnamed: 0,Sale_ID,Year,Quarter,Revenue,Quantity


In [21]:
## Count the missing (NaN) values in each column of the merged dataframe

combined_data.isna().sum()

Sale_ID                0
Retailer_country       0
Order_method_type      0
Retailer_type          0
Product_line           0
Product_type           0
Product                0
Year                 312
Quarter              312
Revenue              312
Quantity             312
dtype: int64