# DATA WRANGLING
___________________________________________________________________________________________________________________

### 1. Case study question:
#### Use data wrangling tools to calculate the total revenue of each product at each price, based on two sample datasets:

- prices.csv: history of each product's price changes in Sep 2018
- sales.csv: records of product sales in Sep 2018

### 2. Solution:
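Here I use `pd.merge_asof()` to match each order with the nearest price change of the same product, and then pick the old or the new price depending on whether the order was placed before or after that change.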

In [1]:
#import necessary package

import pandas as pd
import numpy as np

In [2]:
# Load the two sample datasets from the working directory:
# the price-change history first, then the sales records.

prices_df, sales_df = (pd.read_csv(csv_name) for csv_name in ('prices.csv', 'sales.csv'))

In [3]:
# Inspect the schema of both frames. In the summaries printed below, every
# column's Non-Null Count equals the number of entries, i.e. there are no
# missing values. The timestamp columns are still of dtype object at this point.

prices_df.info()
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  14 non-null     int64 
 1   old_price   14 non-null     int64 
 2   new_price   14 non-null     int64 
 3   updated_at  14 non-null     object
dtypes: int64(3), object(1)
memory usage: 576.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   product_id        175 non-null    int64 
 1   ordered_at        175 non-null    object
 2   quantity_ordered  175 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.2+ KB


In [4]:
# Parse the object-dtype timestamp columns into datetime values so that they
# can be sorted chronologically and compared with each other later on.

sales_df['ordered_at'] = sales_df['ordered_at'].pipe(pd.to_datetime)
prices_df['updated_at'] = prices_df['updated_at'].pipe(pd.to_datetime)

In [5]:
# Put the price changes in chronological order; pd.merge_asof requires the
# right-hand key column to be sorted.
prices_df = prices_df.sort_values(by='updated_at', ascending=True)
prices_df

Unnamed: 0,product_id,old_price,new_price,updated_at
4,3954203,68800,60000,2018-09-10 16:32:00
7,3998909,19000,17000,2018-09-10 16:35:00
0,64,270000,239000,2018-09-10 16:37:00
11,4085861,60000,53500,2018-09-11 08:51:00
1,3954203,60000,64000,2018-09-11 11:54:00
9,4085861,53500,67000,2018-09-12 03:51:00
6,3998909,17000,15500,2018-09-13 06:43:00
13,4085861,67000,62500,2018-09-13 06:43:00
3,3954203,64000,60500,2018-09-15 03:49:00
12,4085861,62500,58000,2018-09-15 03:51:00


In [6]:
# Put the orders in chronological order; pd.merge_asof requires the
# left-hand key column to be sorted.
sales_df = sales_df.sort_values(by='ordered_at', ascending=True)
sales_df

Unnamed: 0,product_id,ordered_at,quantity_ordered
86,3954203,2018-09-11 01:43:00,1
28,4085861,2018-09-11 06:26:00,1
26,4085861,2018-09-11 06:53:00,1
27,4085861,2018-09-11 08:24:00,1
123,4085861,2018-09-11 09:30:00,1
...,...,...,...
67,4085861,2018-09-18 20:23:00,1
77,4085861,2018-09-18 20:43:00,1
79,4085861,2018-09-18 20:54:00,1
87,3954203,2018-09-18 21:26:00,1


In [7]:
# For every order, attach the price change of the same product whose
# timestamp is closest to the order time. With direction='nearest' the matched
# change may lie before OR after the order, so the applicable price still has
# to be chosen afterwards by comparing the two timestamps.

merged_df = pd.merge_asof(
    left=sales_df,
    right=prices_df,
    left_on='ordered_at',
    right_on='updated_at',
    by='product_id',
    direction='nearest',
)

In [9]:
# Choose the price that was in force when the order was placed:
#   - order at or after the matched price change -> the new price applies
#   - order before the matched price change      -> the old price still applies

merged_df['final_price'] = merged_df['new_price'].where(
    merged_df['ordered_at'] >= merged_df['updated_at'],
    merged_df['old_price'],
)
merged_df.head(10)

Unnamed: 0,product_id,ordered_at,quantity_ordered,old_price,new_price,updated_at,final_price
0,3954203,2018-09-11 01:43:00,1,68800,60000,2018-09-10 16:32:00,60000
1,4085861,2018-09-11 06:26:00,1,60000,53500,2018-09-11 08:51:00,60000
2,4085861,2018-09-11 06:53:00,1,60000,53500,2018-09-11 08:51:00,60000
3,4085861,2018-09-11 08:24:00,1,60000,53500,2018-09-11 08:51:00,60000
4,4085861,2018-09-11 09:30:00,1,60000,53500,2018-09-11 08:51:00,53500
5,4085861,2018-09-11 11:06:00,1,60000,53500,2018-09-11 08:51:00,53500
6,3954203,2018-09-11 11:11:00,1,60000,64000,2018-09-11 11:54:00,60000
7,3954203,2018-09-11 11:11:00,1,60000,64000,2018-09-11 11:54:00,60000
8,4085861,2018-09-11 11:34:00,1,60000,53500,2018-09-11 08:51:00,53500
9,4085861,2018-09-11 11:47:00,2,60000,53500,2018-09-11 08:51:00,53500


In [14]:
# Revenue of a single order = unit price in force * quantity ordered.
# Summing it per (product, price) pair gives the revenue earned at each price.

merged_df['revenue'] = merged_df['final_price'].mul(merged_df['quantity_ordered'])
revenue_df = (
    merged_df
    .groupby(['product_id', 'final_price'], as_index=False)['revenue']
    .sum()
)
revenue_df

Unnamed: 0,product_id,final_price,revenue
0,64,239000,956000
1,3954203,57500,57500
2,3954203,60000,180000
3,3954203,64000,640000
4,3998909,15500,15500
5,3998909,16500,231000
6,3998909,17000,34000
7,4085861,52000,1040000
8,4085861,53500,2140000
9,4085861,58000,2204000


In [15]:
# Collapse the per-order revenue to one total per product, across all prices.

total_revenue = (
    merged_df
    .groupby('product_id', as_index=False)['revenue']
    .sum()
)
total_revenue

Unnamed: 0,product_id,revenue
0,64,956000
1,3954203,877500
2,3998909,280500
3,4085861,8247500
