JOINING TWO CSV FILES USING INNER JOIN IN PANDAS 

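-- DataFrame.join matches rows on the index, so we first set the key column of each table as its index (customer_id for customers, order_customer_id for orders)
-- The default join type of DataFrame.join is left, so we pass how = 'inner' explicitly
-- We will be joining the orders and customers files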



In [None]:
import json
from pathlib import Path

import pandas as pd

In [None]:
## helper that looks up the column headers of a table in the schema

def get_column_names (schemas, file_name, sorting_key = 'column_position'):
    """Return the column names of one table, ordered by a schema field.

    schemas     -- mapping of table name -> collection of per-column dicts
                   (loaded from the schemas json file)
    file_name   -- key of the table to look up in ``schemas``
    sorting_key -- field of each column dict used to order the columns

    Every column dict must contain 'column_name' and the ``sorting_key`` field;
    a missing table or field raises KeyError.
    """
    # copy first so the caller's schema list is not reordered
    ordered = list(schemas[file_name])
    ordered.sort(key = lambda entry: entry[sorting_key])

    names = []
    for entry in ordered:
        names.append(entry['column_name'])
    return names

In [None]:

# Root folder of the retail_db data set. Every file below is resolved relative to it,
# so this is the only value to change when the data lives somewhere else.
DATA_DIR = Path(r'C:\Users\HAKEEM OLUWATOBI\Research\data\retail_db')

# the schema is a json file; the with-block closes the file handle once it is parsed
with open(DATA_DIR / 'schemas.json') as schema_file:
    schemas = json.load(schema_file)

# variable to get the column headers for orders table
orders_columnHeader = get_column_names (schemas, 'orders')

# variable to get the column headers for customers table
customers_columnHeader = get_column_names (schemas, 'customers')

# reading the orders table into a df; names= supplies the column headers
orders_df = pd.read_csv(
    DATA_DIR / 'orders' / 'part-00000',
    names = orders_columnHeader
)

# reading the customers table into a df; names= supplies the column headers
customers_df = pd.read_csv(
    DATA_DIR / 'customers' / 'part-00000',
    names = customers_columnHeader
)


In [None]:
# Preview the customers table; the output below is a truncated view (first and last rows).
customers_df


       customer_id customer_fname customer_lname customer_email  \
0                1        Richard      Hernandez      XXXXXXXXX   
1                2           Mary        Barrett      XXXXXXXXX   
2                3            Ann          Smith      XXXXXXXXX   
3                4           Mary          Jones      XXXXXXXXX   
4                5         Robert         Hudson      XXXXXXXXX   
...            ...            ...            ...            ...   
12430        12431           Mary           Rios      XXXXXXXXX   
12431        12432         Angela          Smith      XXXXXXXXX   
12432        12433       Benjamin         Garcia      XXXXXXXXX   
12433        12434           Mary          Mills      XXXXXXXXX   
12434        12435          Laura         Horton      XXXXXXXXX   

      customer_password             customer_street customer_city  \
0             XXXXXXXXX          6303 Heather Plaza   Brownsville   
1             XXXXXXXXX     9526 Noble Embers Ridge     L

In [None]:
# Preview the orders table; the output below is a truncated view (first and last rows).
orders_df

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [None]:
## creating the index / join key
# order_customer_id (in orders) is the foreign key that refers to customer_id,
# the primary key of customers
# set_index moves each key column into the index so that join can match on it
#
# The guards keep this cell safe to re-run: once a key column has become the index,
# calling set_index on it again would raise a KeyError.

if orders_df.index.name != 'order_customer_id':
    orders_df = orders_df.set_index('order_customer_id')

if customers_df.index.name != 'customer_id':
    customers_df = customers_df.set_index('customer_id')

In [None]:
# Joining both tables on their index (inner join keeps only matching keys)

CustomersAndOrders_df = (
    customers_df
    .join(orders_df, how = 'inner')
)

CustomersAndOrders_df               # 8 customer columns + 3 order columns = 11 columns

Unnamed: 0_level_0,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode,order_id,order_date,order_status
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521,22945,2013-12-13 00:00:00.0,COMPLETE
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,15192,2013-10-29 00:00:00.0,PENDING_PAYMENT
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,33865,2014-02-18 00:00:00.0,COMPLETE
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,57963,2013-08-02 00:00:00.0,ON_HOLD
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,67863,2013-11-30 00:00:00.0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...
12434,Mary,Mills,XXXXXXXXX,XXXXXXXXX,9720 Colonial Parade,Caguas,PR,725,42915,2014-04-16 00:00:00.0,COMPLETE
12434,Mary,Mills,XXXXXXXXX,XXXXXXXXX,9720 Colonial Parade,Caguas,PR,725,51800,2014-06-14 00:00:00.0,ON_HOLD
12434,Mary,Mills,XXXXXXXXX,XXXXXXXXX,9720 Colonial Parade,Caguas,PR,725,61777,2013-12-26 00:00:00.0,COMPLETE
12435,Laura,Horton,XXXXXXXXX,XXXXXXXXX,5736 Honey Downs,Summerville,SC,29483,41643,2014-04-08 00:00:00.0,PENDING


In [None]:
# (rows, columns) of the joined frame
CustomersAndOrders_df.shape

(68883, 11)

In [None]:
# Now that the data is joined, customer_id is the index. reset_index turns it back
# into a regular column, which is handy for aggregations on CustomersAndOrders_df.
# Note: the result is only displayed here; it is not assigned back to CustomersAndOrders_df.

CustomersAndOrders_df.reset_index(names = 'customer_id')


Unnamed: 0,customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode,order_id,order_date,order_status
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521,22945,2013-12-13 00:00:00.0,COMPLETE
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,15192,2013-10-29 00:00:00.0,PENDING_PAYMENT
2,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,33865,2014-02-18 00:00:00.0,COMPLETE
3,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,57963,2013-08-02 00:00:00.0,ON_HOLD
4,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,67863,2013-11-30 00:00:00.0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
68878,12434,Mary,Mills,XXXXXXXXX,XXXXXXXXX,9720 Colonial Parade,Caguas,PR,725,42915,2014-04-16 00:00:00.0,COMPLETE
68879,12434,Mary,Mills,XXXXXXXXX,XXXXXXXXX,9720 Colonial Parade,Caguas,PR,725,51800,2014-06-14 00:00:00.0,ON_HOLD
68880,12434,Mary,Mills,XXXXXXXXX,XXXXXXXXX,9720 Colonial Parade,Caguas,PR,725,61777,2013-12-26 00:00:00.0,COMPLETE
68881,12435,Laura,Horton,XXXXXXXXX,XXXXXXXXX,5736 Honey Downs,Summerville,SC,29483,41643,2014-04-08 00:00:00.0,PENDING


In [None]:
## number of orders placed by each customer
# group the joined rows by customer_id and count the order_id values in each group

(
    CustomersAndOrders_df
    .groupby('customer_id')['order_id']
    .agg(order_count = 'count')
)

Unnamed: 0_level_0,order_count
customer_id,Unnamed: 1_level_1
1,1
2,4
3,7
4,6
5,4
...,...
12431,16
12432,10
12433,4
12434,8


In [None]:
# customers with 15 or more orders (the filter is >= 15, so exactly 15 is included)

(
    CustomersAndOrders_df
    .groupby('customer_id')['order_id']
    .agg(order_count = 'count')
    .query('order_count >= 15')
)

Unnamed: 0_level_0,order_count
customer_id,Unnamed: 1_level_1
221,15
569,16
4320,15
5283,15
5624,15
5654,15
5897,16
6316,16
12284,15
12431,16


In [None]:
## sorting
# sort the orders on order_date
# only the last expression of a cell is rendered, so the output is the descending sort

orders_df.sort_values(by = 'order_date', ascending = True)        # ascending (the default)
orders_df.sort_values(by = 'order_date', ascending = False)       # descending

Unnamed: 0_level_0,order_id,order_date,order_status
order_customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2621,57751,2014-07-24 00:00:00.0,CLOSED
6463,57628,2014-07-24 00:00:00.0,CLOSED
2989,57637,2014-07-24 00:00:00.0,PENDING_PAYMENT
1211,57636,2014-07-24 00:00:00.0,PENDING
10211,57635,2014-07-24 00:00:00.0,COMPLETE
...,...,...,...
12294,57787,2013-07-25 00:00:00.0,PENDING_PAYMENT
5711,57788,2013-07-25 00:00:00.0,COMPLETE
5293,57789,2013-07-25 00:00:00.0,COMPLETE
2256,93,2013-07-25 00:00:00.0,PENDING_PAYMENT


In [None]:
## composite sorting: more than one sort key
## keys are order_customer_id first, then order_date
# only the last expression of a cell is rendered, so the output is the second sort

composite_keys = ['order_customer_id', 'order_date']

# both keys ascending (the default)
orders_df.sort_values(by = composite_keys)

# one direction per key: order_customer_id ascending, order_date descending
orders_df.sort_values(by = composite_keys, ascending = [True, False])

Unnamed: 0_level_0,order_id,order_date,order_status
order_customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,22945,2013-12-13 00:00:00.0,COMPLETE
2,33865,2014-02-18 00:00:00.0,COMPLETE
2,67863,2013-11-30 00:00:00.0,COMPLETE
2,15192,2013-10-29 00:00:00.0,PENDING_PAYMENT
2,57963,2013-08-02 00:00:00.0,ON_HOLD
...,...,...,...
12434,5303,2013-08-26 00:00:00.0,PENDING
12434,4799,2013-08-23 00:00:00.0,PENDING_PAYMENT
12434,1868,2013-08-03 00:00:00.0,CLOSED
12435,41643,2014-04-08 00:00:00.0,PENDING
