# 4.6 Combining Data Part 1

### This script contains the following points:
1. Importing libraries
2. Importing data
3. Merging dataframes
4. Exporting merged file in pickle format

### 1. Importing Libraries

In [25]:
import pandas as pd
import numpy as np
import os

### 2. Importing Data

In [26]:
# Root folder of the Instacart project; the data files read and written in this
# script are located with os.path.join(path, ...).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
path = r'C:\Users\keely\Documents\Courses\CareerFoundry\Immersion\Achievement 4 - Python\01-2023 Instacart Basket Analysis'

In [3]:
# Load the raw "prior" order-products CSV from the Original Data folder.
# index_col=False tells pandas not to use the first column as the index.
prior_csv_file = os.path.join(path, '02 Data', 'Original Data', 'order_products__prior.csv')
df_ords_prior = pd.read_csv(prior_csv_file, index_col=False)

In [4]:
df_ords_prior

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [5]:
# Checking number of rows and columns of df_ords_prior

df_ords_prior.shape

(32434489, 4)

In [6]:
# Inspect the column dtypes of df_ords_prior so they can be compared with the
# columns it shares with df_ords.

df_ords_prior.dtypes

# order_id should be converted to the same dtype that column has in df_ords.
# Columns that exist only in df_ords_prior can be downcast to the smallest
# dtype that still fits their values.

order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

In [7]:
# Count the missing values in each column of df_ords_prior.

df_ords_prior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [8]:
# Looking for duplicates in df_ords_prior:
# flag every row that fully repeats an earlier row, then keep only those rows.

is_repeat_row = df_ords_prior.duplicated()
df_ords_prior_dups = df_ords_prior.loc[is_repeat_row]

df_ords_prior_dups

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


In [9]:
# 3) Check for mixed-type data in your df_ords_prior dataframe.

# A column counts as "mixed" when any value's Python type differs from the type
# of the column's first value. Series.map is used instead of the deprecated
# DataFrame.applymap; the reference type is taken from the mapped series so both
# sides of the comparison go through the same conversion.
for col in df_ords_prior.columns:
    value_types = df_ords_prior[col].map(type)
    weird = value_types != value_types.iloc[0]
    # .any() answers "is there at least one odd row?" without building a
    # filtered copy of the whole dataframe.
    if weird.any():
        print(col, ' mixed')
    else:
        print(col, ' consistent')

order_id  consistent
product_id  consistent
add_to_cart_order  consistent
reordered  consistent


In [10]:
# Load the previously checked orders dataframe from the Prepared Data folder.

orders_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.pkl')
df_ords = pd.read_pickle(orders_pickle_file)

In [11]:
# Checking number of rows and columns of df_ords

df_ords.shape

(3421083, 7)

In [12]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


In [13]:
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int16
orders_day_of_week           int8
order_hour_of_day            int8
days_since_prior_order    float16
new_customer                 bool
dtype: object

In [14]:
df_ords_prior.dtypes

order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

In [15]:
df_ords_prior['product_id'].max()

49688

In [16]:
df_ords_prior['add_to_cart_order'].max()

145

In [17]:
df_ords_prior['reordered'].max()

1

In [18]:
# Changing data types in df_ords_prior.
# order_id becomes a string so it matches the object dtype of order_id in
# df_ords; the remaining columns are downcast to smaller integer types that
# still hold the maxima checked above (49688, 145 and 1).
target_dtypes = {
    'order_id': 'str',
    'product_id': 'int32',
    'add_to_cart_order': 'int16',
    'reordered': 'int8',
}

for column_name, new_dtype in target_dtypes.items():
    df_ords_prior[column_name] = df_ords_prior[column_name].astype(new_dtype)


### 3. Merge Dataframes

In [19]:
# 1) Merge dataframes.
# Inner join (the pandas default) on the shared order_id key. indicator=True
# adds a `_merge` column recording which side each row came from; because an
# inner join keeps only matched rows, that column is 'both' for every row.

df_ords_merged = pd.merge(
    df_ords,
    df_ords_prior,
    how='inner',
    on='order_id',
    indicator=True,
)


In [21]:
df_ords_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196,1,0,both
1,2539329,1,1,2,8,,True,14084,2,0,both
2,2539329,1,1,2,8,,True,12427,3,0,both
3,2539329,1,1,2,8,,True,26088,4,0,both
4,2539329,1,1,2,8,,True,26405,5,0,both


In [22]:
df_ords_merged['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [23]:
df_ords_merged.shape

(32434489, 11)

### 4. Export Merged File as Pickle Format

In [24]:
# 2) Exporting as a Pickle File

# Save the merged dataframe as a .pkl file in the Prepared Data folder.
# Pickle is faster than CSV, but unlike a CSV file it cannot be imported
# partially (only certain rows or columns).

merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
df_ords_merged.to_pickle(merged_pickle_file)