# Data Consistency Checks

### This script contains the following points:
### 1. Importing libraries and dataframes
### 2. How to check for mixed-type data
### 3. Checking Missing Values in df_prods
#### 3.1. Addressing Missing Values


## 1. Importing libraries and dataframes

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Loading the input dataframes

# root folder of the project
path = r'C:\Users\Marc\Mariana_CareerFoundry'

# full locations of the two input files
ords_file = os.path.join(path, '02_Data', '2-2_Prepared', 'orders_wrangled.csv')
prods_file = os.path.join(path, '02_Data', '2-1_Original', 'products.csv')

# orders: the wrangled file from the prepared-data folder
df_ords = pd.read_csv(ords_file, index_col = False)

# products: the file from the original-data folder
df_prods = pd.read_csv(prods_file, index_col = False)

## 2. How to check for mixed-type data

In [3]:
# Build a small test dataframe whose single column 'mix' deliberately
# holds values of different types (two strings, an integer and a boolean)
df_test = pd.DataFrame({'mix': ['a', 'b', 1, True]})

# Display the first rows of df_test
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [4]:
        # Checking if dataframe contains mixed-type columns

        # for col in df_test.columns.tolist():
        #  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
        #  if len (df_test[weird]) > 0:
        #    print (col)
    
# When I used the command above I got the warning below (applymap is deprecated)
    
        #  C:\Users\Marc\AppData\Local\Temp\ipykernel_7096\2250661989.py:4: FutureWarning: DataFrame.applymap has been deprecated. 
        #  Use DataFrame.map instead.
        #  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)

# Solution: replace 'weird = (df_test[[col]].applymap(type)..' with 'weird = (df_test[[col]].map(type)..'

In [5]:
# Detecting columns that hold more than one data type

# For each column, the type of every cell is compared with the type of the cell
# in the first row; 'weird' is a boolean mask marking the rows whose type differs.
# If at least one row is marked, the name of that column is printed.

for col in df_test.columns.tolist():
    single_col = df_test[[col]]
    cell_types = single_col.map(type)
    first_row_type = single_col.iloc[0].apply(type)
    weird = (cell_types != first_row_type).any(axis = 1)
    if weird.any():
        print(col)

mix


In [6]:
# Cast every value of 'mix' to string so the column holds a single type
mix_as_text = df_test['mix'].astype(str)
df_test['mix'] = mix_as_text
df_test.dtypes

mix    object
dtype: object

## 3. Checking Missing Values in df_prods

### 16 values are missing
### a new dataframe without the 16 values was created

In [7]:
# Counts the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [8]:
# Builds a subset holding only the rows whose product name is missing
missing_name_mask = df_prods['product_name'].isna()
df_nan_prods = df_prods[missing_name_mask]
df_nan_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### 3.1. Addressing Missing Values

In [9]:
# Preparing to build a new dataframe without the missing product names

# Record the (rows, columns) shape of the original dataframe first, so it can be
# compared with the shape of the cleaned dataframe afterwards
df_prods.shape

(49693, 5)

In [10]:
# Builds a new dataframe that keeps only the rows with a product name
df_prods_clean = df_prods.dropna(subset = ['product_name'])

# (rows, columns) of the cleaned dataframe
df_prods_clean.shape

(49677, 5)

## 4. Duplicates in df_prods

### 5 duplicates were found

### a new dataset without the duplicates was created

In [11]:
# Collects the rows of df_prods_clean that are exact repeats of an earlier row
repeated_rows = df_prods_clean.duplicated()
df_dups_prods = df_prods_clean.loc[repeated_rows]
df_dups_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


### 4.1. Addressing Duplicates

In [12]:
# Preparing to build a new dataframe without the duplicated rows

# Record the (rows, columns) shape of df_prods_clean first, so it can be compared
# with the shape of the de-duplicated dataframe afterwards
df_prods_clean.shape

(49677, 5)

In [13]:
# Builds a new dataframe that keeps only the first occurrence of each row
df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

# (rows, columns) of the de-duplicated dataframe
df_prods_clean_no_dups.shape

(49672, 5)

## 5. Checking stats of df_ords

### The count row is not consistent, the last column deviates - maybe NaN?

In [14]:
# Check basic descriptive statistics of df_ords; a 'count' value that is lower
# than in the other columns points to missing values in that column
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


## 6. Checking Mixed-Type Data of df_ords

### No mixed-data type

In [15]:
# Detecting columns of df_ords that hold more than one data type
# For each column, the type of every cell is compared with the type of the cell
# in the first row; 'weird' is a boolean mask marking the rows whose type differs.
# If at least one row is marked, the name of that column is printed.

for col in df_ords.columns.tolist():
    single_col = df_ords[[col]]
    cell_types = single_col.map(type)
    first_row_type = single_col.iloc[0].apply(type)
    weird = (cell_types != first_row_type).any(axis = 1)
    if weird.any():
        print(col)

# No columns were flagged with mixed data type

In [16]:
# Checking the data type pandas assigned to each column of df_ords
df_ords.dtypes

Unnamed: 0                  int64
order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

## 7. Checking missing values df_ords

### 206209 values in 'days_since_prior_order' missing

In [17]:
# Counts the missing values in each column of df_ords
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [19]:
# Builds a subset holding only the rows where 'days_since_prior_order' is missing
no_prior_order = df_ords['days_since_prior_order'].isna()
df_nan_ords = df_ords[no_prior_order]
df_nan_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,


In [20]:
# Checking the (rows, columns) shape of the subset that holds the rows with missing values
df_nan_ords.shape

(206209, 8)

### 7.1. Addressing Missing values

In [21]:
# Checking first the (rows, columns) shape of the original dataframe, for
# comparison with the size of the missing-values subset
df_ords.shape

(3421083, 8)

#### Solution for missing values = do nothing.

The missing values only appear for the first order of each user. This makes sense because there are no days before the first order.

The number of rows in the dataframe of missing values is exactly the same as the number of user ids.

The information in all other columns is still valid. If I do an analysis of (or including) the specific column "days_since_prior_order", I can address the missing values then by excluding those rows if needed.

If I immediately drop the data, I might lose 206209 perfectly usable datapoints for many other analyses (such as day or time of purchase). Therefore, I will just flag it and take care of it later if necessary.


## 8. Checking duplicates in df_ords

### No duplicated data

In [22]:
# Collecting the rows of df_ords that are exact repeats of an earlier row
repeated_orders = df_ords.duplicated()
df_dups = df_ords.loc[repeated_orders]
df_dups.shape

# No duplicated data

(0, 8)

## 9. Exporting Changes

In [23]:
# Exporting the checked dataframes to the prepared-data folder

# df_ords was reviewed but not modified (its missing values are kept on purpose),
# so it is exported unchanged under a new name that flags the review.
# index = False keeps pandas from writing the row index as an extra unnamed column,
# which would show up as another 'Unnamed: 0' column when the file is read back.
df_ords.to_csv(os.path.join(path, '02_Data', '2-2_Prepared', 'orders_checked.csv'), index = False)

# df_prods_clean_no_dups is df_prods without the missing product names and without the duplicated rows
df_prods_clean_no_dups.to_csv(os.path.join(path, '02_Data', '2-2_Prepared', 'products_checked.csv'), index = False)