# 01. Importing Libraries

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [3]:
# Project folder path.
# The location can be overridden per machine through the INSTACART_PROJECT_DIR
# environment variable; when that variable is not set, the original local
# folder is used, so existing runs behave exactly as before.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\maryn\Documents\Data Projects\Instacart Basket Analysis',
)

In [4]:
# Load the wrangled orders table from the "Prepared Data" folder.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column in the outputs
# below - presumably a row index written by an earlier export; confirm
# whether it is still needed.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

In [5]:
# Load the original products table from the "Original Data" folder
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

# 03. Data Consistency Checks

In [18]:
# Summary statistics for df_ords, cast to integers for a compact display
# (the cast truncates the fractional part of mean/std/quantiles)
(
    df_ords
    .describe()
    .astype(int)
)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083,3421083,3421083,3421083,3421083,3421083,3214874
mean,1710541,1710542,102978,17,2,13,11
std,987581,987581,59533,17,2,4,9
min,0,1,1,1,0,0,0
25%,855270,855271,51394,5,1,10,4
50%,1710541,1710542,102689,11,3,13,7
75%,2565811,2565812,154385,23,5,16,15
max,3421082,3421083,206209,100,6,23,30


## Mixed-Type Data

In [7]:
# Start from an empty dataframe that will hold the mixed-type demo column
df_test = pd.DataFrame(data=None)

In [8]:
# Fill 'mix' with two strings, an integer and a boolean so that the column
# deliberately holds more than one Python type
df_test['mix'] = [
    'a',
    'b',
    1,
    True,
]

In [9]:
# Preview the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# Check for mixed types
# A column is reported when its values are not all of one Python type.
# Counting the distinct types per column avoids building a filtered copy of
# the dataframe just to test whether any row differs from the first one, and
# it does not fail when the dataframe has no rows.
for col in df_test.columns.tolist():
    n_types = df_test[col].map(type).nunique()
    if n_types > 1:
        print(col)

mix


In [11]:
# Cast every value in 'mix' to a string so the column holds a single type
df_test['mix'] = (
    df_test['mix']
    .astype('str')
)

## Missing values

In [12]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# Subset of df_prods holding only the rows whose product_name is missing.
# isnull() already returns a boolean mask, so it is used directly instead of
# being compared with True.
df_nan = df_prods[df_prods['product_name'].isnull()]

In [14]:
# Display the rows of df_prods that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [15]:
# Subset of df_prods keeping only the rows that have a product_name.
# notnull() gives the "value is present" mask directly, which replaces the
# original comparison of isnull() with False.
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [16]:
# Display the products that remain after removing rows without a product_name
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


## Outliers as possible missing values

In [19]:
# Summary statistics for df_prods, cast to integers for a compact display
# (the cast truncates the fractional part of mean/std/quantiles)
(
    df_prods
    .describe()
    .astype(int)
)

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693,49693,49693,49693
mean,24844,67,11,9
std,14343,38,5,453
min,1,1,1,1
25%,12423,35,7,4
50%,24845,69,13,7
75%,37265,100,17,11
max,49688,134,21,99999


In [20]:
# Exploratory outlier check: list the products priced above 100.
# NOTE(review): these rows are only inspected here; none of the cells in this
# notebook change or remove them before the products export - confirm that
# this is handled elsewhere.
df_prods[df_prods['prices'].gt(100)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


## Finding Duplicates

In [22]:
# Collect the rows of df_prods_clean that fully repeat an earlier row
# (the first occurrence of each repeated row is not included)
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [23]:
# Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [24]:
# Build a new dataframe without the repeated rows, keeping the first
# occurrence of each; df_prods_clean itself is left untouched
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [25]:
# Row and column counts of the de-duplicated products dataframe
df_prods_clean_no_dups.shape

(49672, 5)

# 04. Task

### Data Consistency Check for df_ords

### 2.	Run the df.describe() function on your df_ords dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [29]:
# Summary statistics for df_ords, cast to integers for a compact display
# (the cast truncates the fractional part of mean/std/quantiles)
(
    df_ords
    .describe()
    .astype(int)
)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083,3421083,3421083,3421083,3421083,3421083,3214874
mean,1710541,1710542,102978,17,2,13,11
std,987581,987581,59533,17,2,4,9
min,0,1,1,1,0,0,0
25%,855270,855271,51394,5,1,10,4
50%,1710541,1710542,102689,11,3,13,7
75%,2565811,2565812,154385,23,5,16,15
max,3421082,3421083,206209,100,6,23,30


#### The variable 'order_id' has a minimum value of 1 and a maximum value of 3421083, which is equal to the total number of records in the data frame.

#### The variable 'user_id' has a minimum value of 1 and a maximum value of 206209.

#### The variable 'order_number' has a minimum value of 1 and a maximum value of 100.

#### The variable 'orders_day_of_week' has a minimum value of 0 and a maximum value of 6, representing 7 days of the week.

#### The variable 'order_hour_of_day' has a minimum value of 0 and a maximum value of 23, representing 24 hours of the day.

#### The variable 'days_since_prior_order' has a minimum value of 0 and a maximum value of 30, and it has 206209 fewer values than the other variables (3214874 versus 3421083). This may be an indication of missing values, but the gap also corresponds to the number of unique users.

### 3. Check for mixed-type data in your df_ords dataframe.

In [30]:
# Check for mixed types in df_ords
# A column is reported when its values are not all of one Python type.
# Counting the distinct types per column avoids building a filtered copy of
# the dataframe just to test whether any row differs from the first one, and
# it does not fail when the dataframe has no rows.
for col in df_ords.columns.tolist():
    n_types = df_ords[col].map(type).nunique()
    if n_types > 1:
        print(col)

### 4.	If you find mixed-type data, fix it. The column in question should contain observations of a single data type.

#### No mixed data type found

### 5.	Run a check for missing values in your df_ords dataframe.

In [31]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [32]:
# Subset of df_ords holding only the rows with no days_since_prior_order.
# isnull() already returns a boolean mask, so it is used directly instead of
# being compared with True.
df_ords_missing_values = df_ords[df_ords['days_since_prior_order'].isnull()]

In [33]:
# Display the orders that have no days_since_prior_order value
df_ords_missing_values

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


### 6.	Address the missing values using an appropriate method.

#### There are 206209 missing values, as we also suspected earlier. Since this is equal to the number of users, these are most likely each user's first order, and it is logical not to have a value in the days_since_prior_order column for a first order. It was therefore decided to leave the missing data unchanged.

### 7.	Run a check for duplicate values in your df_ords data.

In [34]:
# Collect the rows of df_ords that fully repeat an earlier row
# (the first occurrence of each repeated row is not included)
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [35]:
# Display the duplicated order rows (empty when there are none)
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


#### No duplicates were found.

### 8.	Address the duplicates using an appropriate method.

#### No action is required, since no duplicates were found.

# 05. Exporting Data

In [26]:
# Write the cleaned, de-duplicated products to "products_checked.csv" in the
# "Prepared Data" folder.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; consider index=False once it is confirmed that nothing
# downstream relies on that column.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
)

In [36]:
# Write the checked orders to "orders_checked.csv" in the "Prepared Data"
# folder.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default, and df_ords already carries an 'Unnamed: 0' column; consider
# index=False once it is confirmed that nothing downstream relies on it.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv')
)