# Preparing new Notebook

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import os

In [3]:
# defining path
path = r'C:\Users\lifti\OneDrive\CareerFoundry\Data Immersion\Achievement4\Instacart Basket Analysis'

In [12]:
# Load the raw products table from the 'Original Data' folder.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [7]:
df_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [9]:
# Load orders_wrangled.csv from the 'Prepared Data' folder with all of its
# columns, for a first look at what the file contains.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

In [10]:
df_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,10,5,18,29.0
3421079,3421079,1854736,206209,11,4,10,30.0
3421080,3421080,626363,206209,12,1,12,18.0
3421081,3421081,2977660,206209,13,1,12,7.0


In [19]:
# The import above picked up an unwanted 'Unnamed: 0' column. Name only the
# wanted columns here so the file can be re-imported without it.
var_list = (
    'order_id',
    'user_id',
    'order_number_history',
    'order_day_of_week',
    'order_hour_of_day',
    'days_since_prior_order',
)

In [20]:
var_list

('order_id',
 'user_id',
 'order_number_history',
 'order_day_of_week',
 'order_hour_of_day',
 'days_since_prior_order')

In [24]:
# Re-import orders_wrangled.csv, keeping only the columns named in var_list.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    usecols=var_list,
)

In [25]:
# import mistake corrected
df_ords

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [27]:
# descriptive statistics
df_ords.describe ()

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# Mixed-Type Data

## Create a new dataframe to practice fixing mixed-type columns

In [28]:
# Create an empty dataframe; the practice column is added in the next cell
df_test = pd.DataFrame()

In [29]:
# Create a mixed-type column: two strings, an integer and a boolean
df_test['mix'] = ['a','b', 1, True]

In [30]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


### Checking for mixed-type columns

In [31]:
# Checking for mixed-type columns in df_test.
# A column is printed when its values do not all share one Python type.
# Series.map is used instead of DataFrame.applymap, which is deprecated in
# newer pandas releases.
for col in df_test.columns:
    value_types = df_test[col].map(type)
    # More than one distinct type means the column is mixed. An empty column
    # has zero distinct types and is skipped instead of raising on iloc[0].
    if value_types.nunique() > 1:
        print(col)

mix


### Fix columns

#### The first step is deciding what single data type the column in question should be. If your column contained mostly names, for instance, it should be a string. If it contained mostly order numbers, it should be a numeric value of some sort.

In [33]:
df_test.dtypes

mix    object
dtype: object

In [50]:
# convert every value in 'mix' to a string so the column holds a single type
# (the dtype is still reported as object afterwards, as the next cell shows)
df_test['mix'] = df_test['mix'].astype('str')

In [51]:
df_test.dtypes

mix    object
dtype: object

# Missing Values

## Finding Missing Values

In [52]:
# finding missing values
# .isnull() returns True for every missing value and False otherwise
# .sum() adds up the True values (each counts as 1), which gives the number
# of missing values per column in a readable form
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [62]:
# Subset the rows whose product_name is missing, to have a closer look at them.
# The boolean Series returned by .isnull() is used directly as the row mask;
# comparing it with True is redundant.
df_nan = df_prods[df_prods['product_name'].isnull()]

In [63]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Addressing Missing Values

### 1. Create a new variable that acts like a flag based on the missing value.

### 2. Impute the value with the mean or median of the column (if the variable is numeric).

### 3. Remove or filter out the missing data.

#### For this kind of data only option 3 is feasible.

In [64]:
df_prods.shape

(49693, 5)

In [67]:
# Keep only the rows that have a product_name (drops the missing values).
# .notna() is the direct inverse of .isnull(), so no comparison with False
# is needed.
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [68]:
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


# Duplicates

#### Oftentimes, the duplicates you encounter won’t make sense to you, so you’ll need clarification from your client before proceeding with any data manipulation.

### Finding duplicates

In [72]:
# Collect every row that duplicated() flags as a duplicate.
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [74]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


### Addressing duplicates

In [109]:
# Build a new dataframe with the duplicate rows removed; df_prods_clean
# itself is left unchanged.
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [110]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [111]:
# Exporting cleaned data df_prods_clean_no_dups.
# index=False keeps the row index out of the file; otherwise it is written as
# an extra unnamed first column that shows up as 'Unnamed: 0' on re-import.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)

# TASK

## 01. If you haven’t performed the consistency checks covered in this Exercise on your df_prods dataframe, do so now.

#### I have done this already.

## 02. Run the df.describe() function on your df_prods dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [114]:
# descriptive analysis
df_prods_clean_no_dups.describe ()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


#### There seems to be something wrong with the prices column. The maximum is 99999, which seems far too high compared to the mean and the median (50%).

## 03. Check for mixed-type data in your df_ords dataframe.

In [117]:
# Same mixed-type check as in the learning part, applied to the 'df_ords'
# dataframe. A column is printed when its values do not all share one
# Python type. Series.map is used instead of DataFrame.applymap, which is
# deprecated in newer pandas releases.
for col in df_ords.columns:
    value_types = df_ords[col].map(type)
    # More than one distinct type means the column is mixed. An empty column
    # has zero distinct types and is skipped instead of raising on iloc[0].
    if value_types.nunique() > 1:
        print(col)

#### The code did not print any mixed-type column for the 'df_ords' dataframe.

## 04. If you find mixed-type data, fix it. The column in question should contain observations of a single data type.

#### There is no mixed-type data. Also, I quote the learning part "Your Instacart data has already undergone all these data-prep checks, and you know there aren’t any mixed-type columns."

## 05. Run a check for missing values in your df_ords dataframe.


In [122]:
# finding missing values
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number_history           0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### There are 206,209 missing values in the column 'days_since_prior_order'.

## 05a. In a markdown cell, report your findings and propose an explanation for any missing values you find.

In [126]:
# looking for an explanation for the missing values with descriptive statistics
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [125]:
# finding an explanation for the missing values - looking at the data sets
df_ords.head(30)

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [128]:
# The NaN values appear to sit on each customer's first order.
# To check this, subset the rows where 'order_number_history' equals 1.
df_ords_nan = df_ords.loc[df_ords['order_number_history'] == 1]

In [130]:
# comparing missing values to the shape of the df_ords_nan
df_ords_nan.isnull().sum()

order_id                       0
user_id                        0
order_number_history           0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [133]:
# comparing missing values to the shape of the df_ords_nan
df_ords_nan.shape

(206209, 6)

#### The first order of a new customer has no value in the column 'days_since_prior_order': since it is the customer's first order, there is no prior order to measure the difference from.

## 06. Address the missing values using an appropriate method.
## 06a. In a markdown cell, explain why you used your method of choice.

#### I do nothing, because these values are not really missing. A NaN in 'days_since_prior_order' carries the information that, with this order, Instacart won a new customer.

## 07. Run a check for duplicate values in your df_ords data.

In [140]:
# Collect every row of df_ords that duplicated() flags as a duplicate.
# Note: this reuses the name df_dups from the products check above.
df_dups = df_ords.loc[df_ords.duplicated()]

In [141]:
df_dups

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order


#### There are no duplicates in the df_ords dataframe.

## 07a. In a markdown cell, report your findings and propose an explanation for any duplicate values you find.

#### There are no duplicates in the df_ords dataframe.

## 08. Address the duplicates using an appropriate method.
## 08a. In a markdown cell, explain why you used your method of choice.

#### There are no duplicates; therefore, I do nothing.

## 09. Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder and give them appropriate, succinct names.

In [143]:
# Exporting df_ords.
# index=False keeps the row index out of the file; otherwise it is written as
# an extra unnamed first column that shows up as 'Unnamed: 0' on re-import.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=False,
)

In [None]:
# df_prods is already exported with the name 'products_checked'