# DATA VISUALIZATIONS WITH PYTHON (part 1)

### This script contains the following:
#### 1. Importing data and libraries
#### 2. Checking and wrangling data
#### 3. Data quality and consistency checks
#### 4. Combining data sets
#### 5. Exporting data sets

# --------------------------------------------------------------------------------------------------------------

## 1. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### 1.1. Importing data

In [2]:
# We store the project's base folder once; every file we read or write below
# is located by joining sub-folders onto this path.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.

path = r'C:\Users\javis\OneDrive\Documentos\Career Foundry\2. Data Immersion\4. Python\Instacart Basket Analysis'

In [5]:
# We load the raw customers file from the original-data folder.
# index_col=False tells pandas not to use the first column as the index.

df_custom = pd.read_csv(
    os.path.join(path, '2. Data', '2.1. Original Data', 'customers.csv'),
    index_col=False,
)

## 2. Checking and wrangling data

In [6]:
# We check the dimensions of the customers dataframe as a (rows, columns) tuple

df_custom.shape

(206209, 10)

In [7]:
# Check the data type pandas assigned to each column

df_custom.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [8]:
# Preview the first rows to see how the data looks

df_custom.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [9]:
# user_id is an identifier we will not run any analysis on,
# so we store it as a string instead of a number

df_custom['user_id'] = df_custom['user_id'].astype(str)

In [10]:
# We rename the columns so that every name is lower case with underscores;
# this also corrects the 'Surnam' spelling and shortens 'n_dependants'

df_custom = df_custom.rename(
    columns={
        'First Name': 'first_name',
        'Surnam': 'surname',
        'Gender': 'gender',
        'STATE': 'state',
        'Age': 'age',
        'n_dependants': 'dependants',
    }
)

In [11]:
# We count the missing values in each column

df_custom.isnull().sum()

user_id            0
first_name     11259
surname            0
gender             0
state              0
age                0
date_joined        0
dependants         0
fam_status         0
income             0
dtype: int64

In [12]:
# To avoid errors later, we are going to drop the name column.
# This call only previews the result: drop() returns a new dataframe and it is
# not assigned here, so df_custom itself is left unchanged by this cell.

df_custom.drop(columns = ['first_name'])

Unnamed: 0,user_id,surname,gender,state,age,date_joined,dependants,fam_status,income
0,26711,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...
206204,168073,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rollins,Female,California,27,4/1/2020,1,married,99799


In [13]:
# We make the drop permanent by assigning the result back to df_custom

df_custom = df_custom.drop(columns='first_name')

## 3. Data quality and consistency checks

In [14]:
# We collect the fully duplicated rows (every column equal to an earlier row)
# into their own dataframe so they can be inspected

df_custom_dups = df_custom.loc[df_custom.duplicated()]

In [15]:
# We drop the duplicated rows, keeping the first occurrence of each

df_custom = df_custom.drop_duplicates(keep='first')

In [16]:
# We check for mixed data types: a column is printed when its values do not all
# share the same Python type.
# Series.map is used because DataFrame.applymap is deprecated since pandas 2.1;
# counting the distinct types also works on an empty dataframe, where looking
# up the first row would raise an error.
for col in df_custom.columns:
    if df_custom[col].map(type).nunique() > 1:
        print(col)

In [17]:
# Let's check the dimensions again to see if anything changed

df_custom.shape

(206209, 9)

In [18]:
# Check the appearance of the dataframe after the cleaning steps

df_custom.head()

Unnamed: 0,user_id,surname,gender,state,age,date_joined,dependants,fam_status,income
0,26711,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### 3.1. Exporting file 

In [22]:
# We save the checked customers dataframe to the prepared-data folder as a pickle

df_custom.to_pickle(
    os.path.join(path, '2. Data', '2.2. Prepared Data', 'customers_checked.pkl')
)

## 4. Combining data sets

In [20]:
# To combine the data sets, we first have to load the other one:
# the merged orders/products dataframe stored in the prepared-data folder

df_ords_prods = pd.read_pickle(
    os.path.join(path, '2. Data', '2.2. Prepared Data', 'orders_products_merged.pkl')
)

In [24]:
# Check the data type of 'user_id' in the customers dataframe

df_custom ['user_id'].dtype

dtype('O')

In [25]:
# Check the data type of 'user_id' in the orders/products dataframe

df_ords_prods ['user_id'].dtype

dtype('int64')

In [26]:
# Both dataframes must hold 'user_id' with the same data type before they can
# be combined on it, so we convert the orders/products column to string too

df_ords_prods['user_id'] = df_ords_prods['user_id'].astype(str)

In [27]:
# We merge the customer information onto the orders/products rows using
# 'user_id' as the key; with an inner join only user_ids present in both
# dataframes are kept

df_cust_ord_prods = df_ords_prods.merge(
    right=df_custom,
    how='inner',
    on='user_id',
)

## 5. Exporting data sets

In [28]:
# We save the combined dataframe to the prepared-data folder as a pickle file

df_cust_ord_prods.to_pickle(
    os.path.join(path, '2. Data', '2.2. Prepared Data', 'cust_orders_products_combined.pkl')
)