##  Exploring the data using graphs 
Before making predictions, the data must first be explored to understand what the dataset contains.


In [None]:
#loading the necessary packages 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns
%matplotlib inline 
from matplotlib import rc
import tikzplotlib

### Reading the data to pandas dataframe from .csv files

In [None]:
# Load the Instacart CSV exports into pandas DataFrames.
# The names on the left are bound, in order, to the files listed below.
products, orders, order_products_prior, departments, aisles = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../instacart/products.csv",
        "../instacart/orders.csv",
        "../instacart/order_products__prior.csv",
        "../instacart/departments.csv",
        "../instacart/aisles.csv",
    )
)

In [None]:
# Keep only the rows of `orders` whose eval_set is 'prior'.
orders_prior = orders.loc[orders['eval_set'].eq('prior')]

### Merging the data into one pandas dataframe

In [None]:
# Build one wide table by inner-joining the purchased product lines with the
# product, department, aisle and order tables. The name `merge_data` is rebound
# at every step so the intermediate frames can be garbage-collected.
merge_data = pd.merge(products, order_products_prior, how='inner', on='product_id')
merge_data = pd.merge(departments, merge_data, how='inner', on='department_id')
merge_data = pd.merge(aisles, merge_data, how='inner', on='aisle_id')
merge_data = pd.merge(orders, merge_data, how='inner', on='order_id')

In [None]:
#merge_data = merge_data.drop(['eval_set','order_dow','order_hour_of_day'],axis = 1)
#merge_data.head()

In [None]:
# Copy of merge_data without the order-level / user-level columns listed below.
grouped_add_to_cart_order = merge_data.drop(
    columns=[
        'order_id',
        'user_id',
        'eval_set',
        'order_number',
        'order_dow',
        'order_hour_of_day',
        'days_since_prior_order',
    ]
)

In [None]:
# Plot how many customers reached each maximum order number.
# x-axis: highest order_number seen for a customer;
# y-axis: number of customers with that highest order_number.

# sns.set_palette() returns None, so build the palette first, keep it in
# `colors_`, and register it as the default palette exactly once.
colors_ = sns.color_palette("RdBu_r")
sns.set_palette(colors_)

number_of_orders_per_customer = orders.groupby("user_id")["order_number"].max().reset_index()
cnt_srs = number_of_orders_per_customer.order_number.value_counts()

# Fix the bar order explicitly (ascending order number) so that the bar at
# position i is guaranteed to represent order_numbers[i].
order_numbers = sorted(cnt_srs.index)

plt.figure(figsize=(12,8))
# x and y must be passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, order=order_numbers, alpha=0.9)

# Label every 10th bar with the order number it actually shows. Ticks sit at
# bar positions (0, 10, 20, ...), which are not the order numbers themselves.
tick_positions = np.arange(0, len(order_numbers), 10)
plt.xticks(ticks=tick_positions, labels=order_numbers[::10], rotation='vertical')
plt.ylabel('Number of Customers ', fontsize=11)
plt.xlabel('Maximum order number', fontsize=11)

plt.tight_layout()
# Optional exports (uncomment to write the figure to disk):
#tikzplotlib.save("graphs_chapter4/num_orders_per_customer.tex", axis_width = '15cm',axis_height = '9cm' )
#plt.savefig('hoeveelheid_orders_per_customer.png')


In [None]:
# Plot the number of orders per day of the week.
# x-axis: day of week; y-axis: number of orders placed on that day.

number_of_orders_per_day_of_week = orders['order_dow'].value_counts()
cnt_srs = number_of_orders_per_day_of_week

plt.figure(figsize=(12,8))
# The order_dow values are shifted by one for display.
# NOTE(review): presumably order_dow is 0-based and this makes it 1-based - confirm.
# x and y must be passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.barplot(x=cnt_srs.index + 1, y=cnt_srs.values, alpha=0.8)
plt.ylabel('Number of Orders ', fontsize=11)
plt.xlabel('Day of week', fontsize=11)
plt.xticks(rotation='vertical')
plt.show()
#plt.savefig('order_dow_plot.png')

In [None]:
# Plot the number of orders per hour of the day.
# x-axis: order_hour_of_day; y-axis: number of orders placed in that hour.

number_of_orders_per_hour_of_day = orders['order_hour_of_day'].value_counts()
# Legacy alias: this cell used to store the hour-of-day counts under the
# day-of-week name. The binding is kept so that code relying on it still runs.
number_of_orders_per_day_of_week = number_of_orders_per_hour_of_day
cnt_srs = number_of_orders_per_hour_of_day

plt.figure(figsize=(12,8))
# x and y must be passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8)
plt.ylabel('Number of Orders ', fontsize=11)
plt.xlabel('Hour of the day', fontsize=11)
plt.xticks(rotation='vertical')
plt.tight_layout()
#plt.savefig('order_hod.png')

In [None]:
# Plot how often each "days since prior order" value occurs among the orders
# of the 'prior' evaluation set (orders_prior).
# x-axis: days since the previous order; y-axis: number of orders.

days_since_prior_order_freq = orders_prior['days_since_prior_order'].value_counts()
cnt_srs = days_since_prior_order_freq

plt.figure(figsize=(12,8))
# x and y must be passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8)
plt.ylabel('Frequency ', fontsize=12)
plt.xlabel('Days since the previous order was made', fontsize=12)
plt.xticks(rotation='vertical')
plt.tight_layout()
#plt.savefig('days_since_prior_order.png')

In [None]:
# Plot how often products from each department were purchased.
# x-axis: department name; y-axis: number of purchased product lines.

department_freq = merge_data['department'].value_counts()
# Not used for the plot any more; kept because it is a notebook-level name
# that other cells may read.
department_id_freq = merge_data['department_id'].value_counts()

cnt_srs = department_freq

plt.figure(figsize=(12,8))
# Take labels and bar heights from the same Series so they cannot get out of
# step. The heights used to come from department_id_freq, which only lines up
# with the department names if ids and names map one-to-one and ties happen to
# sort identically.
# x and y must be passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8)
plt.ylabel('Frequency ', fontsize=12)
plt.xlabel('Department', fontsize=12, labelpad=20)
plt.xticks(rotation='vertical')

plt.tight_layout()
# Optional exports (uncomment to write the figure to disk):
#tikzplotlib.save("departments.tex", axis_width = '15cm',axis_height = '6cm' )
#plt.savefig("departments.png")

In [None]:
# Prepare the data for a "time of day vs. department" view:
# products joined with their department, orders joined with their purchased
# product lines, and finally both joined on product_id.
dep_prod = pd.merge(products, departments, how='inner', on='department_id')
order_order_prod = pd.merge(orders, order_products_prior, how='inner', on='order_id')
order_dep_prod = pd.merge(dep_prod, order_order_prod, how='inner', on='product_id')

# Trimmed copy without the columns listed below.
order_dep_prod_cleaned = order_dep_prod.drop(
    columns=[
        'days_since_prior_order',
        'add_to_cart_order',
        'reordered',
        'aisle_id',
        'product_id',
        'product_name',
        'order_id',
        'user_id',
        'eval_set',
    ]
)

In [None]:
#import seaborn as sns

In [None]:
# Plot the distribution of basket sizes.
# The size of an order is taken as its highest add_to_cart_order value.
# x-axis: number of products in an order; y-axis: number of such orders.
num_prods = order_dep_prod.groupby("order_id")["add_to_cart_order"].max().reset_index()
cnt_srs = num_prods.add_to_cart_order.value_counts()

# value_counts() sorts by frequency (descending), so this slice keeps the
# max_bars most common basket sizes and drops the tail of rarer ones.
max_bars = 54
most_common_sizes = cnt_srs.iloc[:max_bars]

plt.figure(figsize=(12,8))
# x and y must be passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.barplot(x=most_common_sizes.index, y=most_common_sizes.values, alpha=0.8)
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('Number of products in each order', fontsize=12)
plt.xticks(rotation='vertical')
plt.savefig('products_per_order.png')