In [1]:
import os, sys, inspect, random
import pandas as pd
import numpy as np
from datetime import date
from collections import OrderedDict
from operator import itemgetter
from jinja2 import Template
from IPython.display import HTML
from urllib.parse import urlparse
from urllib.request import urlopen

import scipy.stats as stats
from scipy.stats import ttest_ind, ttest_rel, ttest_1samp
from scipy.stats import chi2, chi2_contingency
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import boxcox, shapiro, gaussian_kde

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import boxcox, shapiro

import sqlite3
import psycopg2
from psycopg2 import Error

import param
# import hvplot
# import hvplot.pandas
# import holoviews as hv
# from holoviews import opts

import panel as pn
from io import StringIO
pn.extension()
from bokeh.io import show, curdoc
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models.filters import CustomJSFilter
from bokeh.models import Column, CDSView, CustomJS, CategoricalColorMapper, ColumnDataSource, HoverTool, Panel, MultiSelect
from bokeh.models.widgets import CheckboxGroup, CheckboxButtonGroup, Slider, RangeSlider, Tabs, TableColumn, DataTable
from bokeh.layouts import column, row, WidgetBox
from bokeh.palettes import Category10_10, Category20_16, Category20_20, Category20

import matplotlib
import matplotlib.pyplot as plt
from termcolor import colored
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from IPython.display import display, Markdown, HTML, clear_output, display_html
import ipywidgets as widgets
from qgrid import show_grid

display(HTML(data="""
<style>
    div#notebook-container     {width: 80%;}
    div#menubar-container      {width: 65%;}
    div#maintoolbar-container  {width: 60%;}
</style>
"""))

In [2]:
import src
from src.analysis.processing import Analysis

In [3]:
analysis = Analysis()

In [4]:
display(Markdown('<h2>Customer Segmentation Analysis</h2>'))
loading_section       = ["Customer Data", "Features Engineer"]
sections              = ["Customer Data", "Features Engineer", "EDA" , "Statistical Analysis"]
conclusion_section    = ["Interpretation", "Reccomendation"]

summary_sub_section   = ["Project Summary", "Data Summary"]
features_sub_section  = ["Features Created"]
eda_sub_section       = ["Univariate Analysis", "Multiavariate Analysis"]
statistic_sub_section = ["T-Test", "ANOVA", "Chi-Squared Test"]

accordions = OrderedDict()
accordions["** Loading **"] = widgets.Accordion(children=[widgets.Output() for section in loading_section])
[accordions["** Loading **"].set_title(i, section) for i, section in enumerate(loading_section)]

for section in sections:
    if section == "Customer Data":
        accordions[section] = widgets.Accordion(children=[widgets.Output() for sub_section in summary_sub_section])
        [accordions[section].set_title(i, sub_section) for i, sub_section in enumerate(summary_sub_section)]
    elif section == "Features Engineer":
        accordions[section] = widgets.Accordion(children=[widgets.Output() for sub_section in features_sub_section])
        [accordions[section].set_title(i, sub_section) for i, sub_section in enumerate(features_sub_section)]
    elif section == "EDA":
        accordions[section] = widgets.Accordion(children=[widgets.Output() for sub_section in eda_sub_section])
        [accordions[section].set_title(i, sub_section) for i, sub_section in enumerate(eda_sub_section)]
    else:
        accordions[section] = widgets.Accordion(children=[widgets.Output() for sub_section in statistic_sub_section])
        [accordions[section].set_title(i, sub_section) for i, sub_section in enumerate(statistic_sub_section)]
        
accordions["** Conclusion **"] = widgets.Accordion(children=[widgets.Output() for section in conclusion_section])
[accordions["** Conclusion **"].set_title(i, section) for i, section in enumerate(conclusion_section)]
        
widget_fields = widgets.Tab(children=[accordions[t] for t in accordions])
[widget_fields.set_title(i, sub) for i, sub in enumerate(accordions.keys())]

<h2>Customer Segmentation Analysis</h2>

[None, None, None, None, None, None]

In [5]:
widget_fields

Tab(children=(Accordion(children=(Output(), Output()), _titles={'0': 'Customer Data', '1': 'Features Engineer'…

In [6]:
%matplotlib agg
customer_section = "Customer Data"
fe_section = "Features Engineer"

with accordions["** Loading **"].children[sections.index(customer_section)]:
    clear_output()
    display(Markdown("<h2> Initiating data loading ... </h2>"))
    analysis.get_data()
    
    
with accordions["** Loading **"].children[sections.index(fe_section)]:
    clear_output()
    display(Markdown("<h2> Initiating feature engineering ... </h2>"))
    analysis.feature_engineering()
    
    
with accordions[customer_section].children[0]:
    clear_output()
    display(Markdown("<h2> Kaggle Challenge - Instacart Customer Analysis & Segmentation </h2>"))
    display(Markdown(r'''<p align="center">
                         <img width="1300" height="100" src="https://miro.medium.com/max/1160/1*yf7Bk7LpZCH5wcIGSxBqjA.png"></p>'''))
    
    
with accordions[customer_section].children[1]:
    clear_output()
#     display(Markdown(r'<h2> Data Provided </h2'))
#     sub_df = analysis.data['customer_data'].sample(10000)
#     @interact
#     def show_df(column='order_number', x=3):
#         return sub_df[sub_df[column]>=x]
    
    display(Markdown(r'<h2> Overview of Data Quantity </h2>'))
    display(analysis.grid_df_display([analysis.descriptive_data(analysis.data['customer_data'][analysis.vars(['Customer'])]),
                                     analysis.data_type_analysis(analysis.data['customer_data'])]))
    
    display(Markdown(r'<h2> Overview on Customer Orders Number and Product Ordered </h2>'))
    html = analysis.overview_template.render(S=analysis.data['sum_order_summary'], 
                                             C=analysis.data['count_order_summary'],
                                             M=analysis.data['mean_hour_order_summary'],
                                             O=analysis.data['overall_summary'], 
                                             table=analysis.data['df_table'], 
                                             np=np, 
                                             pd=pd, 
                                             enumerate=enumerate)
    display(HTML(f'<div>{html}</div>'))

In [7]:
section = "Features Engineer"
    
with accordions[section].children[0]:
    clear_output()
    display(Markdown(r'<h2> Overview of Features Generated </h2>'))
    display(Markdown(r'<h4> 1. <code>Number of Orders</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data["features"], 
                                    title='Number of Orders', 
                                    var='num_orders', 
                                    xlabel='num_orders', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 2. <code>Rate of Order in Peak Day</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data["features"], 
                                    title='Peak Day Rate', 
                                    var='peakday_rate', 
                                    xlabel='peakday_rate', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 3. <code>Median of Hour in Ordering</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data["features"], 
                                    title='Median Hour', 
                                    var='median_hour', 
                                    xlabel='median_hour', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 4. <code>Rate of Order in Peak Time</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data["features"], 
                                    title='Peak Time Rate', 
                                    var='peaktime_rate', 
                                    xlabel='peaktime_rate', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 5. <code>Mean of Lag Days After Orders</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data["features"], 
                                    title='Lag Days Mean', 
                                    var='mean_lag_days', 
                                    xlabel='mean_lag_days',
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 5. <code>Mean of Number of Products Per Order</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data["features"], 
                                    title='Mean of Num of Products', 
                                    var='mean_num_products', 
                                    xlabel='mean_num_products', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))

In [None]:
%matplotlib agg
section = "EDA"
    
with accordions[section].children[0]:
    clear_output()
    display(Markdown("<h2> Overview of Product Ordered </h2>"))
    display(Markdown(r'<h4> 1. <code>Top 10 Products Ordered</code> </h4>'))
    display(analysis.horizontal_bar_plot(df=analysis.data["product_count_df"][:10], 
                                         x_var='count', 
                                         y_var='product_name', 
                                         xlabel='Number of Products', 
                                         ylabel='Products', 
                                         title='Top 10 Products ordered'))
    
    display(Markdown(r'<h4> 2. <code>Products Groups Ordered</code> </h4>'))
    display(analysis.horizontal_bar_plot(df=analysis.data["prod_gp_count_df"], 
                                         x_var='count', 
                                         y_var='product_group', 
                                         xlabel='Number of Products Groups', 
                                         ylabel='Products Groups', 
                                         title='Top 10 Product Groups ordered'))
    
    display(Markdown(r'<h2> Understanding Customer Buying Behavior </h2>'))
    display(Markdown(r'<h4> 1. <code>Order Per User</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data['customer_data'], 
                                    title='Orders Per User', 
                                    var='user_id', 
                                    xlabel='Orders', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=True))
    
    display(Markdown(r'<h4> 2. <code>Orders Per Days of Week</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data['customer_data'], 
                                    title='Orders Per Days of Week', 
                                    var='order_day_of_week', 
                                    xlabel='Days of Week', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 3. <code>Orders Per Hour of Day</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data['customer_data'], 
                                    title='Orders Per Hour of Day', 
                                    var='order_hour_of_day', 
                                    xlabel='Hour (24 Hours)', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 4. <code>Days Since Last Order</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data['customer_data'], 
                                    title='Days Since Prior Order', 
                                    var='days_since_last_order', 
                                    xlabel='Days', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h4> 5. <code>Number of Products Per Order</code> </h4>'))
    display(analysis.histogram_plot(df=analysis.data['customer_data'], 
                                    title='Number of Products Per Order', 
                                    var='num_products',
                                    xlabel='Number of Products', 
                                    ylabel='Density', 
                                    bin_size=100, 
                                    count=False))
    
    display(Markdown(r'<h2> Patterns of Order Times In Customers </h2>'))
    display(Markdown(r'<h4> 1. <code>Order in Hours a Day</code> </h4>'))
    display(analysis.boxplot_plot(df=analysis.data['time_peak_df'], 
                                  x_var='count', 
                                  y_var='order_hour_of_day', 
                                  xlabel='Hour of Day', 
                                  title='Orders by Day and Time',
                                  day=True))
    
    display(Markdown(r'<h4> 2. <code>Size of Orders by Day of Week</code> </h4>'))
    display(analysis.boxplot_plot(df=analysis.data['day_peak_df'], 
                                  x_var='count', 
                                  y_var='order_day_of_week', 
                                  xlabel='Number of Product Ordered', 
                                  title='Size of Orders by Day of Week',
                                  day=False))
    
    display(Markdown(r'<h4> 3. <code>Size of Orders by Hours of Day</code> </h4>'))
    display(analysis.violin_plot_hours(df=analysis.data['customer_data'], 
                                       x_var='order_hour_of_day', 
                                       y_var='order_day_of_week', 
                                       xlabel='Hour of the Day', 
                                       title='Order of Hours on Each Day'))

with accordions[section].children[1]:
    clear_output()
    display(Markdown(r'<h2> Relationship Between Features </h2>'))
    

In [None]:
%matplotlib agg
section = "Statistical Analysis"

with accordions[section].children[0]:
    clear_output()
    display(Markdown(r'<h2> Relationship Between Samples </h2>'))
    display(Markdown(r'<h4> 1. <code>Do Peak Days Affect the Purchase Behabiors of Customer?</code> </h4>'))
    display(analysis.dist_plot_2_vars(df=analysis.data['day_peak_df'], 
                                      var="order_day_of_week", 
                                      title="Number of Orders for Each Product Between Peak & Peak-Off Days", 
                                      log=False, 
                                      label1="Peak Days", 
                                      label2="Off-Peak Days"))
    display(analysis.t_test(y1=analysis.data['customer_data'].loc[analysis.data['customer_data']['peak_day']==1], 
                            y2=analysis.data['customer_data'].loc[analysis.data['customer_data']['peak_day']==0], 
                            var='order_number', 
                            population=False, 
                            paired=False, 
                            alpha=0.05))
    
    
with accordions[section].children[1]:
    clear_output()
    display(Markdown(r'<h2> Relationship Between Group of Samples based on Variance </h2>'))
    display(Markdown(r'<h4> 1. <code>Do Orders Numbers Vary Following Days of Week?</code> </h4>'))
    display(analysis.box_plot(df=analysis.data['customer_data'], 
                              xVar="order_day_of_week", 
                              yVar="order_number"))
    display(analysis.anova_test(df=analysis.data['customer_data'],
                                var1='order_number',
                                cat_var1='order_day_of_week',
                                cat_var2=None,
                                two_way=False,
                                alpha=0.05))
    
    display(Markdown(r'<h4> 2. <code>Do Orders Numbers Vary Following Days of Week and Hours of Days?</code> </h4>'))
    display(analysis.box_plot(df=analysis.data['customer_data'], 
                              xVar="order_day_of_week", 
                              yVar="order_number"))
    display(analysis.box_plot(df=analysis.data['customer_data'], 
                              xVar="order_hour_of_day", 
                              yVar="order_number"))
    display(analysis.anova_test(df=analysis.data['customer_data'],
                                var1='order_number',
                                cat_var1='order_day_of_week',
                                cat_var2='order_hour_of_day',
                                two_way=True,
                                alpha=0.05))
    

with accordions[section].children[2]:
    clear_output()
    display(Markdown(r'<h2> Relationship Between Categorical Samples </h2>'))
    display(Markdown(r'<h4> 2. <code>Do Day of Week Affect the Product Groups to be Bought?</code> </h4>'))
    display(analysis.horizontal_stack_plot(analysis.data['customer_data'], 'order_day_of_week', 'product_group'))
    display(analysis.chi_squared_test(df=analysis.data['customer_data'], 
                                      var1='order_day_of_week', 
                                      var2='product_group',
                                      alpha=0.05))