In [1]:
#Importing Pandas library for cleaning up raw data
import pandas as pd
import numpy as np

In [2]:
# Name of the input CSV file. It is expected to live in the same
# folder/directory as the Preprocessing Notebook.
raw_filename = "clean_data.csv"

In [3]:
# Load the full record-level dataset. The file is read as tab-separated
# (sep='\t') even though its name ends in .csv.
data = pd.read_csv(raw_filename, sep='\t')

In [4]:
# Build a separate dataframe for the aggregate (per-country) dataset:
# keep the first record found for each country, then drop the
# "biggest fear" question column from that frame.
aggregate_data = (
    data
    .drop_duplicates(subset='Country')
    .drop('What is your biggest fear as we move towards a more connected future?', axis=1)
)

In [5]:
# Distinct country names, in the order they occur in the aggregate frame
countryList = aggregate_data['Country'].unique()
countryList

array(['United States', 'Canada', 'United Kingdom', 'France', 'India',
       'Mexico', 'Australia', 'Germany', 'Switzerland', 'Brazil',
       'Argentina', 'Italy', 'Belgium', 'Austria', 'Venezuela', 'Colombia',
       'Ecuador', 'Chile', 'Spain'], dtype=object)

In [6]:
# Add a 'Count' column holding the number of records for each country.
countryCounts = data['Country'].value_counts()

# Look every country up in the counts table in a single vectorised step
# instead of assigning row by row. A country that is missing from the counts
# becomes NaN rather than raising, and the float cast keeps the column dtype
# identical to the former NaN-initialised column.
aggregate_data['Count'] = aggregate_data['Country'].map(countryCounts).astype(float)

aggregate_data


Unnamed: 0.1,Unnamed: 0,Country,I consider myself:,WiFi Router:Check all the internet connected devices you currently own:,Laptop computer:Check all the internet connected devices you currently own:,Smart phone:Check all the internet connected devices you currently own:,Smart TV:Check all the internet connected devices you currently own:,Activity Tracker (ex: Fitbit or Apple Watch):Check all the internet connected devices you currently own:,"Smarthome Hub (ex. Amazon Echo, Google Alexa):Check all the internet connected devices you currently own:",Car that connects to the internet:Check all the internet connected devices you currently own:,...,"Smart Appliance (ex. Coffeemaker, Refrigerator, Oven, Fridge):Check all the internet connected devices you currently own:",Smart Door Locks (ex. Door locks for your home you can open via bluetooth):Check all the internet connected devices you currently own:,"Smart Lighting (ex. Connected lighting switches, dimmers, or bulbs):Check all the internet connected devices you currently own:",Thinking about a future in which so much of your world is connected to the internet leaves you feeling:,What are you most excited about as we move toward a more digitally connected future?,Total Owned Devices,Country Average Devices,Country Average Tech Proficiency,Country Average Tech Sentiment,Count
0,0,United States,3,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,-2,None of the above,7.0,3.800052,2.605362,-0.071605,15404.0
4,4,Canada,3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,None of the above,3.0,3.51341,2.590333,-0.050987,3393.0
14,22,United Kingdom,4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,How much fun it will be,3.0,3.662828,2.641509,-0.094095,4081.0
16,24,France,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-2,None of the above,2.0,3.101538,2.59152,-0.251335,24907.0
21,35,India,3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2,How it will make us all smarter and better edu...,3.0,3.471441,2.754957,0.580645,3379.0
25,44,Mexico,3,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,How easy it will make life,4.0,3.469156,2.549964,0.353896,5544.0
37,70,Australia,4,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1,How easy it will make life,5.0,3.458269,2.56506,-0.020218,1929.0
55,113,Germany,3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1,How it will bring the world together,2.0,3.553127,2.607756,-0.105526,20630.0
88,192,Switzerland,4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1,How it will bring the world together,4.0,3.655559,2.636518,-0.081605,2941.0
93,204,Brazil,3,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1,How easy it will make life,4.0,3.543257,2.580833,0.520342,10299.0


In [7]:
from bokeh.io import output_notebook
# Direct Bokeh's output to the notebook so later show() calls render inline.
output_notebook()

In [8]:
### 1st plot: each country's average number of owned devices vs. its average sentiment about the future

In [9]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, TapTool
from bokeh.models import HoverTool
from bokeh.models import LinearInterpolator, CategoricalColorMapper
from bokeh.palettes import inferno
from bokeh.layouts import gridplot
from bokeh.models.callbacks import CustomJS
from ipywidgets import interact
from bokeh.io import push_notebook, show, output_notebook
from bokeh.embed import file_html
from bokeh.resources import CDN
from bokeh.models import Legend

# Marker-size scale: interpolate linearly between the smallest and the
# largest per-country record count.
# NOTE: size_mapper is built here but not applied to any glyph below; to scale
# the markers pass size={'field': 'Count', 'transform': size_mapper}.
record_counts = data['Country'].value_counts()
size_mapper = LinearInterpolator(
    x=[record_counts.min(), record_counts.max()],
    y=[5, 10],
)

# One colour per country, taken from a 20-step inferno palette
color_mapper = CategoricalColorMapper(factors=countryList, palette=inferno(20))

# Data source backing both country-level scatter plots
source = ColumnDataSource(aggregate_data)

# Tooltip showing the record count and the country name of the hovered point
hover = HoverTool(
    tooltips=[("records", "@Count"), ("country", "@Country")],
    show_arrow=False,
)

plot1 = figure(
    title="Average Owned Devices vs Sentiment of Future by Country",
    x_axis_label='Average Sentiment of Future Techologies \n ((Negative) -2 to 2 (Positive))',
    y_axis_label='Average Number of Owned Devices',
    tools=[hover, "tap"],
)
ro = plot1.circle(
    x='Country Average Tech Sentiment',
    y='Country Average Devices',
    color=dict(field='Country', transform=color_mapper),
    source=source,
)

### 2nd plot: each country's average technology knowledge vs. its average sentiment about the future

In [10]:
# Scatter of average tech proficiency against average sentiment, one point
# per country, sharing the data source, hover tool and colours of plot1.
plot2 = figure(
    title="Average Tech Knowledge vs Sentiment of Future by Country",
    x_axis_label='Average Sentiment of Future Techologies ((Negative) -2 to 2 (Positive))',
    y_axis_label='Average Technology Knowledge Proficiency',
    tools=[hover, "tap"],
)

# Marker size is left at the default (no size transform is applied).
plot2.circle(
    x='Country Average Tech Sentiment',
    y='Country Average Tech Proficiency',
    color=dict(field='Country', transform=color_mapper),
    source=source,
)

In [11]:
def _box_stats(grouped_values):
    """Compute per-group quartiles and whisker ends for one grouped column.

    Parameters
    ----------
    grouped_values : pandas SeriesGroupBy
        A single numeric column grouped by the category shown on the x axis.

    Returns
    -------
    dict of pandas.Series
        Keys 'q1', 'q2', 'q3', 'lower_fence', 'upper_fence', 'lower' and
        'upper'. Every series is indexed by the group keys, so they all share
        one ordering.
    """
    q1 = grouped_values.quantile(q=0.25)
    q2 = grouped_values.quantile(q=0.5)
    q3 = grouped_values.quantile(q=0.75)
    iqr = q3 - q1
    upper_fence = q3 + 1.5 * iqr
    lower_fence = q1 - 1.5 * iqr
    return {
        'q1': q1,
        'q2': q2,
        'q3': q3,
        'lower_fence': lower_fence,
        'upper_fence': upper_fence,
        # Stems are shortened so they never reach past the observed min/max.
        'lower': np.maximum(lower_fence, grouped_values.min()),
        'upper': np.minimum(upper_fence, grouped_values.max()),
    }


def _outlier_points(df, category_col, value_col, lower_fence, upper_fence):
    """Return (x, y) lists for every row of df lying outside its group's fences.

    Each row is compared with the fences of its own category. Categories
    without outliers simply contribute no points, so no per-category lookup
    can fail.
    """
    points = df[[category_col, value_col]].dropna()
    too_high = points[value_col] > points[category_col].map(upper_fence)
    too_low = points[value_col] < points[category_col].map(lower_fence)
    flagged = points[too_high | too_low]
    return flagged[category_col].tolist(), flagged[value_col].tolist()


def _build_box_figure(df, category_col, value_col, title, y_axis_label):
    """Build one Bokeh box plot of value_col for each category_col value in df."""
    # Only the plotted column is aggregated, so text columns in df are never
    # passed to quantile().
    stats = _box_stats(df.groupby(category_col)[value_col])

    # Take the x positions from the statistics' own index so that every glyph
    # pairs a category with the numbers computed for that same category.
    cats = list(stats['q1'].index)

    outx, outy = _outlier_points(
        df, category_col, value_col, stats['lower_fence'], stats['upper_fence']
    )

    box = figure(
        tools="save",
        title=title,
        x_axis_label='Average Sentiment of Future Techologies ((Negative) -2 to 2 (Positive))',
        y_axis_label=y_axis_label,
    )

    # stems
    box.segment(cats, stats['upper'], cats, stats['q3'], line_color="black")
    box.segment(cats, stats['lower'], cats, stats['q1'], line_color="black")

    # boxes (positional arguments are x, width, top, bottom)
    box.vbar(cats, 0.7, stats['q2'], stats['q3'], fill_color="#3B8686", line_color="black")
    box.vbar(cats, 0.7, stats['q1'], stats['q2'], fill_color="#3B8686", line_color="black")

    # whiskers (almost-0 height rects simpler than segments)
    box.rect(cats, stats['lower'], 0.2, 0.01, line_color="black")
    box.rect(cats, stats['upper'], 0.2, 0.01, line_color="black")

    # outliers
    if outx:
        box.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)

    box.xgrid.grid_line_color = None
    box.ygrid.grid_line_color = "white"
    box.grid.grid_line_width = 2
    box.xaxis.major_label_text_font_size = "12pt"
    return box


def generateBoxPlot(country):
    """Show the two country-level scatter plots next to two box plots for `country`.

    The box plots are built from the rows of the global `data` frame whose
    'Country' equals `country`. Both group those rows by the sentiment
    question; one shows 'Total Owned Devices', the other 'I consider myself:'.
    The result is laid out in a 2x2 grid together with the global `plot1` and
    `plot2` and displayed in the notebook.

    Parameters
    ----------
    country : str
        Country name to filter `data` on.

    Returns
    -------
    None
    """
    sentiment_col = 'Thinking about a future in which so much of your world is connected to the internet leaves you feeling:'
    devices_col = 'Total Owned Devices'
    proficiency_col = 'I consider myself:'

    df = data.loc[data['Country'] == country]

    boxplot = _build_box_figure(
        df, sentiment_col, devices_col,
        title=country + ": Distribution of # Owned Devices",
        y_axis_label='Average Number of ' + devices_col,
    )
    boxplot2 = _build_box_figure(
        df, sentiment_col, proficiency_col,
        title=country + ": Distribution of Tech Knowledge responses",
        y_axis_label='Average Number of Technology Proficiency',
    )

    plots = gridplot([plot1, boxplot, plot2, boxplot2], ncols=2, plot_width=425, plot_height=425)
    show(plots, notebook_handle=True)
    push_notebook()

In [12]:
# Build a country selector; picking an entry calls generateBoxPlot(country=...).
# Only keywords that match generateBoxPlot's parameters are passed, since
# interact maps keyword arguments onto the function's own parameters.
interact(generateBoxPlot, country=countryList)
# Disabled HTML export. NOTE: `plots` is local to generateBoxPlot, so the
# function would have to return it before this line could work.
#html = file_html(plots, CDN, "my plot")

A Jupyter Widget

<function __main__.generateBoxPlot>