In [99]:
import math
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import widgetbox, column, row
from bokeh.models import Select, CustomJS, ColumnDataSource, HoverTool
from bokeh.models.widgets import Select, CheckboxGroup

In [100]:
# Configure Bokeh so that subsequent show() calls render inline in this notebook.
output_notebook()

First, I want to explore all the data and look for correlations between each pair of features.
I only want to keep the **numeric** data, so I inspected the data and removed every obviously non-numeric column.
Besides, I removed the features with **too many zeros** (20 or more), since I think they are useless.
Columns whose names contain "\_" are also removed, because they tend to be components of a main feature, for example something like "total_gram_sugar" and "total_calory_sugar".

In [133]:
# Load the raw CSV and encode the two-valued text answers as 1/0 so those
# columns can take part in the numeric analysis below.
df = pd.read_csv("nutrition_raw_anonymized_data.csv")
df = df.replace(to_replace=["Yes","Innie"], value=1)
df = df.replace(to_replace=["No","Outie"], value=0)

# Candidate columns: skip the first column, then keep everything from
# "GROUP_SOLID_COUNT" onwards.
col_name_0 = list(df)[1:]
col_name_0 = col_name_0[col_name_0.index("GROUP_SOLID_COUNT"):]
N0 = len(col_name_0)

# Keep a candidate only if its name has no underscore and the column holds
# fewer than 20 zero entries. The cheap name test runs first so the column is
# only scanned when necessary.
col_name = [
    candidate for candidate in col_name_0
    if '_' not in candidate and (df[candidate] == 0).sum() < 20
]

# Function-call form of print works on both Python 2 and Python 3.
print(len(col_name))

120


After that, 120 features are left. That's a much smaller range.
Then I calculated the **correlation coefficient** between each pair of features.
When the correlation coefficient is larger than 0.9, it is reasonable to assume that the two variables are highly related.

In [102]:
# Build the upper triangle (diagonal included) of the Pearson correlation
# matrix of the selected columns. Entries below the diagonal stay 0, and a
# NaN coefficient is stored as 0 as well.
N = len(col_name)
cor = [[0] * N for row_idx in range(N)]
column_values = [df[feature].tolist() for feature in col_name]
for i in range(N):
    for j in range(i, N):
        c, p = stats.pearsonr(column_values[i], column_values[j])
        cor[i][j] = 0 if math.isnan(c) else c

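Then I prepare the data for plotting.
One data source holds the scatter points of all highly correlated pairs, and the other holds the raw values and the linear-regression predictions for a subset of those pairs.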

In [130]:
# Scatter data: one entry per feature pair (i < j) whose correlation exceeds 0.90.
x, y, z = [], [], []
x_name, y_name = [], []
# Regression data: only pairs that are more than 50 columns apart and whose
# two names are both shorter than 15 characters.
name_list, x_list, y_list, pred_list = [], [], [], []
k = 0
for i in range(N):
    # Starting at i + 1 leaves out the diagonal (a feature paired with itself).
    for j in range(i + 1, N):
        if not cor[i][j] > 0.90:
            continue
        first, second = col_name[i], col_name[j]
        x.append(i)
        y.append(j)
        z.append(cor[i][j])
        x_name.append(first)
        y_name.append(second)
        if j - i > 50 and len(first) < 15 and len(second) < 15:
            name_list.append(first + ' & ' + second)
            x_list.append(df[first].tolist())
            y_list.append(df[second].tolist())
            k += 1

# Fit y ~ x for every retained pair and store the fitted values as a flat list.
for xs, ys in zip(x_list, y_list):
    features = np.array(xs).reshape(-1, 1)
    r = LinearRegression()
    r.fit(features, np.array(ys).reshape(-1, 1))
    pred = r.predict(features)
    pred_list.append(pred.ravel().tolist())

# source2 stores the per-pair lists as columns next to the currently plotted
# values, so the per-pair lists are padded with empty placeholders until they
# are as long as one data column.
n_missing = len(x_list[0]) - len(x_list)
for padded in (x_list, y_list, pred_list):
    padded.extend([] for unused in range(n_missing))
name_list.extend([''] * n_missing)

source1 = ColumnDataSource(data={
    "x": x,
    "y": y,
    "z": z,
    "x_n": x_name,
    "y_n": y_name,
})
source2 = ColumnDataSource(data={
    "x": x_list[0],
    "y": y_list[0],
    "p": pred_list[0],
    "x0": x_list,
    "y0": y_list,
    "p0": pred_list,
    "n": name_list,
})


All feature pairs with a correlation coefficient above 0.9 are plotted on the figure below.
With the hover tool, it is easy to read off the feature names and the correlation coefficient of each point.

In [131]:
# Tooltip for each marker: the two feature names and their correlation.
hover = HoverTool(tooltips=[('V1', '@x_n'), ('V2', '@y_n'), ('Cor', '@z')])
p = figure(plot_width=800, plot_height=800, tools=[hover])
# Draw every pair twice: as (x, y) in red and mirrored as (y, x) in blue.
for x_field, y_field, marker_color in [('x', 'y', 'red'), ('y', 'x', 'blue')]:
    p.circle(x_field, y_field, source=source1, size=5, color=marker_color, alpha=0.5)
show(p)

Pick 15 pairs from all the feature pairs with coefficients above 0.9.
Plot the data points and draw the linear-regression line.
We can see from the figure that, in each pair, the two features are positively correlated with each other.

In [132]:
# Scatter of the currently selected feature pair plus its fitted regression
# line. Both glyphs read the 'x', 'y' and 'p' columns of source2, which the
# callback below overwrites whenever a different pair is chosen.
p2 = figure()
p2.circle('x','y',source=source2,color='red')
p2.line('x','p',source=source2,color='blue')

# Browser-side callback: find the chosen pair by name in the 'n' column and
# copy that pair's raw values ('x0', 'y0') and predictions ('p0') into the
# plotted columns.
callback = CustomJS(args=dict(source=source2), code="""
    var name = cb_obj.value;
    var data = source.data;
    var idx = data['n'].indexOf(name);
    // Unknown name: keep the current plot instead of assigning undefined columns.
    if (idx < 0) {
        return;
    }
    data['x'] = data['x0'][idx];
    data['y'] = data['y0'][idx];
    data['p'] = data['p0'][idx];
    source.change.emit();
""")

# Offer at most the first 15 pairs and leave out the '' placeholder names.
pair_options = [pair_name for pair_name in name_list[0:15] if pair_name]
# The initial selection is the first pair, matching the values source2 starts
# with, so the dropdown label and the plotted data agree.
select = Select(title="X & Y:", value=name_list[0], options=pair_options)
select.js_on_change('value', callback)
show(column(select,p2))