In [14]:
import pandas as pd
import numpy as np
import datetime

### Load the NCC Data

In [15]:
# Read the tab-separated NCC nutrient table into a DataFrame.
ncc = pd.read_csv(
    '../data/NCC_2018_nutrients_per_100g_originalcolnames.txt',
    sep='\t',
)

Set Food ID as the index

In [16]:
# 'Food ID' is about to become the index, so check that it has no duplicates.
ncc.loc[:, 'Food ID'].is_unique

True

In [17]:
# Move 'Food ID' out of the columns and into the index.
# Re-running this cell alone fails, because the column no longer exists afterwards.
ncc = ncc.set_index(keys='Food ID')

### Load the data from the ASA24 Recalls

In [18]:
# Read the comma-separated ASA24 recall data into a DataFrame.
asa_24 = pd.read_csv(
    '../data/training_for_GS_122118.csv',
)

Set FoodCode as the index 

In [19]:
# Move 'FoodCode' out of the columns and into the index.
# NOTE(review): unlike 'Food ID' for NCC, uniqueness of FoodCode is not checked here.
asa_24 = asa_24.set_index(keys='FoodCode')

### Load the file listing matching columns between the datasets

In [20]:
# Tab-separated table pairing each NCC column name with its FNDDS counterpart.
matching = pd.read_csv(
    '../data/matching_ncc_fndds_columns.txt',
    sep='\t',
)

Get the list of columns for each dataset

In [21]:
# Column names to compare, in the row order of the matching file, so that
# position k in one list corresponds to position k in the other.
ncc_cols = matching['NCC.Term'].tolist()
# The FNDDS terms are used as ASA24 column names once their spaces are removed.
asa_24_cols = [term.replace(" ","") for term in matching['FNDDS.Term'].tolist()]

### Calculate the pairwise correlations
Define a function to calculate the matrix of pairwise Pearson correlation coefficients (PCC) between the rows of two matrices A and B

In [22]:
def row_corr(A,B):
    """Pearson correlation between every row of A and every row of B.

    Parameters
    ----------
    A : array-like, shape (n_a, N)
    B : array-like, shape (n_b, N)
        Both inputs are converted to float arrays and must have the same
        number of columns N.

    Returns
    -------
    numpy.ndarray, shape (n_a, n_b)
        Entry [i, j] is the PCC of A[i] with B[j], clipped to [-1, 1].
        The entry is NaN when either row is constant (zero variance, so the
        correlation is undefined), when either row contains NaN, or when
        N == 0.

    Raises
    ------
    ValueError
        If A or B is not 2-D, or their column counts differ.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    if A.ndim != 2 or B.ndim != 2:
        raise ValueError("A and B must both be 2-D arrays")
    if A.shape[1] != B.shape[1]:
        raise ValueError("A and B must have the same number of columns")
    if A.shape[1] == 0:
        return np.full((A.shape[0], B.shape[0]), np.nan)

    # Centre each row before forming products. The one-pass form
    # N*sum(x**2) - sum(x)**2 subtracts two nearly equal large numbers and
    # loses precision when values are large compared with their spread.
    A_c = A - A.mean(axis=1, keepdims=True)
    B_c = B - B.mean(axis=1, keepdims=True)

    # Numerator: sum over columns of the centred products, for every row pair.
    cov = np.einsum('ik,jk->ij', A_c, B_c)

    # Denominator: product of the centred row norms, for every row pair.
    norm_a = np.sqrt((A_c ** 2).sum(axis=1))
    norm_b = np.sqrt((B_c ** 2).sum(axis=1))
    denom = np.outer(norm_a, norm_b)

    # Detect constant rows exactly (max == min) rather than through the
    # rounded norms, which can come out as tiny non-zero values.
    const_a = A.max(axis=1) == A.min(axis=1)
    const_b = B.max(axis=1) == B.min(axis=1)
    undefined = const_a[:, None] | const_b[None, :]

    # Divisions by zero are expected for constant rows; those entries are
    # overwritten with NaN just below, so the warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        pcorr = cov / denom
    pcorr[undefined] = np.nan

    # Rounding can push a perfect correlation marginally outside [-1, 1].
    return np.clip(pcorr, -1.0, 1.0)

Get the columns provided by the `matching` file

In [10]:
# Pull the matched nutrient columns out as plain arrays: one row per food,
# with the columns of A (ASA24) and B (NCC) in the same order.
A = np.asarray(asa_24.loc[:, asa_24_cols])
B = np.asarray(ncc.loc[:, ncc_cols])

In [11]:
# corr[i, j] is the PCC between ASA24 row i and NCC row j.
corr = row_corr(A=A, B=B)

  app.launch_new_instance()
  app.launch_new_instance()


### Calculate weighted pairwise correlations

Load the weights from the Lasso model

In [23]:
# Read the fitted Lasso coefficients; the 'coef' column supplies one weight per nutrient.
lasso_coef = pd.read_csv('lasso_coef.csv')
# Drop the final entry ('year'), selected by position.
# NOTE(review): this assumes 'year' is the last row and that the remaining rows
# follow the same order as the matching file -- confirm.
# NOTE(review): Lasso coefficients can be zero or negative; row_corr_weighted
# divides them by their sum, so verify they are suitable as weights.
weights = lasso_coef['coef'].values[:-1]

Define a function to calculate weighted row-wise PCC

In [24]:
def row_corr_weighted(A,B,weights):
    """Weighted Pearson correlation between every row of A and every row of B.

    Converted to Python from:
    https://stackoverflow.com/questions/9460664/weighted-pearsons-correlation

    Parameters
    ----------
    A : array-like, shape (n_a, N)
    B : array-like, shape (n_b, N)
    weights : array-like with N elements
        One weight per column. Any array-like is accepted (list, tuple,
        ndarray, Series). The weights are normalised to sum to 1 before use.
        They are expected to be non-negative: negative weights can make a
        weighted variance negative, which gives NaN or meaningless entries.

    Returns
    -------
    numpy.ndarray, shape (n_a, n_b)
        Entry [i, j] is the weighted PCC of A[i] with B[j]. Entries are
        NaN/inf where a weighted variance is zero.

    Raises
    ------
    ValueError
        If A or B is not 2-D, the column counts differ, the number of weights
        does not match the number of columns, or the weights sum to zero.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    w = np.asarray(weights, dtype=float).ravel()

    if A.ndim != 2 or B.ndim != 2:
        raise ValueError("A and B must both be 2-D arrays")
    if A.shape[1] != B.shape[1]:
        raise ValueError("A and B must have the same number of columns")
    if w.shape[0] != A.shape[1]:
        raise ValueError("weights must have one entry per column of A and B")

    total = w.sum()
    if total == 0:
        # Normalising would divide by zero and silently fill the result with inf/NaN.
        raise ValueError("weights must not sum to zero")
    w = w / total

    # Subtract each row's weighted mean.
    A_c = A - (A * w).sum(axis=1, keepdims=True)
    B_c = B - (B * w).sum(axis=1, keepdims=True)

    # Weighted covariance for every row pair, and weighted variance per row.
    cov = np.matmul(A_c, (B_c * w).T)
    var_a = ((A_c ** 2) * w).sum(axis=1)
    var_b = ((B_c ** 2) * w).sum(axis=1)

    # Zero-variance rows divide by zero; the resulting NaN/inf is the
    # documented outcome, so the warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        pcorr = cov / np.sqrt(np.outer(var_a, var_b))
    return pcorr

In [227]:
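# NOTE: unused draft of an alternative weighted PCC, kept for reference only.
# It is never called in this notebook; `row_corr_weighted` above is the version used.
# def row_corr_weighted_X(A,B,weights):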
    
#     weights = weights /sum(weights)

#     #number of columns in A or B
#     N = B.shape[1]

#     # sum of weights
#     Uw = sum(weights)

#     # row-wise weighted sums
#     Uwx = np.dot(A,weights)
#     Uwy = np.dot(B,weights)

#     # row-wise sums
#     Ux = A.sum(1)
#     Uy = B.sum(1)

#     # row-wise weighted sums of squares
#     Vwx = np.dot((A**2),weights)
#     Vwy = np.dot((B**2),weights)

#     # Compute the four terms in pcc matrix-wise
#     Exiyi = np.einsum('ik,jk->ij',A,B)
#     p3 = Uw*Vwy - (Uwy**2)
#     p4 = Uw*Vwx - (Uwx**2)

#     pcorr = (Uw**2)*Exiyi + N*(Uwy*Uwx[:,None])/(N*np.sqrt(p3*p4[:,None]))

# #     pcorr = (Uw**2)*Exiyi - Uw*((Uwx[:,None]*Uy[:,None].T) + Ux[:,None]*Uwy[:,None].T) + N*(Uwy*Uwx[:,None])/(N*np.sqrt(p3*p4[:,None]))
# #     pcorr = (Uw**2)*Exiyi - Uw*((Uwx[:,None]*Uy) + Uwy*Ux[:,None]) + N*(Uwy*Uwx[:,None])/np.sqrt(p3*p4[:,None])
#     return pcorr

In [25]:
# corr_weighted[i, j] is the weighted PCC between ASA24 row i and NCC row j.
corr_weighted = row_corr_weighted(A, B, weights=weights)

  
  


### Get results (unweighted)

Get indices of the top matches from the correlation matrix

In [12]:
TOP = 5 # set the number of matches to return

# Rank the NCC candidates for every ASA24 row once, highest PCC first
# (argsort is ascending, hence the negation; NaN entries sort last).
indices_top = np.argsort(-corr,axis=1)[:,:TOP]

# Read the scores back through that same ordering instead of sorting the whole
# matrix a second time, so each score belongs to the index next to it.
# pcc_top stays negated, as before: the cell that fills the results table flips the sign back.
pcc_top = (-corr)[np.arange(corr.shape[0])[:,None], indices_top]

Construct dataframe to store top results

In [13]:
# Index the results by (ASA24 row position, rank of the match): TOP rows per ASA24 row.
iters = [list(range(asa_24.shape[0])), list(range(TOP))]
mi = pd.MultiIndex.from_product(iters, names=['asa_index', 'match_index'])

# Column layout: ASA24 columns on the left, the score in the middle, NCC columns on the right.
# NOTE(review): FoodCode and Food ID were moved into the indexes earlier, so neither
# identifier appears among these columns -- confirm that is intended.
results_cols = asa_24.columns.tolist() + ['PCC'] + ncc.columns.tolist()

# Empty frame, filled in by the next cell.
results_top = pd.DataFrame(index=mi, columns=results_cols)

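Copy the ASA24 values to the left side (on the first match row of each group only), then fill in the matched NCC rows and their PCC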

In [14]:
# The ASA24 record is written only on the best-ranked row (match_index == 0) of each group;
# the remaining rows of the group keep empty ASA24 cells.
results_top.loc[results_top.index.get_level_values('match_index') == 0, asa_24.columns] = asa_24.values

# Row-major flattening of the (n_rows, TOP) index array lines up with the
# (asa_index, match_index) product order of the results index.
results_top.loc[:, ncc.columns] = ncc.iloc[indices_top.ravel()].values

# pcc_top holds negated correlations (a by-product of the descending sort), so flip the sign back.
results_top.loc[:, 'PCC'] = np.negative(pcc_top).ravel()

### Save results to CSV

In [24]:
# Write the full unweighted results table as tab-separated text under ../data/;
# the file name records how many matches per food were kept.
name = 'pcc_matching_results_top_{}.tsv'.format(TOP)
path = ''.join(('../data/', name))
results_top.to_csv(path_or_buf=path, sep='\t')

In [19]:
# Keep only the descriptive columns and the score, for easier reading of the matches.
results_top_desc_only = results_top.loc[
    :,
    ['Food_Description','PCC','Keylist','Food Description','Short Food Description','Food Type','Lactose (g)'],
]

In [25]:
# Write the description-only view of the unweighted results as tab-separated text.
name = 'pcc_matching_results_top_{}_desc_only.tsv'.format(TOP)
path = ''.join(('../data/', name))
results_top_desc_only.to_csv(path_or_buf=path, sep='\t')

### Get results (weighted)

Get indices of the top matches from the correlation matrix

In [26]:
TOP = 5 # set the number of matches to return

# Rank the NCC candidates for every ASA24 row once, highest weighted PCC first
# (argsort is ascending, hence the negation; NaN entries sort last).
indices_top = np.argsort(-corr_weighted,axis=1)[:,:TOP]

# Read the scores back through that same ordering instead of sorting the whole
# matrix a second time, so each score belongs to the index next to it.
# pcc_top stays negated, as before: the cell that fills the results table flips the sign back.
pcc_top = (-corr_weighted)[np.arange(corr_weighted.shape[0])[:,None], indices_top]

Construct dataframe to store top results

In [35]:
# Index the results by (ASA24 row position, rank of the match): TOP rows per ASA24 row.
iters = [list(range(asa_24.shape[0])), list(range(TOP))]
mi = pd.MultiIndex.from_product(iters, names=['asa_index', 'match_index'])

# Column layout: ASA24 columns on the left, the weighted score in the middle, NCC columns on the right.
# NOTE(review): FoodCode and Food ID were moved into the indexes earlier, so neither
# identifier appears among these columns -- confirm that is intended.
results_cols = asa_24.columns.tolist() + ['Weighted PCC'] + ncc.columns.tolist()

# Empty frame, filled in by the next cell. This rebinds `results_top`,
# replacing the unweighted table built earlier.
results_top = pd.DataFrame(index=mi, columns=results_cols)

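Copy the ASA24 values to the left side (on the first match row of each group only), then fill in the matched NCC rows and their weighted PCC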

In [38]:
# The ASA24 record is written only on the best-ranked row (match_index == 0) of each group;
# the remaining rows of the group keep empty ASA24 cells.
results_top.loc[results_top.index.get_level_values('match_index') == 0, asa_24.columns] = asa_24.values

# Row-major flattening of the (n_rows, TOP) index array lines up with the
# (asa_index, match_index) product order of the results index.
results_top.loc[:, ncc.columns] = ncc.iloc[indices_top.ravel()].values

# pcc_top holds negated correlations (a by-product of the descending sort), so flip the sign back.
results_top.loc[:, 'Weighted PCC'] = np.negative(pcc_top).ravel()

In [39]:
# results_top.columns.tolist()

### Save results to CSV

In [40]:
# Write the full weighted results table as tab-separated text under ../data/;
# the file name records how many matches per food were kept.
name = 'pcc_matching_results_top_{}_weighted.tsv'.format(TOP)
path = ''.join(('../data/', name))
results_top.to_csv(path_or_buf=path, sep='\t')

In [41]:
# Keep only the descriptive columns and the weighted score, for easier reading of the matches.
results_top_desc_only = results_top.loc[
    :,
    ['Food_Description','Weighted PCC','Keylist','Food Description','Short Food Description','Food Type','Lactose (g)'],
]

In [42]:
# Write the description-only view of the weighted results as tab-separated text.
name = 'pcc_matching_results_top_{}_weighted_desc_only.tsv'.format(TOP)
path = ''.join(('../data/', name))
results_top_desc_only.to_csv(path_or_buf=path, sep='\t')

### Load the data back in
(Check that it was saved, further analysis, etc.)

In [43]:
# Re-read the weighted result files written above. The names are built from TOP,
# exactly as when saving, so changing TOP cannot silently reload stale top_5 files.
# The saved (asa_index, match_index) index comes back as two ordinary columns.
results_top_w = pd.read_csv('../data/pcc_matching_results_top_{}_weighted.tsv'.format(TOP),sep='\t')
results_top_desc_only_w = pd.read_csv('../data/pcc_matching_results_top_{}_weighted_desc_only.tsv'.format(TOP),sep='\t')

In [33]:
# Re-read the unweighted description-only file written above. The name is built from TOP,
# exactly as when saving, so changing TOP cannot silently reload a stale top_5 file.
# This rebinds `results_top_desc_only`, replacing the weighted subset held in memory.
results_top_desc_only = pd.read_csv('../data/pcc_matching_results_top_{}_desc_only.tsv'.format(TOP),sep='\t')