In [52]:
import pandas as pd
import numpy as np
import datetime

### Load the NCC Data

In [2]:
# NCC nutrient table, read as tab-separated text
# (the file name indicates 2018 values per 100 g).
ncc = pd.read_csv(
    '../data/NCC_2018_nutrients_per_100g_originalcolnames.txt',
    sep='\t',
)

Set Food ID as the index

In [3]:
# Confirm 'Food ID' has no duplicate values before it is used as the index.
ncc['Food ID'].is_unique

True

In [4]:
# Re-index the NCC table by 'Food ID'.
# NOTE(review): set_index drops the column by default, so re-running this cell
# without reloading `ncc` will fail to find 'Food ID'.
ncc = ncc.set_index('Food ID')

### Load the data from the ASA24 Recalls

In [5]:
# ASA24 recall records, read with the default comma separator.
asa_24 = pd.read_csv(
    '../data/training_for_GS_122118.csv',
)

Set FoodCode as the index 

In [6]:
# Re-index the recall rows by 'FoodCode'.
# NOTE(review): unlike 'Food ID' above, uniqueness of 'FoodCode' is not checked;
# later cells address ASA24 rows by position, not by this label.
asa_24 = asa_24.set_index('FoodCode')

### Load the file listing matching columns between the datasets

In [7]:
# Tab-separated lookup table pairing NCC column names with FNDDS column names.
matching = pd.read_csv(
    '../data/matching_ncc_fndds_columns.txt',
    sep='\t',
)

Get the list of columns for each dataset

In [8]:
# NCC column names, in the row order of the matching table.
ncc_cols = matching['NCC.Term'].tolist()

# FNDDS column names from the same rows, with spaces removed before they are
# used to select columns of `asa_24`.
asa_24_cols = [
    term.replace(" ","")
    for term in matching['FNDDS.Term'].tolist()
]

### Calculate the pairwise correlations
Define a function to calculate the pairwise Pearson correlation coefficient (PCC) matrix between the rows of two matrices A and B

In [9]:
def row_corr(A,B):
    """Pairwise Pearson correlation between the rows of A and the rows of B.

    Parameters
    ----------
    A : array-like, shape (m, k)
        One observation vector per row.
    B : array-like, shape (n, k)
        One observation vector per row; must have the same number of
        columns as ``A``.

    Returns
    -------
    numpy.ndarray, shape (m, n), dtype float
        Entry ``[i, j]`` is the Pearson correlation coefficient between
        ``A[i]`` and ``B[j]``. The entry is NaN when either row has a centred
        sum of squares of exactly zero (e.g. a constant row) or contains NaN.

    Raises
    ------
    ValueError
        If either input is not 2-D or the column counts differ.
    """
    # Work in floating point: with integer input the sums of squares and
    # their product can silently overflow and give wrong coefficients.
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    if A.ndim != 2 or B.ndim != 2 or A.shape[1] != B.shape[1]:
        raise ValueError(
            'row_corr expects two 2-D arrays with the same number of columns; '
            'got shapes {} and {}'.format(A.shape, B.shape)
        )

    # Centre each row first. This is algebraically the same as the one-pass
    # N*sum(xy) - sum(x)*sum(y) form but avoids subtracting two large,
    # nearly equal numbers.
    A_centered = A - A.mean(axis=1, keepdims=True)
    B_centered = B - B.mean(axis=1, keepdims=True)

    # Numerator: cross products of every A row with every B row -> (m, n).
    cross = np.dot(A_centered, B_centered.T)

    # Denominator: product of the per-row sums of squares.
    ss_A = np.einsum('ik,ik->i', A_centered, A_centered)
    ss_B = np.einsum('jk,jk->j', B_centered, B_centered)
    denom = np.sqrt(ss_A[:, None] * ss_B[None, :])

    # Zero-variance rows give 0/0; silence the warning and report NaN explicitly.
    with np.errstate(divide='ignore', invalid='ignore'):
        pcorr = cross / denom
    pcorr[denom == 0] = np.nan

    # Round-off can push a perfect correlation marginally outside [-1, 1].
    np.clip(pcorr, -1.0, 1.0, out=pcorr)
    return pcorr

Get the columns provided by the `matching` file

In [10]:
# Extract the matched nutrient columns from each table as NumPy arrays.
# Both column lists come from the same rows of `matching`, so column k of A
# and column k of B are the paired terms from row k of that table.
A = asa_24.loc[:,asa_24_cols].values
B = ncc.loc[:,ncc_cols].values

In [11]:
# corr[i, j] is the correlation between ASA24 row i (A) and NCC row j (B).
corr = row_corr(A,B)

  app.launch_new_instance()
  app.launch_new_instance()


### Get results

Get indices of the top matches from the correlation matrix

In [46]:
TOP = 5 # set the number of matches to return

# For every ASA24 row, rank the NCC rows by descending correlation and keep the
# positions of the first TOP. Sorting the negated matrix gives descending order.
indices_top = np.argsort(-corr, axis=1)[:, :TOP]

# Gather the matching correlation values from the ranking instead of sorting
# the whole matrix a second time. The values are kept negated here because the
# cell that fills the 'PCC' column flips the sign back.
pcc_top = -np.take_along_axis(corr, indices_top, axis=1)

In [45]:
# Show the ranking with one row per match rank and one column per ASA24 row.
indices_top.T

array([[11197, 11213,  5474, ..., 13351,  5485,  5491],
       [16436, 11246, 11314, ..., 13347,  9773, 16046],
       [11198, 11245, 11312, ..., 13350, 10583, 13444],
       [11195,  8027, 11309, ..., 13357, 10580, 15110],
       [ 7102,  6772, 11311, ...,  9867,  5657,  4406]])

Construct dataframe to store top results

In [13]:
# Row labels: every (ASA24 row position, match rank) pair, ASA24 position varying slowest.
iters = [list(range(len(asa_24))), list(range(TOP))]
mi = pd.MultiIndex.from_product(iters, names=['asa_index', 'match_index'])

# Column layout: ASA24 columns, then the correlation, then the NCC columns.
results_cols = asa_24.columns.tolist() + ['PCC'] + ncc.columns.tolist()

# Empty frame; the values are filled in by the next cell.
results_top = pd.DataFrame(columns=results_cols, index=mi)

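Copy the ASA24 values to the left side (on the first match row of each item only), the matched NCC rows to the right side, and the PCC into the column between them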

In [58]:
# ASA24 values go only on the first match row (match_index == 0) of each item.
is_first_match = results_top.index.get_level_values('match_index') == 0
results_top.loc[is_first_match, asa_24.columns] = asa_24.values

# Row-major flattening of the (item, rank) arrays lines up with the
# (asa_index, match_index) product order of the frame's index.
matched_positions = indices_top.ravel()
results_top.loc[:, ncc.columns] = ncc.iloc[matched_positions, :].values

# pcc_top holds negated correlations, so flip the sign back.
results_top.loc[:, 'PCC'] = -pcc_top.ravel()

### Save results to CSV

In [60]:
# Output file is named after the number of matches kept.
# NOTE(review): the name carries no '.csv' extension.
name = 'pcc_matching_results_top_{}'.format(TOP)
path = ''.join(['../data/', name])
results_top.to_csv(path_or_buf=path)

In [75]:
# Keep only the description columns and the correlation for a compact view.
results_top_desc_only = results_top[
    [
        'Food_Description',
        'PCC',
        'Keylist',
        'Food Description',
        'Short Food Description',
    ]
]

In [77]:
# Write the description-only table next to the full results.
# NOTE(review): the name carries no '.csv' extension.
name = 'pcc_matching_results_top_{}_desc_only'.format(TOP)
path = ''.join(['../data/', name])
results_top_desc_only.to_csv(path_or_buf=path)