In [1]:
import os
import pandas as pd

# Data input
## Read data and cleanup
Results files can be found in sph-nearest-neighbor/test/results/ 

Naming scheme: `stats_BIN.csv`, e.g. `stats_fr_ann.csv`.

In [2]:
# Relative path of the directory that holds the benchmark result files.
resultsdir = "../test/results"
print("Available files:")
# Bare last expression: the notebook displays the directory listing as the cell output.
os.listdir(resultsdir)

Available files:


['stats_fr_cellLinkedList.csv',
 'stats_fr_ann.csv.~2~',
 'stats_fr_ann_nolist.csv.~5~',
 'stats_fr_ann_nolist.csv.~1~',
 'stats_fr_ann_nolist.csv.~4~',
 'stats_fr_cellLinkedList.csv.~1~',
 'stats_fr_ann.csv~',
 'stats_fr_cellLinkedList.csv.~5~',
 'stats_fr_cellLinkedList.csv.~4~',
 'stats_fr_cellLinkedList.csv.~3~',
 'stats_fr_ann_nolist.csv.~2~',
 'stats_fr_ann_nolist.csv',
 'stats_fr_ann.csv.~1~',
 'stats_fr_ann_nolist.csv.~3~',
 'stats_fr_cellLinkedList.csv.~2~',
 'stats_fr_ann_method1.csv',
 'stats_fr_ann_method0.csv',
 'stats_fr_nanoflann.csv',
 'stats_fr_ann.csv']

We are going to read three of these files:

In [3]:
# Build the full path of each stats file inside the results directory.
# Order of the targets matches the order of the file names below.
statsfile_fr_cll, statsfile_fr_ann, statsfile_fr_ann_nolist = (
    os.path.join(resultsdir, csv_name)
    for csv_name in (
        "stats_fr_cellLinkedList.csv",
        "stats_fr_ann.csv",
        "stats_fr_ann_nolist.csv",
    )
)

Read each file and label its rows with the search method, convert `fill` to an integer percentage, keep only the relevant columns, and combine everything into a single dataframe:

In [4]:
# Columns present in the raw stats files that the rest of the notebook does not use.
UNUSED_COLUMNS = ["time", "sizex", "sizey", "sizez"]


def load_stats(path, method):
    """Read one stats CSV and prepare it for the combined dataframe.

    Parameters
    ----------
    path : str
        Path of the stats CSV file to read.
    method : str
        Label written to the new ``method`` column for every row.

    Returns
    -------
    pandas.DataFrame
        The rows of the file with ``fill`` multiplied by 100 and stored as a
        rounded integer, a ``method`` column appended, and the columns listed
        in ``UNUSED_COLUMNS`` removed.
    """
    stats = pd.read_csv(path)
    # Round before casting: astype(int) truncates toward zero, so a value such
    # as 0.29 (0.29 * 100 == 28.999999999999996) would otherwise become 28.
    fill_percent = stats["fill"].mul(100).round().astype(int)
    return stats.assign(method=method, fill=fill_percent).drop(columns=UNUSED_COLUMNS)


# One labelled, cleaned dataframe per search method.
df_cll = load_stats(statsfile_fr_cll, "CLL")
df_annnl = load_stats(statsfile_fr_ann_nolist, "ANN-NL")
df_ann = load_stats(statsfile_fr_ann, "ANN")

# Combine into a single dataframe; each part keeps its own row index.
df = pd.concat([df_cll, df_annnl, df_ann])

## Max time cutoff
Because list processing took hours in some cases with ANN search, you can **set a cutoff for the maximum total time and drop the observations that exceed it from the dataframe**.
To choose a cutoff, first sort the dataframe by `ttotal`:

In [5]:
# Show the first rows after sorting by ttotal in descending order (slowest runs first).
df.sort_values("ttotal", ascending=False).head()

Unnamed: 0,filltype,fill,ndatapts,ttotal,tksearch,tfrsearch,tprocessing,listmethod,memory,method
16,corners,11,28158,1345.19,0.711435,0.451737,1343.96,0,50908,ANN
15,corners,11,28158,1342.73,0.701913,0.445528,1341.52,0,50972,ANN
18,corners,11,28158,1340.65,0.701537,0.438849,1339.45,0,50980,ANN
19,corners,11,28158,1337.01,0.708696,0.446213,1335.78,0,51056,ANN
17,corners,11,28158,1336.39,0.700407,0.44291,1335.18,0,51132,ANN


Choose a cutoff time and drop the values

In [6]:
# The filter is always applied, but 1400 s lies above the slowest run listed
# above (about 1345 s), so no rows are dropped here. Lower the value to
# exclude long-running observations.
max_time_cutoff = 1400 # [s]
# .copy() makes the filtered result an independent dataframe, so assigning to
# its columns later does not raise pandas' SettingWithCopyWarning when the
# cutoff actually removes rows.
df = df[df["ttotal"] <= max_time_cutoff].copy()

Resulting dataframe:

In [7]:
# Show the first rows of the filtered dataframe, sorted by ttotal in descending order.
df.sort_values("ttotal", ascending=False).head()

Unnamed: 0,filltype,fill,ndatapts,ttotal,tksearch,tfrsearch,tprocessing,listmethod,memory,method
16,corners,11,28158,1345.19,0.711435,0.451737,1343.96,0,50908,ANN
15,corners,11,28158,1342.73,0.701913,0.445528,1341.52,0,50972,ANN
18,corners,11,28158,1340.65,0.701537,0.438849,1339.45,0,50980,ANN
19,corners,11,28158,1337.01,0.708696,0.446213,1335.78,0,51056,ANN
17,corners,11,28158,1336.39,0.700407,0.44291,1335.18,0,51132,ANN


## Convert memory values to MB
`time` reports the maximum resident set size of the process during its lifetime, in kilobytes.

In [8]:
# Scale the memory column by 1/1000 (kilobytes -> MB, per the note above).
# NOTE: the column is overwritten, so re-running this cell scales it again.
kb_to_mb = 1 / 1000
df["memory"] = df["memory"] * kb_to_mb

## Write out results to csv

In [9]:
# Preview the first rows of the dataframe that is about to be written out.
df.head()

Unnamed: 0,filltype,fill,ndatapts,ttotal,tksearch,tfrsearch,tprocessing,listmethod,memory,method
0,full,100,250000,2.06572,0.0,0.0,0.0,-1,116.74,CLL
1,full,100,250000,2.16067,0.0,0.0,0.0,-1,116.784,CLL
2,full,100,250000,2.11499,0.0,0.0,0.0,-1,116.78,CLL
3,full,100,250000,2.17325,0.0,0.0,0.0,-1,116.748,CLL
4,full,100,250000,2.16133,0.0,0.0,0.0,-1,116.704,CLL


In [10]:
# Write the dataframe to results_df.csv (relative path); index=False leaves out the row index.
df.to_csv("results_df.csv", index=False)