In [1]:
using Unitful #https://painterqubits.github.io/Unitful.jl/stable/
#quantity * @u_str("unit abbreviation") 
using Symbolics #https://symbolics.juliasymbolics.org/dev/
#cite https://doi.org/10.48550/arXiv.2105.03949
using Latexify
using Test
#1 * @u_str("mA") is 1 milliamp
using CSV, DataFrames
#using Plots
using PlotlyJS
using Printf
using SymPy
using PDFIO
using Unzip
using Interpolations
using Plots
using TensorCast
using CatViews
using MappedArrays
#plotlyjs()
data_dir = "C:\\Cross-Section-Data\\EXFOR\\"

"C:\\Cross-Section-Data\\EXFOR\\"

#Design

    Looping check_line() through each line in the file detects where each dataset begins. 

    Reading the line above each output of check_line() gives the number of rows in each dataset.

    make_spacing_dict() gets the names of the columns and the instructions for how to read each dataset.

    read_dataset() reads each and constructs a dictionary from each dataset.
    
    read_exfor_file() runs all of the above in order to return a single DataFrame of all the data stored at a given file path.
     

In [2]:
function check_line(line, start)
    if length(line) > length(start) - 1
        return (line[1:length(start)] == start)
    end
    return false
end

function make_spacing_dict(line1, line2)
    spacing_ends = [collect(out)[1] for out in findall(">", line2)]
    spacing_starts = append!([1], [collect(out)[1] for out in findall("<", line2)])
    #Find where some of the spacings begin and end based on the arrows
    if length(spacing_ends) != length(spacing_starts)
        print("Error: improper formatting")
        return 
    end
    indices_with_gaps = [index for index in  1:length(spacing_ends)-1 if 
                    spacing_ends[index] != spacing_starts[index+1]-1]
    #Some of the spacings are instead denoted by the letter o instead of arrows
    missing_spacings = [spacing_ends[index]+1:spacing_starts[index+1]-1 for index in indices_with_gaps]
    spacings = append!(missing_spacings, 
    [spacing_starts[i]:spacing_ends[i] for i in 1:length(spacing_ends)])
    spacing_dict = Dict([])
    #Make a dictionary where the keys are name for each column and the values are the indices of the columns
    spacing_names = [spacing_dict[strip(line1[spacing], [' ', '#'])] = spacing for spacing in spacings]
    return spacing_dict
end

function read_datum(datum)
    #Reads a single datum from a line of data
    datum = strip(datum, [' '])
    out = tryparse(Float64, datum)
    if out == nothing
        return datum 
    end
    return out
end

function read_dataset(spacing_specifier, file_as_vector)
    index = spacing_specifier
    spacing_dict = make_spacing_dict(file_as_vector[index], file_as_vector[index + 1])
    lines_of_data = tryparse(Int64, split(file_as_vector[index - 1], [' '])[end])
    data_dict = Dict([])
    filler = [[read_datum(file_as_vector[line_num][spacing_dict[key]])
                for line_num in index + 2: index + 1 + lines_of_data] 
                for key in keys(spacing_dict)]
    return filler#, lines_of_data
end

read_dataset (generic function with 1 method)

In [3]:
function check_spacings(file_path)
    #Reads an EXFOR file and returns a dictionary of data
    file_as_vector = readlines(file_path)
    spacing_specifiers = [index for index in 1:length(file_as_vector) 
                        if check_line(file_as_vector[index], "# Prj")]
    if spacing_specifiers == []
        print("Error: no data found")
        return
    end
    list_spacing_dict = [make_spacing_dict(file_as_vector[spacing_specifier], 
                                            file_as_vector[spacing_specifier+1])
                        for spacing_specifier in spacing_specifiers]
    #check if the spacing dicts are the same
    for index in 2:length(spacing_specifiers)
        if list_spacing_dict[index] != list_spacing_dict[1]
            return false
        end
    end
    return true
end

check_spacings (generic function with 1 method)

In [5]:
subdirs = [content * "\\" for content in readdir(data_dir) if isdir(data_dir * "\\" * content)]
for subdir in subdirs
    files = readdir(data_dir * subdir)
    test = [check_spacings(data_dir * subdir * file) for file in files]
    println("It is ", all(test),
     " that the spacings in each file are all self consistent for the subdirectory ", subdir)
end

It is true that the spacings in each file are all self consistent for the subdirectory alphas\
It is true that the spacings in each file are all self consistent for the subdirectory deuterons\
It is true that the spacings in each file are all self consistent for the subdirectory gammas\
It is true that the spacings in each file are all self consistent for the subdirectory helions\
It is true that the spacings in each file are all self consistent for the subdirectory neutrons\
It is true that the spacings in each file are all self consistent for the subdirectory other\
It is true that the spacings in each file are all self consistent for the subdirectory protons\


So I only need to retrieve the spacings of the data once for each file. 

It it faster to use 1 or loop for $n$ list comprehensions? 

In [22]:
get_field(field, input_array) = reduce(CatView, mappedarray(x->x[field], input_array))

function read_exfor_file(file_path)
    #Reads an EXFOR file and returns a dictionary of data
    file_as_vector = readlines(file_path)
    spacing_specifiers = [index for index in 1:length(file_as_vector) 
                        if check_line(file_as_vector[index], "# Prj")]
    if spacing_specifiers == []
        print("Error: no data found")
        return
    end
    dataset_rows = [tryparse(Int64, split(file_as_vector[index - 1], [' '])[end])
                            for index in spacing_specifiers]
    #Each dataset should have the same column names
    spacing_dict = make_spacing_dict(file_as_vector[spacing_specifiers[1]], 
                    file_as_vector[spacing_specifiers[1] + 1])
    #make an empty dataframe to fill up with the data
    df = DataFrame([Vector{Union{Missing, Float64, String, SubString{String}}}(missing, sum(dataset_rows)) 
                   for _ in 1:length(keys(spacing_dict))], [key for key in keys(spacing_dict)])
    _ = [df[:,key] = vec([[read_datum(file_as_vector[line_num][spacing_dict[key]]) 
                    for line_num in spacing_specifiers[index] + 2: 
                                    spacing_specifiers[index] + 1 + dataset_rows[index]]
                                    for index in 1:length(spacing_specifiers)])
                    for key in keys(spacing_dict)]           
    return df
    #df = DataFrame([Vector{Union{Missing, Float64, String, SubString{String}}}(missing, rows_in_combined_df) 
    #                for _ in 1:length(spacing_keys)], spacing_keys)
    #Fill the dataframe with the data
    #current_row = 1
    #This for loop is probably the slowest part of the code. It alone takes over 50% of the runtime.  
    #df = reduce(vcat,[DataFrame(read_dataset(spacing_specifiers[i], file_as_vector)) 
    #        for i in 1:length(spacing_specifiers)])
    list_of_datasets = [read_dataset(spacing_specifiers[i], file_as_vector)
                    for i in 1:length(spacing_specifiers)] 
    #final_dict = reduce(vcat, dict_list)
    return list_of_datasets
    df = get_field("Data", [read_dataset(spacing_specifiers[i], file_as_vector)
                             for i in 1:length(spacing_specifiers)])
    #==
    for spacing_specifier in spacing_specifiers
        data_dict, rows_in_dataset = read_dataset(spacing_specifier, file_as_vector)
        _ = [df[current_row:current_row+rows_in_dataset-1, key] = data_dict[key]
            for key in keys(data_dict)]
        current_row += rows_in_dataset
    end
    ==#
    return df
end

read_exfor_file (generic function with 2 methods)

Retrieving all of data at the correct spacings (without reorganizing it) takes 4 seconds for a large file. 

In [21]:
file_path = data_dir * "alphas\\007_N_014.c4"
read_exfor_file(file_path)

UndefVarError: UndefVarError: index not defined

Maybe consider using CatViews? 

https://www.juliapackages.com/p/catviews 

https://stackoverflow.com/questions/46301279/julia-efficient-ways-to-vcat-n-arrays 

method5(json_in) = reduce(CatView, mappedarray(x->x["transactions"], json_in))

In [5]:
A = randn(10, 10);
B = randn(10, 10);
a = view(A, :);      # no copying
b = view(B, :);      # no copying
x = CatView(a, b)

200-element CatView{2, Float64}:
  0.9951794404941244
 -0.710940746884212
  0.46077682059309294
  2.20655157813473
 -0.726121253635956
  0.047923866801383386
  0.31472713103841204
  0.8359749511993977
  0.8541259742661336
  0.8118137456720229
  ⋮
 -1.5251789978482457
 -0.8789687003101161
  0.15513860791591735
  0.3847956801629314
 -0.6254079611667328
 -0.08286596220103365
 -0.09567205982105349
  0.8917107265925021
  1.2570873129485147

In [10]:
a = Dict(["cat" => 1, "dog" => 2])
length(keys(a))

2

In [29]:
c = reshape([1:1000], 1000,1);

DimensionMismatch: DimensionMismatch("new dimensions (1000, 1) must be consistent with array size 1")

For a 2 layer nested vector

In [87]:
a = [[m*n for n in 1:10] for m in 1:10]
collect(Iterators.flatten(a))

100-element Vector{Int64}:
   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
   ⋮
  20
  30
  40
  50
  60
  70
  80
  90
 100

For a 3 layer nested vector

In [86]:
a = [[collect(m:n) for n in 1:10] for m in 1:10]
collect(Iterators.flatten(collect.(Iterators.flatten.(a))))

220-element Vector{Int64}:
  1
  1
  2
  1
  2
  3
  1
  2
  3
  4
  ⋮
  8
  9
  8
  9
 10
  9
  9
 10
 10

In [35]:
d = vec([1:1000])
reshape(d, 1000,1)

DimensionMismatch: DimensionMismatch("new dimensions (1000, 1) must be consistent with array size 1")