# Parse data from raw files and create .json with target data

In [1]:
# import
import os
import csv
import json
# import h5py # TODO: write data to hdf5 file

In [2]:
# config
# Directory containing the raw input files. Every entry in it is listed with
# os.listdir() and opened by the cells below.
INPUT_DIR = 'raw_data'

### Obtain Relevant Data
In this case, I'm only interested in the height and weight from each sport.
We'll:
1. read in the data
2. parse relevant information
3. write the data to a new file for future use

Writing the cleaned data to a new file may not be necessary, but if we were dealing with massive amounts of data, depending on our machine, it may not be possible to store/read all the data at once.  In this example, we'll loop over the files one at a time.

### Create dictionary mapping of header information

In [3]:
# loop directory, create header details dict for each file
# Result: fileDict_attributeMapping[file name] -> {attribute: column index},
# built from the header row of every file in INPUT_DIR.
ATTRIBUTES = ['Height', 'Weight']
fileDict_attributeMapping = {}
for datapath in os.listdir(INPUT_DIR):
    # Only the header row is needed here. It is parsed with the csv module
    # (not a manual split on ",") so quoted column names are handled, and it
    # is decoded as latin-1 -- the encoding the data rows are read with --
    # so a non-ASCII byte in the header cannot raise UnicodeDecodeError.
    with open(os.path.join(INPUT_DIR, datapath),
              encoding='latin-1', newline='') as csvfile:
        # an empty file has no header row; treat it as having no columns
        fileHeader_contents = next(csv.reader(csvfile), [])

    # column name -> position; if a name is repeated the last column wins
    columnIndex = {name: index
                   for index, name in enumerate(fileHeader_contents)}

    # create index mapping for the attributes present in this file
    fileDict_attributeMapping[datapath] = {
        attribute: columnIndex[attribute]
        for attribute in ATTRIBUTES
        if attribute in columnIndex
    }

# ensure we've created our mapping
print(fileDict_attributeMapping)

{'nhl_ht_wt.csv': {'Weight': 4, 'Height': 3}, 'epl_2015_ht_wt.csv': {'Weight': 6, 'Height': 5}, 'nba_ht_wt.csv': {'Weight': 3, 'Height': 2}}


In [4]:
# loop directories and read in data
# NOTE: rows are read one at a time, but every extracted record is kept in
# memory (in data_dict) until it is written out

# this will hold the data (in the intended format) to write to the `.json` file
# layout: {group name: [{attribute: value, ...} for each data row]}
data_dict = {}

for datapath in os.listdir(INPUT_DIR):
    # create dataset name from file path ('nba', 'nhl', 'epl')
    groupName = datapath[:3]

    # column index of each target attribute for this file
    curDict = fileDict_attributeMapping[datapath]

    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for files passed to csv.reader
    with open(os.path.join(INPUT_DIR, datapath),
              encoding='latin-1', newline='') as csvfile:
        reader = csv.reader(csvfile)

        # skip the first (header) line; the default avoids StopIteration
        # on an empty file
        next(reader, None)

        # collect the target attributes from each row (in this case an
        # individual player); values are kept as the raw strings from the file
        listOfData = [
            {attribute: row[curDict[attribute]] for attribute in ATTRIBUTES}
            for row in reader
            if row  # csv.reader yields [] for blank lines -- nothing to index
        ]

    data_dict[groupName] = listOfData

In [5]:
# Persist the collected records as a JSON file
# json.dump serialises directly to the open file object
# doc: https://www.quantifiedcode.com/knowledge-base/maintainability/Use%20dump%20instead%20of%20dumps%20for%20json%20files/55b5QncE
with open("ht_wt_data_2014.json", mode="w") as out_handle:
    json.dump(data_dict, fp=out_handle)

## All data was collected and stored in a json file
Next we'll explore this data