# Data Modifications
This notebook strips the unneeded fields out of the source data files in order to cut down on the amount of data placed into the databases and on the import times for the files.

In [1]:
import csv
import json
import os
import sys

!{sys.executable} -m pip install tqdm
from tqdm import tqdm

def generate_chunks(data, chunk_size):
    """Lazily split ``data`` into consecutive slices of at most ``chunk_size`` items.

    Every slice holds ``chunk_size`` items except possibly the last one, which
    holds whatever remains. ``data`` must support ``len()`` and slicing.
    """
    total = len(data)
    yield from (data[start:start + chunk_size] for start in range(0, total, chunk_size))

def generate_stripped_file(file_name, remove_fields):
    """Write a copy of a JSON data file with the unwanted fields removed.

    The file is expected to hold a JSON array of objects. Every field named in
    ``remove_fields`` is dropped from each object (fields that are absent are
    ignored) and the result is written next to the source file as
    ``<name>_stripped.json``.

    Args:
        file_name: Path (str) of the source JSON file.
        remove_fields: Iterable of field names to remove from each record.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(file_name, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Remove the data for each field in the remove_fields list
    for row in tqdm(data, desc="Removing Uneeded Fields..."):
        for field in remove_fields:
            row.pop(field, None)

    # Only cut off a real ".json" suffix; blindly slicing five characters would
    # mangle a file name that has a different (or no) extension
    suffix = '.json'
    if file_name.lower().endswith(suffix):
        base_name = file_name[:-len(suffix)]
    else:
        base_name = file_name
    stripped_file_name = base_name + '_stripped.json'

    # Write the new contents to the new data file
    with open(stripped_file_name, 'w', encoding='utf-8') as stripped_file:
        json.dump(data, stripped_file)



## Commodities

In [6]:
# Fields dropped from every commodity record before the stripped copy is written
remove_fields = [
    'is_rare',
    'is_non_marketable',
]
file_name = 'commodities.json'
generate_stripped_file(file_name, remove_fields)

## Factions

In [7]:
# Fields dropped from every faction record before the stripped copy is written
remove_fields = [
    'updated_at',
    'is_player_faction',
    'government_id',
    'alliance_id',
]
file_name = 'factions.json'
generate_stripped_file(file_name, remove_fields)

## Modules

In [8]:
# Fields dropped from every module record before the stripped copy is written
remove_fields = [
    'belongs_to',
    'ed_id',
    'game_context_id',
]
file_name = 'modules.json'
generate_stripped_file(file_name, remove_fields)

## Stations

In [2]:
file_name = 'stations.json'
remove_fields = ['updated_at', 'government_id', 'alliance_id', 'shipyard_updated_at', 'outfitting_updated_at',
                 'market_updated_at', 'ed_market_id', 'body_id']
generate_stripped_file(file_name, remove_fields)

# Re-read the original (unstripped) stations file to build the module listings
with open('stations.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Generate the modules listings .csv files from the stations:
# one record per station whose 'selling_modules' value is non-empty
data_list = [
    {'station_id': row['id'], 'modules': row['selling_modules']}
    for row in data
    if row['selling_modules']
]

# Fixed column list (same keys as each record above) so that an empty
# data_list does not raise IndexError on data_list[0]
csv_columns = ['station_id', 'modules']

# open() does not create missing directories, so make sure the target exists
os.makedirs('module_listings', exist_ok=True)

chunks = list(generate_chunks(data_list, 400))
for i in tqdm(range(len(chunks)), desc="Writing module chunk files..."):
    with open(f'module_listings/modules_listing{i}.csv', 'w', newline='', encoding='utf-8') as module_file:
        writer = csv.DictWriter(module_file, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(chunks[i])
# Release the large intermediates; kernel memory persists across cells
del chunks
del data

Removing Uneeded Fields...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 66024/66024 [00:00<00:00, 421654.78it/s]
Writing module chunk files...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 113/113 [00:09<00:00, 12.41it/s]


## Systems Populated

In [10]:
# Fields dropped from every populated-system record before the stripped copy is written
remove_fields = [
    'allegiance_id',
    'security_id',
    'primary_economy_id',
    'power_state_id',
    'reserve_type_id',
    'ed_system_address',
]
file_name = 'systems_populated.json'
generate_stripped_file(file_name, remove_fields)

## Listings

In [8]:
remove_fields = ['supply_bracket', 'demand_bracket', 'collected_at']

# newline='' lets the csv module handle line endings itself, as its docs recommend
with open('listings.csv', 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.DictReader(file)
    data = list(csv_reader)

for row in tqdm(data, desc="Removing Uneeded Fields..."):
    for field in remove_fields:
        row.pop(field, None)

# Chunk size is a fifteenth of the rows, rounded down (so a final, smaller
# chunk may hold the remainder); max() keeps it from becoming 0, which
# range() rejects, when there are fewer than 15 rows
chunk_size = max(1, len(data) // 15)
chunks = list(generate_chunks(data, chunk_size))

# Guard the header lookup so an empty listings file does not raise IndexError
csv_columns = list(data[0].keys()) if data else []

# open() does not create missing directories, so make sure the target exists
os.makedirs('listing_chunks', exist_ok=True)

for i in tqdm(range(len(chunks)), desc="Writing Chunk Files..."):
    # Write with the same encoding the source file was read with
    with open(f'listing_chunks/listings_stripped{i}.csv', 'w', newline='', encoding='utf-8') as stripped_file:
        writer = csv.DictWriter(stripped_file, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(chunks[i])
# Release the large intermediates; kernel memory persists across cells
del data
del chunks

Removing Uneeded Fields...: 100%|████████████████████████████████████████| 3931637/3931637 [00:04<00:00, 882091.26it/s]
Writing Chunk Files...: 100%|██████████████████████████████████████████████████████████| 16/16 [00:19<00:00,  1.21s/it]
