In [None]:
from pathlib import Path
import csv

DATA_ROOT = Path("../data/raw")

print(f"Counting rows in all CSVs in {DATA_ROOT}...\n")

# Map every CSV found under DATA_ROOT (searched recursively) to the number of
# records csv.reader yields for it. Nothing is subtracted, so a header line
# is counted like any other record.
file_rows = {}
for csv_file in DATA_ROOT.rglob("*.csv"):
    with csv_file.open(newline="") as handle:
        file_rows[csv_file] = sum(1 for _ in csv.reader(handle))

# The grand total is simply the sum of the per-file counts.
total_rows = sum(file_rows.values())

print(f"\nGrand total rows across all CSVs: {total_rows:,}\n")


Counting rows in all CSVs in ..\data\raw...


Grand total rows across all CSVs: 19,797,842



In [None]:

print("Top-level summary (folder → rows):")

# Roll the per-file counts up to the first path component below DATA_ROOT
# (e.g. 'commodities').
folder_totals = {}
for path, rows in file_rows.items():
    top_folder = path.relative_to(DATA_ROOT).parts[0]
    folder_totals.setdefault(top_folder, 0)
    folder_totals[top_folder] += rows

# One line per folder, in alphabetical order.
for folder in sorted(folder_totals):
    print(f"  {folder:<25} {folder_totals[folder]:,}")

# To list every file individually, uncomment the two lines below:
# for path, rows in sorted(file_rows.items()):
#     print(f"{path}: {rows:,}")


Top-level summary (folder → rows):
  commodities               573,159
  dax                       341,750
  forex                     627,009
  indices                   372,543
  japan-investment-trust    41,589
  japanstockes              11,554,859
  s&p500                    6,286,933


In [None]:
from pathlib import Path
import csv
from collections import defaultdict

DATA_ROOT = Path("../data/raw")


def count_data_rows(csv_path: Path) -> int:
    """Return the number of data records in ``csv_path``, excluding the header.

    The first record of the file is assumed to be a header and is not counted.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Records yielded by ``csv.reader`` minus one for the header, never
        less than 0.
    """
    with csv_path.open(newline="") as f:
        record_count = sum(1 for _ in csv.reader(f))
    # An empty file has no header to discount; clamp at 0 so it cannot
    # contribute -1 and silently shrink the totals.
    return max(record_count - 1, 0)


grand_total = 0
# summary[top_folder][freq] -> number of data rows accumulated for that bucket
summary = defaultdict(lambda: defaultdict(int))

for csv_path in DATA_ROOT.rglob("*.csv"):
    row_count = count_data_rows(csv_path)

    rel_parts = csv_path.relative_to(DATA_ROOT).parts

    # First path component below DATA_ROOT, e.g. 'commodities'.
    # NOTE: for a CSV sitting directly in DATA_ROOT this is the file name itself.
    top_folder = rel_parts[0]

    # Frequency = second path component when it is '1D' / '1h' (matched
    # case-insensitively, stored with its original spelling), else 'other'.
    freq = rel_parts[1] if len(rel_parts) > 1 and rel_parts[1].lower() in {"1d", "1h"} else "other"

    summary[top_folder][freq] += row_count
    grand_total += row_count

print(f"\nGrand total rows across all CSVs: {grand_total:,}\n")

# Tree-style report: one heading per top-level folder, one line per frequency.
for folder in sorted(summary):
    print(f"{folder}/")
    for freq, rows in sorted(summary[folder].items()):
        print(f"  └─ {freq:<5} : {rows:,}")
    print()



Grand total rows across all CSVs: 22,313,454

commodities/
  └─ 1D    : 190,202
  └─ 1h    : 382,957

dax/
  └─ 1D    : 153,885
  └─ 1h    : 187,865

forex/
  └─ 1D    : 142,085
  └─ 1h    : 484,924

france/
  └─ 1D    : 171,923
  └─ 1h    : 191,332

indices/
  └─ 1D    : 226,707
  └─ 1h    : 145,836

japan-investment-trust/
  └─ 1D    : 16,090
  └─ 1h    : 25,499

japanstockes/
  └─ 1D    : 5,418,373
  └─ 1h    : 6,136,486

s&p500/
  └─ 1D    : 3,780,318
  └─ 1h    : 2,506,615

uk/
  └─ 1D    : 985,740
  └─ 1h    : 1,166,617



In [None]:
# Remove unwanted characters (double quotes and commas) from the CSV
from pathlib import Path

src  = Path("../uk.csv")
dest = Path("../uk1.csv")

# Deletion-only translation table: each listed character maps to None,
# so str.translate() drops it from the text.
table = {ord(unwanted): None for unwanted in '",'}

# newline='' on both handles passes line endings through untouched.
with src.open(encoding='utf-8', newline='') as fin:
    with dest.open('w', encoding='utf-8', newline='') as fout:
        fout.writelines(line.translate(table) for line in fin)
