In [None]:
import gc
import os
import json
import psutil
import zipfile
import warnings
from pathlib import Path
from datetime import datetime

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

warnings.filterwarnings("ignore")

In [None]:
# Trading pair to process. It names the output folder and, together with
# data_type, forms the glob pattern used to find downloaded archives.
symbol = "SOLUSDT"
data_type = "klines"

# Folder layout, resolved relative to the parent of the current working directory:
#   data/downloaded     -> source .zip archives read by the extraction step
#   data/raw/<symbol>   -> Parquet parts written by the extraction step
data_folder = Path.cwd().parent / "data"
downloaded_folder = data_folder / "downloaded"
raw_folder = data_folder / "raw"
symbol_folder = raw_folder / symbol
# Create the output folder (and any missing parents); no error if it already exists.
symbol_folder.mkdir(parents=True, exist_ok=True)

### Data Extracting

This step unzips the downloaded archives containing CSVs for the target symbol (`SOLUSDT`, as set in `symbol` above), splits the extracted rows into fixed-size chunks of `ROWS` rows, and saves each chunk as a Parquet file in the symbol's folder under `data/raw`. Source archives are deleted after they have been processed. Later cells load and concatenate the Parquet parts (covering the downloaded date range) so the data is ready for analysis.

In [None]:
# Number of rows written to each Parquet part (the last part may hold fewer).
ROWS = 250_000
# Shared buffer state used by flush() and the extraction loop:
#   buf      - DataFrame slices waiting to be written as the next part
#   buf_rows - total number of rows currently held in buf
#   part     - 1-based number used in the next part's file name
buf = []
buf_rows = 0
part = 1

# Column names assigned to the CSV fields, in file order (the CSVs are read
# with header=None, so these names are applied positionally).
cols = [
	"open_time",
	"open",
	"high",
	"low",
	"close",
	"volume",
	"close_time",
	"quote_volume",
	"count",
	"taker_buy_volume",
	"taker_buy_quote_volume",
	"ignore",
]

# Target dtype for every column; applied to each chunk by convert_dtypes().
dtypes = {
	"open_time": "int64",
	"open": "float64",
	"high": "float64",
	"low": "float64",
	"close": "float64",
	"volume": "float64",
	"close_time": "int64",
	"quote_volume": "float64",
	"count": "int64",
	"taker_buy_volume": "float64",
	"taker_buy_quote_volume": "float64",
	"ignore": "int64",
}

def flush():
	"""Write the buffered rows as the next Parquet part and reset the buffer.

	Does nothing when no rows are buffered. Otherwise the buffered slices are
	concatenated into one frame, written to ``part<NNNN>.parquet`` inside
	``symbol_folder``, the buffer is emptied and the part counter advances.
	"""
	global buf, buf_rows, part
	if not buf_rows:
		return
	out_path = symbol_folder / f"part{part:04d}.parquet"
	merged = pd.concat(buf, ignore_index=True)
	pq.write_table(pa.Table.from_pandas(merged, preserve_index=False), out_path)
	# Reset shared state so the next batch starts from an empty buffer.
	buf.clear()
	part += 1
	buf_rows = 0

def convert_dtypes(df, dtype_map=None):
	"""Return a copy of ``df`` with columns cast to their target dtypes.

	Parameters
	----------
	df : pandas.DataFrame
		Frame to convert. It is not modified; callers must use the return value.
	dtype_map : dict, optional
		Mapping of column name -> dtype. Defaults to the notebook-level
		``dtypes`` mapping when omitted.

	Returns
	-------
	pandas.DataFrame
		New frame with the mapped columns cast; other columns are unchanged.

	Raises
	------
	KeyError
		If a key of the mapping is not a column of ``df``.
	"""
	if dtype_map is None:
		dtype_map = dtypes
	# One astype call on the whole frame avoids assigning into a frame that
	# may itself be a filtered slice of another frame.
	return df.astype(dtype_map)

# Archives are processed oldest-first. The sort key takes the last three
# "-"-separated pieces of the file stem, drops anything after the first "_",
# and parses the remainder as YYYY-MM-DD.
processed_archives = []
for zip_file in sorted(
	downloaded_folder.glob(f"{symbol}-*{data_type}.zip"),
	key=lambda x: datetime.strptime("-".join(x.stem.split("-")[-3:]).split("_")[0], "%Y-%m-%d"),
):
	with zipfile.ZipFile(zip_file) as zf:
		for name in sorted([n for n in zf.namelist() if n.lower().endswith(".csv")]):
			with zf.open(name) as f:
				for chunk in pd.read_csv(
					f,
					header=None,
					names=cols,
					chunksize=10_000
				):
					# Drop rows whose first field is the literal column name,
					# i.e. a header line embedded in the CSV.
					chunk = chunk[~chunk.iloc[:, 0].eq(chunk.columns[0])]
					chunk = convert_dtypes(chunk)
					# Move rows into the buffer, writing a part each time it
					# reaches exactly ROWS rows; a chunk may span two parts.
					while len(chunk) > 0:
						need = ROWS - buf_rows
						take = chunk.iloc[:need]
						if len(take) > 0:
							buf.append(take)
							buf_rows += len(take)
						chunk = chunk.iloc[need:]
						if buf_rows == ROWS:
							flush()
	processed_archives.append(zip_file)

# Write whatever is left in the buffer as the final (possibly shorter) part.
flush()

# Remove the source archives only after every row has been written to disk.
# Deleting each archive right after reading it could lose rows that were still
# only in the in-memory buffer if a later archive failed to process.
for zip_file in processed_archives:
	zip_file.unlink()

print(f"Saved {part-1} parts to {symbol_folder}")

### Resource usage

Loads each Parquet part briefly to report on-disk size, the DataFrame's logical memory usage, and the process RSS delta incurred while reading the file. The printed per-part metrics (disk, logical_mem, rss_delta) and the derived "overhead" factor are intended to help you estimate the real memory cost of loading the dataset as a single DataFrame.

Review the SUMMARY output to understand whether it is safe to load all parts at once or whether you should instead: process data in streaming/chunked fashion, filter or downcast columns, increase swap/VM resources, or use out-of-core tools (e.g., Dask, Vaex) to avoid exhausting system RAM.

In [None]:
def sizeof_fmt(num, suffix="B"):
	"""Format a byte count with binary (1024-based) unit prefixes.

	The value is divided by 1024 until its magnitude drops below 1024 or the
	"Ti" prefix has been passed; anything larger is reported in "Pi".

	Parameters
	----------
	num : int or float
		Quantity to format (may be negative).
	suffix : str
		Unit suffix appended after the prefix.

	Returns
	-------
	str
		The scaled value with one decimal place, e.g. ``"1.5KiB"``.
	"""
	scaled = num
	for prefix in ("", "Ki", "Mi", "Gi", "Ti"):
		if -1024.0 < scaled < 1024.0:
			return "{:3.1f}{}{}".format(scaled, prefix, suffix)
		scaled = scaled / 1024.0
	return "{:.1f}Pi{}".format(scaled, suffix)

def rss_bytes():
	"""Return the resident set size (RSS) of the current process, as reported by psutil."""
	current_process = psutil.Process(os.getpid())
	return current_process.memory_info().rss

# Measure each Parquet part on its own, then summarise. `parts` is also
# reused by later cells.
parts = sorted(symbol_folder.glob("part*.parquet"))
if not parts:
	print(f"No parquet parts found in {symbol_folder}")
else:
	rows = []
	disk_sizes = []
	logical_mem = []
	rss_deltas = []
	overheads = []

	for p in parts:
		size_on_disk = p.stat().st_size
		before = rss_bytes()
		df = pd.read_parquet(p, engine="pyarrow")
		after = rss_bytes()
		row_count = len(df)
		mem_bytes = df.memory_usage(deep=True).sum()
		# RSS can shrink between the two samples; clamp so a part never
		# contributes a negative amount.
		delta_rss = max(after - before, 0)
		overhead = delta_rss / mem_bytes if mem_bytes else 0

		rows.append(row_count)
		disk_sizes.append(size_on_disk)
		logical_mem.append(mem_bytes)
		rss_deltas.append(delta_rss)
		overheads.append(overhead)

		print(
			f"{p.name}: rows={row_count:,}, disk={sizeof_fmt(size_on_disk)}, "
			f"logical_mem={sizeof_fmt(mem_bytes)}, rss_delta={sizeof_fmt(delta_rss)}, "
			f"overhead={overhead:,.2f}x"
		)

		# Release the frame before measuring the next part.
		del df
		gc.collect()

	total_rows = sum(rows)
	total_disk = sum(disk_sizes)
	total_logical = sum(logical_mem)
	total_rss = sum(rss_deltas)
	avg_overhead = sum(overheads) / len(overheads) if overheads else 0
	avg_mem_per_row = total_rss / total_rows if total_rows else 0
	# Memory freed after one part can be reused when reading the next, so
	# later RSS deltas may be close to zero and their sum can fall below the
	# size of the data itself. Use the logical DataFrame size as a floor so
	# the single-DataFrame estimate is never smaller than the data.
	est_base = max(total_rss, total_logical)
	est_total_single_df = int(est_base * 1.10)

	print("\nSUMMARY")
	print(f"parts: {len(parts)}")
	print(f"total rows: {total_rows:,}")
	print(f"total disk size: {sizeof_fmt(total_disk)}")
	print(f"total logical mem: {sizeof_fmt(total_logical)}")
	print(f"total rss delta: {sizeof_fmt(total_rss)}")
	print(f"avg overhead factor: {avg_overhead:.2f}x")
	print(f"avg mem per row: {sizeof_fmt(avg_mem_per_row)}/row")
	print(f"estimated memory to load as one DataFrame (≈+10% overhead): {sizeof_fmt(est_total_single_df)}")

### Continuity checks for time series integrity

Verifies the 1-minute cadence of the `open_time` column within and between Parquet parts. It detects duplicates, non-monotonic steps, and missing rows by comparing successive timestamps. Use these diagnostics to identify gaps or anomalies that must be handled before feature engineering or model training (e.g., imputation, interpolation, or dropping corrupted rows).

In [None]:
# verify continuity of open_time across parquet parts (1 minute = 60_000 ms)
DELTA = 60_000

# Issue labels for steps found inside a part (False) and at the boundary
# between two consecutive parts (True): (duplicate, too close, gap).
_STEP_KINDS = {
	False: ("duplicate", "non-monotonic/too-close", "missing"),
	True: ("boundary_duplicate", "boundary_non_monotonic/too-close", "boundary_missing"),
}


def _classify_step(diff, boundary=False):
	"""Classify a timestamp step that differs from DELTA.

	Parameters
	----------
	diff : int
		Difference in ms between two consecutive open_time values.
	boundary : bool
		True when the two values come from different parts.

	Returns
	-------
	tuple[str, int]
		The issue label and the number of whole DELTA-sized rows missing
		between the two timestamps (0 for duplicates and too-close steps).
	"""
	duplicate_kind, close_kind, gap_kind = _STEP_KINDS[boundary]
	if diff == 0:
		return duplicate_kind, 0
	if diff < DELTA:
		return close_kind, 0
	return gap_kind, int(diff // DELTA) - 1


issues = []
prev_last = None

for p in sorted(parts):
	df = pd.read_parquet(p, engine="pyarrow")
	if df.empty:
		# Nothing to compare; leave prev_last untouched so the boundary check
		# still links the neighbouring non-empty parts.
		print(f"Skipping empty part {p.name}")
		continue
	# Rows are sorted before checking, so steps inside a part are never
	# negative; the reset index makes labels equal to row positions.
	df = df.sort_values("open_time").reset_index(drop=True)

	diffs = df["open_time"].diff()
	# Skip row 0: it has no predecessor inside this part (its diff is NaN).
	bad_idx = diffs[1:].ne(DELTA)
	for i in bad_idx[bad_idx].index:
		prev_time = int(df.at[i - 1, "open_time"])
		cur_time = int(df.at[i, "open_time"])
		diff = cur_time - prev_time
		kind, missing = _classify_step(diff)
		issues.append({
			"part": p.name,
			"type": kind,
			"prev_index": i - 1,
			"cur_index": i,
			"prev_time_ms": prev_time,
			"cur_time_ms": cur_time,
			"prev_time": pd.to_datetime(prev_time, unit="ms"),
			"cur_time": pd.to_datetime(cur_time, unit="ms"),
			"diff_ms": diff,
			"missing_rows": missing,
		})

	# check boundary with previous part
	first_time = int(df.at[0, "open_time"])
	if prev_last is not None and first_time != prev_last + DELTA:
		diff = first_time - prev_last
		kind, missing = _classify_step(diff, boundary=True)
		issues.append({
			"part": p.name,
			"type": kind,
			"prev_part_last_time_ms": prev_last,
			"cur_part_first_time_ms": first_time,
			"prev_part_last_time": pd.to_datetime(prev_last, unit="ms"),
			"cur_part_first_time": pd.to_datetime(first_time, unit="ms"),
			"diff_ms": diff,
			"missing_rows": missing,
		})

	prev_last = int(df.at[len(df) - 1, "open_time"])

	del df
	gc.collect()

if not issues:
	print("No gaps or anomalies detected!")
else:
	print(f"Detected {len(issues)} issue(s). Sample:")
	for it in issues[:10]:
		# default=str renders the Timestamp fields as text, so no per-key
		# conversion is needed before serialising.
		print(json.dumps(it, default=str, indent=2))
	issues_df = pd.DataFrame(issues)
	display(issues_df)

In [None]:
# Read every part, order its rows by open_time, then stack the parts (in
# file-name order) into a single frame named `df` for the cells below.
part_frames = []
for part_path in sorted(parts):
	part_frames.append(
		pd.read_parquet(part_path, engine="pyarrow")
		.sort_values("open_time")
		.reset_index(drop=True)
	)
	gc.collect()

df = pd.concat(part_frames, ignore_index=True)
# Drop the per-part list so only the combined frame stays referenced.
del part_frames
gc.collect()
df
	

In [None]:
# Descriptive statistics for every column except the two timestamp columns.
df_without_time = df.drop(["open_time", "close_time"], axis=1)
print(f"\n{'='*15} STATS {'='*15}\n")
print(df_without_time.describe().T.to_string(float_format='{:,.0f}'.format))

print(f"\n{'='*15} INFO {'='*15}\n")
df.info()

print(f"\n{'='*15} DUPLICATES STATS {'='*15}\n")
# Per column: number of rows minus number of distinct values. nunique() scans
# the whole frame, so it is computed once and reused for both columns below.
row_total = df.shape[0]
duplicate_counts = row_total - df.nunique()
duplicates_per_column = pd.DataFrame({
	"count": duplicate_counts,
	"percent": (duplicate_counts / row_total * 100).round(2),
})
print(duplicates_per_column)

print(f"\n{'='*15} TOP 5 MOST FREQUENT VALUES {'='*15}\n")
frequent_values = {}
for col in df.columns:
	# Columns whose values are all distinct have no "frequent" values to show.
	if duplicate_counts.loc[col] == 0:
		continue

	# Count once; the share of each value is its count over the sum of all
	# counts, which is what value_counts(normalize=True) reports.
	value_totals = df[col].value_counts()
	counts = value_totals.head(5)
	percents = counts / value_totals.sum() * 100
	combined = pd.DataFrame({
		"Value": counts.index,
		"Count": counts.values,
		"Percent": percents.values.round(2)
	})
	frequent_values[col] = combined

for col, table in frequent_values.items():
	print(f"Column: {col}")
	print(f"{table}\n")
