In [4]:
import polars as pl
import pandas as pd
import time
from pathlib import Path

In [5]:
# Locate the project root: when the kernel was started inside the "notebooks"
# folder the root is one level up, otherwise the working directory is the root.
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
else:
    project_root = current_dir

# Input (raw CSV) and output (bronze) locations, both under <root>/data.
data_path = project_root.joinpath("data", "raw")
bronze_path = project_root.joinpath("data", "bronze")

# Create the bronze folder (and any missing parents); a no-op if it already exists.
bronze_path.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Data path: {data_path}")
print(f"Bronze folder created: {bronze_path}")

Project root: c:\python-projects\F1-pipeline
Data path: c:\python-projects\F1-pipeline\data\raw
Bronze folder created: c:\python-projects\F1-pipeline\data\bronze


In [6]:
# Collect every raw CSV input. glob() yields entries in whatever order the
# filesystem returns them, which varies between platforms, so sort the result
# to make the listing (and any later iteration over csv_files) reproducible.
csv_files = sorted(data_path.glob("*.csv"))
print(f"Number of CSV files: {len(csv_files)}\n")
for file in csv_files:
    print(f"  - {file.name}")

Number of CSV files: 14

  - circuits.csv
  - constructors.csv
  - constructor_results.csv
  - constructor_standings.csv
  - drivers.csv
  - driver_standings.csv
  - lap_times.csv
  - pit_stops.csv
  - qualifying.csv
  - races.csv
  - results.csv
  - seasons.csv
  - sprint_results.csv
  - status.csv


In [12]:
# Single-shot read benchmark: load the same CSV with pandas and with polars,
# then report elapsed time, shape and in-memory size for each.
print("\n" + "="*50)
print("PANDAS vs POLARS")
print("="*50)

circuits_csv = data_path / "circuits.csv"

# time.perf_counter() is used instead of time.time(): it is the clock intended
# for measuring short durations, whereas the wall clock can be too coarse for
# sub-millisecond reads and may report an elapsed time of exactly 0.0.
start = time.perf_counter()
df_pandas = pd.read_csv(circuits_csv)
pandas_time = time.perf_counter() - start
print(f"\n Pandas: {pandas_time:.4f} seconds")
print(f"  Shape: {df_pandas.shape}")
# deep=True also counts the Python string objects held in object columns.
print(f" Memory: {df_pandas.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

start = time.perf_counter()
df_polars = pl.read_csv(circuits_csv)
polars_time = time.perf_counter() - start
print(f"\n Polars: {polars_time:.4f} seconds")
print(f"  Shape: {df_polars.shape}")
print(f" Memory: {df_polars.estimated_size('mb'):.2f} MB")

# Guard the ratio: a zero elapsed time would otherwise raise ZeroDivisionError.
if polars_time > 0:
    print(f"\nPolars was {pandas_time / polars_time:.2f}x faster!")
else:
    print("\nPolars read time was below timer resolution; speed-up not computed.")


PANDAS vs POLARS

 Pandas: 0.0043 seconds
  Shape: (77, 9)
 Memory: 0.03 MB

 Polars: 0.0010 seconds
  Shape: (77, 9)
 Memory: 0.01 MB

Polars was 4.32x faster!


In [13]:
# Plain-text overview of the circuits frame loaded with polars:
# column dtypes, a five-row preview, and summary statistics.
banner = "=" * 50
print(f"\n{banner}")
print("DATASET STRUCTURE: circuits")
print(banner)

# Each call prints the section title and its value on separate lines.
print("\nSchema (data types):", df_polars.schema, sep="\n")

print("\nFirst 5 rows:", df_polars.head(), sep="\n")

print("\nDescriptive statistics:", df_polars.describe(), sep="\n")


DATASET STRUCTURE: circuits

Schema (data types):
Schema([('circuitId', Int64), ('circuitRef', String), ('name', String), ('location', String), ('country', String), ('lat', Float64), ('lng', Float64), ('alt', Int64), ('url', String)])

First 5 rows:
shape: (5, 9)
┌───────────┬─────────────┬─────────────┬─────────────┬───┬──────────┬─────────┬─────┬─────────────┐
│ circuitId ┆ circuitRef  ┆ name        ┆ location    ┆ … ┆ lat      ┆ lng     ┆ alt ┆ url         │
│ ---       ┆ ---         ┆ ---         ┆ ---         ┆   ┆ ---      ┆ ---     ┆ --- ┆ ---         │
│ i64       ┆ str         ┆ str         ┆ str         ┆   ┆ f64      ┆ f64     ┆ i64 ┆ str         │
╞═══════════╪═════════════╪═════════════╪═════════════╪═══╪══════════╪═════════╪═════╪═════════════╡
│ 1         ┆ albert_park ┆ Albert Park ┆ Melbourne   ┆ … ┆ -37.8497 ┆ 144.968 ┆ 10  ┆ http://en.w │
│           ┆             ┆ Grand Prix  ┆             ┆   ┆          ┆         ┆     ┆ ikipedia.or │
│           ┆             ┆ 