<a href="https://colab.research.google.com/github/Mevaria/AAI614_Wehbe/blob/main/Notebook2_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AAI614: Data Science & its Applications

*Notebook 2.5: Practice with Parquet and File Types*



In [1]:
!pip install pyarrow



In [4]:
import pandas as pd
import time
import ssl

# SECURITY NOTE(review): this swaps the stdlib's default HTTPS context factory for
# the unverified one, so later HTTPS requests that rely on that default in this
# kernel skip TLS certificate validation. Presumably a workaround for certificate
# errors when downloading the course files — confirm it is still needed, and
# prefer fixing the local CA bundle over leaving verification switched off.
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
class Timer:
    """Context manager that times the body of a ``with`` block.

    Once the block has exited, ``start`` and ``end`` hold the
    ``time.perf_counter()`` readings taken on entry and exit, and
    ``interval`` holds ``end - start``.
    """

    def __enter__(self):
        """Take the entry reading and return the timer for ``as`` binding."""
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc_info):
        """Take the exit reading and store the elapsed time.

        Returns ``None``, so an exception raised inside the block propagates.
        """
        stopped_at = time.perf_counter()
        self.end = stopped_at
        self.interval = stopped_at - self.start

#### Read the Parquet file and time it

In [6]:
# One timed fetch-and-parse of the remote Parquet copy of the report.
# The URL is read inside the timed block, so the figure includes the download.
parquet_url = 'https://raw.githubusercontent.com/harmanani/AAI614/main/Week%202/niaaa-report.parquet'
with Timer() as t_pd:
    df = pd.read_parquet(parquet_url)
print(t_pd.interval)

0.35003377999998975


#### Read the CSV file and time it

In [7]:
# One timed fetch-and-parse of the remote CSV copy of the report.
# The URL is read inside the timed block, so the figure includes the download.
csv_url = 'https://raw.githubusercontent.com/harmanani/AAI614/main/Week%202/niaaa-report.csv'
with Timer() as t_pd:
    df = pd.read_csv(csv_url)
print(t_pd.interval)

0.14548256300000162


#### Read the ZIP file and time it

In [8]:
import zipfile
# One timed fetch-and-parse of the remote ZIP archive, which pandas
# decompresses itself via compression="zip". The download is part of the timing.
zip_url = 'https://raw.githubusercontent.com/harmanani/AAI614/main/Week%202/niaaa-report.zip'
with Timer() as t_pd:
    df = pd.read_csv(zip_url, compression="zip")
print(t_pd.interval)

0.1257508180000002


In [16]:
import numpy as np
import string, random
from pathlib import Path
from zipfile import ZipFile, ZIP_DEFLATED

def generate_big_dataset(rows=1_000_000, out_dir="output", base_name="big_dataset", seed=42):
    """
    Generate a reproducible synthetic dataset with the given number of rows
    and save it as CSV, ZIP (containing the CSV), and Parquet.

    Every column, including the random ``text`` column, is drawn from a single
    NumPy generator seeded with ``seed``, so the same arguments always produce
    the same frame.

    Args:
        rows (int): number of rows to generate
        out_dir (str): directory to save files (created if it does not exist)
        base_name (str): base name for output files
        seed (int): seed for the random generator (default 42)

    Returns:
        pandas.DataFrame: the generated frame, i.e. the data written to disk.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Generate synthetic data
    rng = np.random.default_rng(seed)
    categories = [f"cat_{i}" for i in range(50)]
    dates = pd.date_range("2010-01-01", periods=3650, freq="D")

    def random_strings(n, length=10):
        """Return ``n`` random alphanumeric strings of ``length`` characters, drawn from ``rng``."""
        alphabet = np.array(list(string.ascii_letters + string.digits))
        # Draw an (n, length) grid of single characters in one vectorised call ...
        chars = rng.choice(alphabet, size=(n, length))
        # ... then reinterpret each row of `length` 1-char cells as one
        # `length`-char string, which avoids a Python-level join per row.
        return chars.view(f"U{length}").ravel().tolist()

    # "text" is built last, so the earlier columns consume the generator in the
    # same order as before and keep their previous seed-42 values.
    df = pd.DataFrame({
        "id": np.arange(rows, dtype=np.int64),
        "category": rng.choice(categories, size=rows),
        "value_float": rng.normal(100, 15, size=rows),
        "value_int": rng.integers(0, 1000, size=rows),
        "flag": rng.choice([True, False], size=rows),
        "date": rng.choice(dates, size=rows),
        "text": random_strings(rows, 12),
    })

    # File paths
    csv_path = out_dir / f"{base_name}.csv"
    zip_path = out_dir / f"{base_name}.zip"
    parquet_path = out_dir / f"{base_name}.parquet"

    # Save CSV
    df.to_csv(csv_path, index=False)

    # Save ZIP (containing CSV); arcname keeps only the file name inside the archive
    with ZipFile(zip_path, "w", compression=ZIP_DEFLATED, compresslevel=6) as zf:
        zf.write(csv_path, arcname=csv_path.name)

    # Save Parquet (this call requires the pyarrow engine)
    df.to_parquet(parquet_path, index=False, engine="pyarrow")

    print(f"Generated {rows:,} rows")
    print(f"CSV: {csv_path}")
    print(f"ZIP: {zip_path}")
    print(f"Parquet: {parquet_path}")
    return df


In [17]:
# Build a 2,000,000-row synthetic dataset, written under output/ as
# niaaa_fake.csv, niaaa_fake.zip and niaaa_fake.parquet; keep the frame as df_big.
df_big = generate_big_dataset(
    rows=2_000_000,
    out_dir="output",
    base_name="niaaa_fake",
)

Generated 2,000,000 rows
CSV: output/niaaa_fake.csv
ZIP: output/niaaa_fake.zip
Parquet: output/niaaa_fake.parquet


In [18]:
# Time reading the generated Parquet file from local disk.
# The path is relative, matching the out_dir="output" the generator wrote to,
# so the cell also works outside Colab's /content working directory.
with Timer() as t_pd:
    df = pd.read_parquet('output/niaaa_fake.parquet')
print(t_pd.interval)

1.7667702629998985


In [19]:
# Time reading the generated CSV file from local disk.
# The path is relative, matching the out_dir="output" the generator wrote to,
# so the cell also works outside Colab's /content working directory.
with Timer() as t_pd:
    df = pd.read_csv('output/niaaa_fake.csv')
print(t_pd.interval)

4.6059821200000215


In [20]:
import zipfile
# Time reading the generated ZIP archive (containing the CSV) from local disk.
# The path is relative, matching the out_dir="output" the generator wrote to,
# so the cell also works outside Colab's /content working directory.
with Timer() as t_pd:
    df = pd.read_csv('output/niaaa_fake.zip', compression="zip")
print(t_pd.interval)

4.640943649000064
