# Polars Tutorial - Part 2: Data Import and Export

In this notebook, we'll explore:
- Reading data from various file formats (CSV, JSON, Parquet, Excel)
- Writing data to different formats
- Handling different encodings and compression
- Working with our sample datasets

In [None]:
import glob
import gzip
import json
import os
import shutil

import polars as pl

# Relative path to the directory holding the sample datasets
DATA_DIR = '../data/'

# Report the library version and data location used by the cells below
print(
    f"Polars version: {pl.__version__}",
    f"Data directory: {DATA_DIR}",
    sep="\n",
)

## 1. Reading CSV Files

### 1.1 Basic CSV Reading

In [None]:
# Eagerly load the sales CSV into a DataFrame
sales_csv_path = os.path.join(DATA_DIR, 'sales_data.csv')
df_sales = pl.read_csv(sales_csv_path)

# Show the frame, then its dimensions and the column dtypes
print("Sales Data:", df_sales, sep="\n")
sales_shape, sales_dtypes = df_sales.shape, df_sales.dtypes
print(f"\nShape: {sales_shape}")
print(f"\nData types:\n{sales_dtypes}")

### 1.2 CSV Reading with Options

In [None]:
# Re-read the sales CSV, this time controlling how raw values are interpreted
typed_source = os.path.join(DATA_DIR, 'sales_data.csv')
null_markers = ['NA', 'null', '']  # Strings that should become null
df_sales_typed = pl.read_csv(
    typed_source,
    try_parse_dates=True,  # Let Polars attempt to parse date-like columns
    null_values=null_markers,
)

# Display the frame and confirm what dtype the 'date' column ended up with
date_dtype = df_sales_typed['date'].dtype
print("Sales Data with parsed dates:", df_sales_typed, sep="\n")
print(f"\nDate column type: {date_dtype}")

### 1.3 Reading Specific Columns

In [None]:
# Load just the columns we need instead of the whole file
wanted_columns = ['date', 'product', 'revenue', 'region']
df_subset = pl.read_csv(os.path.join(DATA_DIR, 'sales_data.csv'), columns=wanted_columns)

subset_preview = df_subset.head()
print("Subset of columns:", subset_preview, sep="\n")

### 1.4 CSV Scanning (Lazy Reading)

In [None]:
# Create a lazy frame over the CSV; the data is not loaded at this point
lf_sales = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))

# Describe the query step by step, then materialize it with .collect()
high_revenue = pl.col('revenue') > 1000
result = (
    lf_sales
    .filter(high_revenue)
    .select(['product', 'revenue', 'region'])
    .collect()
)

print("Lazy reading with filtering:", result, sep="\n")

## 2. Reading JSON Files

### 2.1 Basic JSON Reading

In [None]:
# Load the employee records from a JSON file
employees_path = os.path.join(DATA_DIR, 'employees.json')
df_employees = pl.read_json(employees_path)

# Show the frame followed by its (rows, columns) shape
print("Employee Data:", df_employees, sep="\n")
employees_shape = df_employees.shape
print(f"\nShape: {employees_shape}")

### 2.2 JSON Lines Format

In [None]:
# Build a small JSON Lines file for demonstration: one JSON object per line
json_lines_data = [
    {"id": 1, "name": "Item A", "value": 100},
    {"id": 2, "name": "Item B", "value": 200},
    {"id": 3, "name": "Item C", "value": 300}
]

# Use a single path variable so the write and the read cannot drift apart
items_path = os.path.join(DATA_DIR, 'items.jsonl')

# Explicit UTF-8 keeps the file identical across platforms instead of
# depending on the locale's default text encoding
with open(items_path, 'w', encoding='utf-8') as f:
    for item in json_lines_data:
        f.write(json.dumps(item) + '\n')

# Read JSON Lines
df_jsonl = pl.read_ndjson(items_path)
print("JSON Lines data:")
print(df_jsonl)

## 3. Reading Parquet Files

Parquet is a columnar storage format that's very efficient for analytical workloads.

In [None]:
# Eagerly load the transactions Parquet file
transactions_path = os.path.join(DATA_DIR, 'transactions.parquet')
df_transactions = pl.read_parquet(transactions_path)

# Preview the first ten rows and report the overall shape
first_ten = df_transactions.head(10)
print("Transaction Data:", first_ten, sep="\n")
print(f"\nShape: {df_transactions.shape}")

### 3.1 Lazy Parquet Reading

In [None]:
# Create a lazy frame over the Parquet file
lf_transactions = pl.scan_parquet(os.path.join(DATA_DIR, 'transactions.parquet'))

# Build the query in named steps; nothing runs until .collect()
is_completed = pl.col('status') == 'completed'
wanted_fields = ['transaction_id', 'customer_name', 'amount']
result = (
    lf_transactions
    .filter(is_completed)
    .select(wanted_fields)
    .limit(5)
    .collect()
)

print("Completed transactions (lazy):", result, sep="\n")

## 4. Reading Excel Files

In [None]:
# Load the student workbook into a DataFrame
students_path = os.path.join(DATA_DIR, 'students.xlsx')
df_students = pl.read_excel(students_path)

# Show the frame followed by its (rows, columns) shape
print("Student Data:", df_students, sep="\n")
students_shape = df_students.shape
print(f"\nShape: {students_shape}")

### 4.1 Reading Specific Sheets

In [None]:
# If your Excel file has multiple sheets, you can select one by name
# df_sheet = pl.read_excel('file.xlsx', sheet_name='Sheet2')

# Or select one by position. Note that sheet_id is 1-indexed: sheet_id=1 is
# the first sheet, while sheet_id=0 loads *all* sheets and returns a
# {sheet_name: DataFrame} dict instead of a single DataFrame
# df_sheet = pl.read_excel('file.xlsx', sheet_id=1)

print("Excel reading supports sheet selection!")

## 5. Writing Data to Files

### 5.1 Writing to CSV

In [None]:
# Persist the sales frame as CSV
output_csv = os.path.join(DATA_DIR, 'output_sales.csv')
df_sales.write_csv(output_csv)
print(f"Data written to: {output_csv}")

# Round-trip check: load the file we just wrote and count its rows
df_verify = pl.read_csv(output_csv)
rows_written = df_verify.height
print(f"Verified - rows written: {rows_written}")

### 5.2 Writing to JSON

In [None]:
# Destination paths for the JSON and JSON Lines (NDJSON) exports
output_json = os.path.join(DATA_DIR, 'output_employees.json')
output_ndjson = os.path.join(DATA_DIR, 'output_employees.ndjson')

# Write each format in turn, confirming the destination right after each write
json_exports = (
    (output_json, df_employees.write_json),
    (output_ndjson, df_employees.write_ndjson),
)
for export_path, export_writer in json_exports:
    export_writer(export_path)
    print(f"Data written to: {export_path}")

### 5.3 Writing to Parquet

In [None]:
# Write to Parquet (highly compressed, fast)
output_parquet = os.path.join(DATA_DIR, 'output_sales.parquet')
df_sales.write_parquet(output_parquet)

print(f"Data written to: {output_parquet}")

# Compare on-disk sizes of the same data in both formats.
# NOTE: relies on `output_csv` from the "Writing to CSV" cell having been run.
# `os` is already imported at the top of the notebook, so no alias is needed.
csv_size = os.path.getsize(output_csv)
parquet_size = os.path.getsize(output_parquet)

print("\nFile size comparison:")
print(f"CSV: {csv_size:,} bytes")
print(f"Parquet: {parquet_size:,} bytes")
print(f"Compression ratio: {csv_size/parquet_size:.2f}x")

### 5.4 Writing to Excel

In [None]:
# Export the student frame to an .xlsx workbook
output_excel = os.path.join(DATA_DIR, 'output_students.xlsx')
df_students.write_excel(output_excel)

excel_confirmation = f"Data written to: {output_excel}"
print(excel_confirmation)

## 6. Working with Compressed Files

### 6.1 Reading Compressed CSV

In [None]:
# Create a gzip-compressed copy of the sales CSV
csv_gz_path = os.path.join(DATA_DIR, 'sales_data.csv.gz')
with open(os.path.join(DATA_DIR, 'sales_data.csv'), 'rb') as f_in:
    with gzip.open(csv_gz_path, 'wb') as f_out:
        # Stream the bytes across in fixed-size chunks; the source is opened
        # in binary mode, so splitting it on newlines serves no purpose
        shutil.copyfileobj(f_in, f_out)

# Polars can read compressed files directly
df_compressed = pl.read_csv(csv_gz_path)

print("Data read from compressed file:")
print(df_compressed.head())
print(f"Rows: {df_compressed.height}")

### 6.2 Writing Compressed Files

In [None]:
# Parquet compresses data as part of the format; choose the codec explicitly
compressed_parquet_path = os.path.join(DATA_DIR, 'sales_compressed.parquet')
df_sales.write_parquet(compressed_parquet_path, compression='snappy')  # Options: snappy, gzip, lz4, zstd

snappy_confirmation = "Compressed Parquet file created with Snappy compression"
print(snappy_confirmation)

## 7. Reading from URLs

In [None]:
# read_csv accepts a URL as well as a local path;
# a public copy of the iris dataset serves as the example
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"

# Any failure is reported as a note rather than stopping the notebook
try:
    df_iris = pl.read_csv(url)
    iris_preview = df_iris.head()
    print("Iris dataset from URL:", iris_preview, sep="\n")
    print(f"Shape: {df_iris.shape}")
except Exception as err:
    print(f"Note: Reading from URL requires internet connection. Error: {err}")

## 8. Schema Override and Type Specification

In [None]:
# Declare every column's dtype up front instead of relying on inference.
# 'date' is read as a string here and converted to a real date afterwards.
schema = dict(
    date=pl.Utf8,
    product=pl.Utf8,
    category=pl.Utf8,
    quantity=pl.Int64,
    price=pl.Float64,
    revenue=pl.Float64,
    customer_id=pl.Utf8,
    region=pl.Utf8,
)

# Expression that parses the string dates with an explicit format
parsed_date = pl.col('date').str.strptime(pl.Date, format='%Y-%m-%d')

# Read with the declared schema, then replace 'date' with its parsed form
typed_csv_path = os.path.join(DATA_DIR, 'sales_data.csv')
df_typed = pl.read_csv(typed_csv_path, schema=schema).with_columns(parsed_date)

print("DataFrame with explicit schema:", df_typed.head(), sep="\n")
print(f"\nData types:\n{df_typed.dtypes}")

## 9. Batch Processing Multiple Files

In [None]:
# Create multiple CSV files for demonstration: three batches of ten rows each
for i in range(3):
    batch_ids = range(i * 10, (i + 1) * 10)  # computed once, reused for both columns
    df_batch = pl.DataFrame({
        'id': batch_ids,
        'value': [x * 2 for x in batch_ids],
        'batch': [i] * 10
    })
    df_batch.write_csv(os.path.join(DATA_DIR, f'batch_{i}.csv'))

# Read and concatenate multiple files.
# glob returns matches in arbitrary, filesystem-dependent order, so sort the
# paths to make the row order of the combined frame reproducible.
batch_files = sorted(glob.glob(os.path.join(DATA_DIR, 'batch_*.csv')))
dfs = [pl.read_csv(f) for f in batch_files]
df_combined = pl.concat(dfs)

print("Combined data from multiple files:")
print(df_combined)
print(f"Total rows: {df_combined.height}")

## 10. Advanced: Reading Large Files Efficiently

In [None]:
# For very large files, start from a lazy frame rather than loading everything
lf = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))

# Define the query once so the plan shown and the query executed are the same
revenue_query = (
    lf
    .filter(pl.col('revenue') > 500)
    .select(['product', 'revenue'])
)

# Show the optimized query plan
print("Query plan for lazy operations:")
print(revenue_query.explain())

# Execute the query
result = revenue_query.collect()
print("\nFiltered results:", result, sep="\n")

## 11. Summary

In this notebook, we explored:
- ✅ Reading CSV, JSON, Parquet, and Excel files
- ✅ Writing data to various formats
- ✅ Working with compressed files
- ✅ Lazy reading for large files
- ✅ Reading from URLs
- ✅ Schema specification and type control
- ✅ Batch processing multiple files

### Key Takeaways:
1. **Parquet** is the most efficient format for large datasets
2. **Lazy reading** (the `scan_*` functions) is ideal for large files
3. Polars reads gzip-compressed CSV files directly, and Parquet output is compressed as it is written
4. Schema specification gives you full control over data types
5. Polars can read directly from URLs and compressed files

### Performance Tips:
- Use Parquet for storage when possible
- Use lazy reading for large files
- Specify schema to avoid type inference overhead
- Read only required columns for better performance

**Next:** In the next notebook, we'll dive deep into data manipulation and transformations!