In [1]:
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq
import fastavro
import os

In [2]:
# Folder that every conversion function below reads its input from and writes its result to.
output_dir = 'output'
# exist_ok=True keeps this cell safe to re-run once the folder already exists.
os.makedirs(name=output_dir, exist_ok=True)

In [4]:
def csv_to_parquet():
    """Re-save the sample CSV as a Parquet file.

    Loads ``sample.csv`` from ``output_dir`` into a DataFrame, writes that
    frame to ``csv_to_parquet.parquet`` in the same directory, and prints the
    destination path. Takes no arguments and returns nothing.
    """
    source_path = f'{output_dir}/sample.csv'
    target_path = f'{output_dir}/csv_to_parquet.parquet'

    # Load with pandas' default parsing and pass the frame straight to the Parquet writer.
    pd.read_csv(source_path).to_parquet(target_path)
    print(f"Converted CSV to Parquet: {target_path}")
# Run the conversion. It reads output/sample.csv, which must already exist on disk.
csv_to_parquet()

Converted CSV to Parquet: output/csv_to_parquet.parquet


In [5]:
def json_to_csv():
    """Convert JSON to CSV format.

    Parses ``sample.json`` from ``output_dir``, builds a DataFrame from the
    parsed content (it must be something ``pd.DataFrame`` accepts), and writes
    it to ``json_to_csv.csv`` in the same directory without the index column.
    Prints the destination path. Takes no arguments and returns nothing.

    Raises:
        FileNotFoundError: if ``sample.json`` is missing.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Read JSON
    json_file = f'{output_dir}/sample.json'
    # Decode explicitly as UTF-8. Without it, open() uses the platform locale
    # encoding (e.g. cp1252 on Windows) and non-ASCII text in the JSON is
    # mis-decoded or raises UnicodeDecodeError.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Convert to DataFrame and save as CSV
    df = pd.DataFrame(data)
    csv_file = f'{output_dir}/json_to_csv.csv'
    # index=False: the positional row index is not part of the data.
    df.to_csv(csv_file, index=False)
    print(f"Converted JSON to CSV: {csv_file}")
# Run the conversion. It reads output/sample.json, which must already exist on disk.
json_to_csv()

Converted JSON to CSV: output/json_to_csv.csv


In [6]:
def csv_to_avro():
    """Write the rows of the sample CSV as an Avro file.

    Loads ``sample.csv`` from ``output_dir``, turns every row into a dict, and
    writes the rows with ``fastavro.writer`` to ``csv_to_avro.avro`` in the
    same directory using a fixed ``User`` record schema (``name``: string,
    ``age``: int, ``city``: string). Prints the destination path. Takes no
    arguments and returns nothing.
    """
    source_path = f'{output_dir}/sample.csv'
    target_path = f'{output_dir}/csv_to_avro.avro'

    # One dict per CSV row, keyed by column name. Loaded before the output
    # file is opened so a failed read leaves no empty .avro file behind.
    rows = pd.read_csv(source_path).to_dict(orient='records')

    # Avro record schema: (field name, Avro type) pairs expanded into field specs.
    user_schema = {
        'type': 'record',
        'name': 'User',
        'fields': [
            {'name': field_name, 'type': avro_type}
            for field_name, avro_type in (
                ('name', 'string'),
                ('age', 'int'),
                ('city', 'string'),
            )
        ],
    }

    with open(target_path, 'wb') as sink:
        fastavro.writer(sink, user_schema, rows)
    print(f"Converted CSV to Avro: {target_path}")
# Run the conversion. It reads output/sample.csv, which must already exist on disk.
csv_to_avro()

Converted CSV to Avro: output/csv_to_avro.avro


In [7]:
def json_to_parquet():
    """Convert JSON to Parquet format.

    Parses ``sample.json`` from ``output_dir``, builds a DataFrame from the
    parsed content (it must be something ``pd.DataFrame`` accepts), and writes
    it to ``json_to_parquet.parquet`` in the same directory. Prints the
    destination path. Takes no arguments and returns nothing.

    Raises:
        FileNotFoundError: if ``sample.json`` is missing.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Read JSON
    json_file = f'{output_dir}/sample.json'
    # Decode explicitly as UTF-8. Without it, open() uses the platform locale
    # encoding (e.g. cp1252 on Windows) and non-ASCII text in the JSON is
    # mis-decoded or raises UnicodeDecodeError.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Write to Parquet
    parquet_file = f'{output_dir}/json_to_parquet.parquet'
    df.to_parquet(parquet_file)
    print(f"Converted JSON to Parquet: {parquet_file}")
# Run the conversion. It reads output/sample.json, which must already exist on disk.
json_to_parquet()

Converted JSON to Parquet: output/json_to_parquet.parquet
