In [1]:
import pandas as pd
import sqlite3
import random
from contextlib import contextmanager

# Step 1: Generate sample weather data (simulating HDFS CSV file)
def generate_sample_data(num_records=1000, csv_path='/content/weather_data.csv'):
    """Generate random weather records and write them to a CSV file.

    The CSV stands in for a file stored on HDFS. Each record has a
    sequential ``record_id`` starting at 1, a ``year`` drawn from
    1900-2020 inclusive, and a ``temperature_c`` drawn uniformly from
    the range -50 to 50.

    Args:
        num_records: Number of rows to generate.
        csv_path: Destination of the CSV file. Defaults to the Colab path
            used by the original notebook; pass another path to run
            outside Colab.

    Returns:
        The path the CSV was written to (``csv_path`` as given).
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    years = range(1900, 2021)
    data = {
        'record_id': range(1, num_records + 1),
        'year': [random.choice(years) for _ in range(num_records)],
        'temperature_c': [random.uniform(-50, 50) for _ in range(num_records)],
    }
    df = pd.DataFrame(data)

    # Make sure the target directory exists so the write does not fail on
    # machines that lack the default Colab directory.
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

    # Save to CSV (simulating HDFS file)
    df.to_csv(csv_path, index=False)
    print(f"Sample data generated and saved to {csv_path} (simulating HDFS file).")
    return csv_path

# Step 2: Simulate SQLite connection for Hive-like table
@contextmanager
def sqlite_connection(db_name):
    """Open a SQLite connection to ``db_name`` for the span of a ``with`` block.

    The connection is yielded to the caller and closed when the block is
    left, whether it finishes normally or raises. No commit is issued
    here; callers that need one must commit themselves.
    """
    connection = sqlite3.connect(db_name)
    try:
        yield connection
    finally:
        # Runs on both the success and the error path.
        connection.close()

# Step 3: Simulate Sqoop export/import (CSV to SQLite)
def sqoop_like_import(csv_path, db_name, table_name):
    """Load a CSV file into a SQLite table, mimicking a Sqoop transfer to Hive.

    The CSV is read with pandas, written to ``table_name`` in the SQLite
    database ``db_name`` (replacing any existing table of that name), and
    an index on the ``year`` column is created.

    Args:
        csv_path: Path of the CSV file to read (the simulated HDFS file).
        db_name: SQLite database file to write to.
        table_name: Name of the destination table; the CSV must contain a
            ``year`` column for the index to be created.

    Raises:
        sqlite3.OperationalError: If the index cannot be created, e.g.
            because the table has no ``year`` column.
    """
    def quote_identifier(identifier):
        # Standard SQL identifier quoting: wrap in double quotes and
        # double any embedded quote characters.
        return '"' + identifier.replace('"', '""') + '"'

    # Read CSV (simulating Sqoop export from HDFS)
    df = pd.read_csv(csv_path)
    print(f"Sqoop-like export: Read {len(df)} records from {csv_path} (HDFS).")

    # SQLite index names are unique per database, so derive the name from
    # the table; a fixed name would clash as soon as a second table is
    # imported into the same database file.
    index_name = f'idx_{table_name}_year'

    # Import to SQLite (simulating Hive table)
    with sqlite_connection(db_name) as conn:
        df.to_sql(table_name, conn, if_exists='replace', index=False)
        print(f"Sqoop-like import: Loaded data into {db_name}.{table_name} (Hive table).")

        # Create index (optional, for performance, mimicking Hive index)
        conn.execute(
            f'CREATE INDEX {quote_identifier(index_name)} '
            f'ON {quote_identifier(table_name)}(year)'
        )
        # Commit explicitly: the connection helper closes without committing.
        conn.commit()
        print(f"Index '{index_name}' created on {table_name}.year.")

# Step 4: Generate weather report from SQLite (Hive-like query)
def generate_weather_report(db_name, table_name):
    """Return per-year minimum and maximum temperatures from the table.

    Runs a grouped query against ``table_name`` in the SQLite database
    ``db_name`` and returns a DataFrame with the columns ``year``,
    ``min_temp_c`` and ``max_temp_c``, ordered by year, with both
    temperature columns rounded to one decimal place.
    """
    with sqlite_connection(db_name) as conn:
        query = f'''
            SELECT year,
                   MIN(temperature_c) AS min_temp_c,
                   MAX(temperature_c) AS max_temp_c
            FROM {table_name}
            GROUP BY year
            ORDER BY year
        '''
        yearly_extremes = pd.read_sql_query(query, conn)

    # The frame is fully loaded at this point, so the rounding does not
    # need the connection to stay open.
    for column in ('min_temp_c', 'max_temp_c'):
        yearly_extremes[column] = yearly_extremes[column].round(1)
    return yearly_extremes

# Step 5: Run the POC
if __name__ == "__main__":
    # End-to-end demo: generate the CSV, load it into SQLite, print the
    # per-year report, then show a few raw rows from the table.
    print("=== Simulating Sqoop Export/Import to Hive ===")

    # Target database and table (the simulated Hive side)
    db_name, table_name = 'weather_hive.db', 'weather_data'

    # Produce the HDFS-like CSV and push it through the Sqoop-like import
    csv_path = generate_sample_data(1000)
    sqoop_like_import(csv_path, db_name, table_name)

    # Build the report to verify the imported data
    print("\nGenerating Weather Temperature Statistics Report...")
    report = generate_weather_report(db_name, table_name)

    print("\n=== Weather Report ===")
    print("Year\tMin Temp (°C)\tMax Temp (°C)")
    print("-" * 35)
    for record in report.itertuples(index=False):
        print(f"{int(record.year)}\t{record.min_temp_c}\t\t{record.max_temp_c}")

    print(f"\nSample data from {table_name} (first 5 rows):")
    with sqlite_connection(db_name) as conn:
        preview_sql = f'SELECT * FROM {table_name} LIMIT 5'
        sample_data = pd.read_sql_query(preview_sql, conn)
        print(sample_data)

=== Simulating Sqoop Export/Import to Hive ===
Sample data generated and saved to /content/weather_data.csv (simulating HDFS file).
Sqoop-like export: Read 1000 records from /content/weather_data.csv (HDFS).
Sqoop-like import: Loaded data into weather_hive.db.weather_data (Hive table).
Index 'idx_year' created on weather_data.year.

Generating Weather Temperature Statistics Report...

=== Weather Report ===
Year	Min Temp (°C)	Max Temp (°C)
-----------------------------------
1900	-42.7		45.6
1901	-16.2		33.6
1902	-38.1		33.9
1903	-26.1		43.9
1904	-44.9		44.6
1905	-38.9		40.4
1906	-48.9		32.7
1907	-49.4		49.8
1908	-48.7		27.2
1909	-28.3		13.9
1910	-34.6		43.5
1911	-38.6		25.3
1912	-46.8		48.2
1913	-44.1		28.6
1914	-16.0		32.1
1915	-47.9		49.6
1916	-49.2		49.7
1917	-28.7		42.6
1918	-14.5		49.6
1919	-34.3		21.9
1920	-41.3		46.3
1921	-45.2		43.5
1922	-34.2		32.6
1923	-24.8		36.9
1924	-30.1		49.5
1925	-49.1		28.2
1926	-47.6		-12.9
1927	-46.1		49.0
1928	-47.4		47.8
1929	-46.1		48.1
1930	-13.