In [26]:
%pip uninstall -y sasl || true
%pip install --quiet "pyhive>=0.7.0" "thrift>=0.16,<1" thrift_sasl pure-sasl pymysql pandas


[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [27]:
from pyhive import hive
import pandas as pd

conn = hive.Connection(
    host="hive-server",
    port=10000,
    username="hive",
    database="default",
    auth="NONE",          # uses thrift_sasl + pure-sasl under the hood
)
print(pd.read_sql("SELECT 1 AS ok", conn))
conn.close()


   ok
0   1


  print(pd.read_sql("SELECT 1 AS ok", conn))


In [29]:
from pyhive import hive
import pymysql
from pymysql.constants import CLIENT

HIVE = dict(host="hive-server", port=10000, database="default", username="hive", auth="NONE")
MYSQL = dict(host="mariadb", port=3306, db="analytics", user="etl", password="etlpass")

HIVE_SQL = """
SELECT
  CAST(year AS INT)               AS year,
  CAST(month AS INT)              AS month,
  COALESCE(passenger_count, -1.0) AS passenger_count,   -- map NULL → -1
  AVG(total_amount)               AS avg_total_amount,
  COUNT(*)                        AS n_trips
FROM yellow_taxi
WHERE year='2019'
GROUP BY year, month, COALESCE(passenger_count, -1.0)
ORDER BY year, month, passenger_count
"""

MYSQL_DDL = """
CREATE TABLE IF NOT EXISTS taxi_monthly_summary (
  year INT NOT NULL,
  month INT NOT NULL,
  passenger_count DECIMAL(10,2),
  avg_total_amount DECIMAL(18,4),
  n_trips BIGINT,
  PRIMARY KEY (year, month, passenger_count)
) ENGINE=InnoDB;
"""

UPSERT_SQL = """
INSERT INTO taxi_monthly_summary
  (year, month, passenger_count, avg_total_amount, n_trips)
VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
  avg_total_amount = VALUES(avg_total_amount),
  n_trips          = VALUES(n_trips);
"""

def hive_stream(sql, arraysize=5000):
    c = hive.Connection(host=HIVE["host"], port=HIVE["port"],
                        username=HIVE["username"], database=HIVE["database"],
                        auth=HIVE["auth"])
    cur = c.cursor()
    cur.execute(sql)
    while True:
        rows = cur.fetchmany(arraysize)
        if not rows: break
        for r in rows:
            yield r
    cur.close(); c.close()

m = pymysql.connect(host=MYSQL["host"], port=MYSQL["port"], user=MYSQL["user"],
                    password=MYSQL["password"], database=MYSQL["db"],
                    autocommit=False, client_flag=CLIENT.MULTI_STATEMENTS)
mc = m.cursor()
mc.execute(MYSQL_DDL); m.commit()

buf, total = [], 0
for y,mn,pc,avgv,cnt in hive_stream(HIVE_SQL):
    buf.append((int(y), int(mn),
                None if pc is None else float(pc),
                None if avgv is None else float(avgv),
                int(cnt)))
    if len(buf) >= 10_000:
        mc.executemany(UPSERT_SQL, buf); m.commit(); total += len(buf); buf.clear()

if buf:
    mc.executemany(UPSERT_SQL, buf); m.commit(); total += len(buf)

mc.close(); m.close()
print(f"[OK] Upserted {total} rows into analytics.taxi_monthly_summary")


[OK] Upserted 132 rows into analytics.taxi_monthly_summary
