In [None]:
import pandas as pd
from sqlalchemy import create_engine
import json
import os

with open('../Credentials/awm_database_credentials.json') as data_file:
    data = json.load(data_file)
    
HOST = data['host']
PORT = data['port']
USER = data['user']
PASSWORD = data['password']
DATABASE = data['database']

engine = create_engine(f"mysql+pymysql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}")

# estimate_query = """
# SELECT COUNT(*) AS total_rows
# FROM waypoint wp
# JOIN tracking t ON wp.id_tracking = t.id
# WHERE t.duration > 300000
# AND t.length > 500
# AND t.is_exported = 0
# AND (t.is_invalid IS NULL OR t.is_invalid = 0)
# AND (SELECT MAX(latitude) - MIN(latitude) FROM waypoint WHERE id_tracking = t.id) > 0.001
# AND (SELECT MAX(longitude) - MIN(longitude) FROM waypoint WHERE id_tracking = t.id) > 0.001;
# """

# # Fetch row count estimate
# row_count_df = pd.read_sql(estimate_query, engine)
# print(f"Estimated number of rows to load: {row_count_df['total_rows'][0]}")

# Query: Select only valid trackings and their waypoints in chunks
query = """
SELECT wp.id_tracking, wp.id, wp.time, wp.type, wp.sequence, wp.comment, wp.speed, wp.heading, wp.duration, 
       wp.block_type, wp.log, wp.latitude, wp.longitude, wp.altitude, wp.meta_tag, wp.meta_value
FROM waypoint wp
JOIN tracking t ON wp.id_tracking = t.id
WHERE t.duration > 300000
AND t.length > 500
AND t.is_exported = 0
AND (t.is_invalid IS NULL OR t.is_invalid = 0)
AND (SELECT MAX(latitude) - MIN(latitude) FROM waypoint WHERE id_tracking = t.id) > 0.001
AND (SELECT MAX(longitude) - MIN(longitude) FROM waypoint WHERE id_tracking = t.id) > 0.001;
"""

chunk_size = 1000  # Adjust based on RAM capacity
output_file = "filtered_waypoints.parquet"

if os.path.exists(output_file):
    os.remove(output_file)

for chunk in pd.read_sql(query, engine, chunksize=chunk_size):
    chunk.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False, append=False)
    print(f"Saved {len(chunk)} rows to {output_file}...")

df = pd.read_parquet(output_file)

print(df.info())
print(df.describe())

engine.dispose()

TypeError: __cinit__() got an unexpected keyword argument 'append'

In [None]:
import pymysql
import pandas as pd
import pyarrow.parquet as pq  
import pyarrow as pa

HOST = data['host']
PORT = int(data['port']) 
USER = data['user']
PASSWORD = data['password']
DATABASE = data['database']

query = """
SELECT 
    wp.id_tracking, wp.id, wp.time, wp.type, wp.sequence, wp.comment, 
    wp.speed, wp.heading, wp.duration, wp.block_type, wp.log, 
    wp.latitude, wp.longitude, wp.altitude, wp.meta_tag, wp.meta_value
FROM waypoint wp
JOIN tracking t ON wp.id_tracking = t.id
WHERE 
    t.duration BETWEEN 18000000000 AND 360000000000  -- Between 0.5 Hour and 10 Hours
    AND t.length BETWEEN 5 AND 150  -- Between 5km and 150km
    AND (t.is_invalid IS NULL OR t.is_invalid = 0)
    AND EXISTS (
        SELECT 1 FROM waypoint w 
        WHERE w.id_tracking = t.id
        HAVING COUNT(*) > 10  -- Ensure at least 10 waypoints exist
    )
    AND (
        (SELECT MAX(latitude) FROM waypoint WHERE id_tracking = t.id) - 
        (SELECT MIN(latitude) FROM waypoint WHERE id_tracking = t.id)
    ) > 0.0005  -- At least ~50m in latitude
    AND (
        (SELECT MAX(longitude) FROM waypoint WHERE id_tracking = t.id) - 
        (SELECT MIN(longitude) FROM waypoint WHERE id_tracking = t.id)
    ) > 0.0005  -- At least ~50m in longitude
ORDER BY t.duration DESC;  -- Sort by duration, longest trips first


"""

parquet_file = "gps_data_relaxed_parameters_more.parquet"

conn = pymysql.connect(
    host=HOST,
    port=PORT,
    user=USER,
    password=PASSWORD,
    database=DATABASE,
    cursorclass=pymysql.cursors.SSCursor  
)

chunk_size = 1000000 

first_chunk = True  

try:
    for chunk in pd.read_sql(query, conn, chunksize=chunk_size):
        table = pa.Table.from_pandas(chunk)

        if first_chunk:
            pq.write_table(table, parquet_file, compression="snappy")
            first_chunk = False
        else:
            with pq.ParquetWriter(parquet_file, table.schema, compression="snappy") as writer:
                writer.write_table(table)

        print(f"Processed {len(chunk)} rows...")
    
    print("data saved")

except Exception as e:
    print(f"Error: {e}")

finally:
    conn.close() 


  for chunk in pd.read_sql(query, conn, chunksize=chunk_size):


Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 1000000 rows...
Processed 10

In [4]:
# no filter version. (Load all data)
import pandas as pd
from sqlalchemy import create_engine
import json
import os
import pymysql
import pandas as pd
import pyarrow.parquet as pq  
import pyarrow as pa

with open('../Credentials/awm_database_credentials.json') as data_file:
    data = json.load(data_file)

HOST = data['host']
PORT = int(data['port']) 
USER = data['user']
PASSWORD = data['password']
DATABASE = data['database']

query = """
SELECT 
    wp.id_tracking, wp.id, wp.time, wp.type, wp.sequence, wp.comment, 
    wp.speed, wp.heading, wp.duration, wp.block_type, wp.log, 
    wp.latitude, wp.longitude, wp.altitude, wp.meta_tag, wp.meta_value
FROM waypoint wp
JOIN tracking t ON wp.id_tracking = t.id;



"""

parquet_file = "all_gps_data_without_sorting.parquet"

conn = pymysql.connect(
    host=HOST,
    port=PORT,
    user=USER,
    password=PASSWORD,
    database=DATABASE,
    cursorclass=pymysql.cursors.SSCursor  
)

chunk_size = 10000000 # was beofre 100.000 and working

first_chunk = True  

try:
    for chunk in pd.read_sql(query, conn, chunksize=chunk_size):
        table = pa.Table.from_pandas(chunk)

        if first_chunk:
            pq.write_table(table, parquet_file, compression="snappy")
            first_chunk = False
        else:
            with pq.ParquetWriter(parquet_file, table.schema, compression="snappy") as writer:
                writer.write_table(table)

        print(f"Processed {len(chunk)} rows...")
    
    print("data saved")

except Exception as e:
    print(f"Error: {e}")

finally:
    conn.close() 


  for chunk in pd.read_sql(query, conn, chunksize=chunk_size):


Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 10000000 rows...
Processed 7759733 rows...
data saved
