In [None]:
import os
import pandas as pd
import geopandas as gpd

# Input/output locations for the whole notebook.
# Each can be overridden through an environment variable so the notebook runs
# on another machine without editing this cell; the defaults are the original
# hard-coded values.
output_dir = os.environ.get("FREIGHT_OUTPUT_DIR", "/Users/keshavsaraogi/Desktop/freight")
# NOTE: the default below is a placeholder; point it (or FREIGHT_DATA_DIR) at
# the folder that holds the traffic CSV files before loading data.
data_dir = os.environ.get("FREIGHT_DATA_DIR", "/path/to/dataset/folder")
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists


In [None]:

def load_traffic_data(street_prefix, directory=None):
    """
    Combines all traffic CSV files for a given street prefix.

    Files are read in sorted filename order so that the row order of the
    result is reproducible (os.listdir returns entries in arbitrary order).

    :param street_prefix: Prefix of the street files (e.g., "And", "Bel", "Bxl").
    :param directory: Folder to search. Defaults to the notebook-level ``data_dir``.
    :return: Combined DataFrame with traffic data, re-indexed from 0.
    :raises ValueError: If no ``.csv`` file in the folder starts with the prefix.
    """
    if directory is None:
        directory = data_dir

    files = sorted(
        name for name in os.listdir(directory)
        if name.startswith(street_prefix) and name.endswith('.csv')
    )
    if not files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # report which prefix and folder produced no match instead.
        raise ValueError(
            f"No CSV files starting with '{street_prefix}' found in {directory}"
        )

    dfs = []
    for file in files:
        file_path = os.path.join(directory, file)
        print(f"Loading {file_path}...")
        dfs.append(pd.read_csv(file_path))
    return pd.concat(dfs, ignore_index=True)

# Load every CSV in data_dir whose filename starts with "And" into one DataFrame.
traffic_data_and = load_traffic_data("And")

def save_to_parquet(dataframe, output_file):
    """
    Save a DataFrame to a Parquet file (pyarrow engine, index not written).

    The save is best-effort: a failure is reported on stdout rather than
    raised, and the return value tells the caller whether the file was written.

    :param dataframe: pandas DataFrame to save
    :param output_file: Output file path for the Parquet file
    :return: True if the file was written, False if saving failed
    """
    try:
        dataframe.to_parquet(output_file, engine="pyarrow", index=False)
    except Exception as e:
        # Deliberately broad: a missing pyarrow install, bad path or
        # unsupported dtype should all be reported without stopping the run.
        print(f"Error saving to Parquet: {e}")
        return False
    print(f"Data successfully saved to {output_file}")
    return True

# Write the combined "And" traffic data to a Parquet file inside output_dir.
output_file = os.path.join(output_dir, "traffic_data_and.parquet")
save_to_parquet(traffic_data_and, output_file)


In [None]:

# GeoJSON street files to merge into a single GeoDataFrame.
geo_data_files = [
    "/Users/keshavsaraogi/Desktop/freight/Anderlecht_streets.json",
    "/Users/keshavsaraogi/Desktop/freight/Belgium_streets.json",
    "/Users/keshavsaraogi/Desktop/freight/Bruxelles_streets.json",
    "/Users/keshavsaraogi/Desktop/freight/bruxelles.json"
]

# Function to load and combine geospatial data
def combine_geospatial_data(file_list, target_crs="EPSG:4326"):
    """
    Combines multiple GeoJSON files into a single GeoDataFrame.

    Every file is brought to ``target_crs`` *before* concatenation: a file
    without a CRS is labelled with ``target_crs`` (no coordinates change),
    any other file is reprojected to it. Normalising per file means inputs
    that arrive in different CRSs never have to be concatenated as-is.

    :param file_list: List of file paths to GeoJSON files.
    :param target_crs: CRS of the result. Defaults to WGS84 ("EPSG:4326").
    :return: Combined GeoDataFrame, re-indexed from 0.
    :raises ValueError: If ``file_list`` is empty.
    """
    file_list = list(file_list)
    if not file_list:
        raise ValueError("file_list is empty: no geospatial files to combine")

    gdfs = []
    for file in file_list:
        print(f"Loading {file}...")
        gdf = gpd.read_file(file)
        if gdf.crs is None:
            # Default to the target CRS if the file does not declare one
            gdf = gdf.set_crs(target_crs)
        else:
            gdf = gdf.to_crs(target_crs)
        gdfs.append(gdf)

    # Concatenate all (already CRS-consistent) GeoDataFrames
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))
    return combined_gdf

# Combine geospatial data
combined_geo_data = combine_geospatial_data(geo_data_files)

# Save combined geospatial data to a GeoJSON file.
# Written into output_dir (created in the first cell), alongside the Parquet
# output, instead of a placeholder path.
output_file = os.path.join(output_dir, "combined_geospatial_data.geojson")
combined_geo_data.to_file(output_file, driver="GeoJSON")
print(f"Combined geospatial data saved to {output_file}")

# Optional: Plot the combined data
combined_geo_data.plot()


In [None]:
# Data-quality report for the combined GeoDataFrame.
# The frame is bound to `combined_geo_data` at notebook level; `combined_gdf`
# is only a local name inside combine_geospatial_data, so it cannot be used here.
print("Info:")
combined_geo_data.info()  # info() prints its own report and returns None

print("\nMissing Values:")
print(combined_geo_data.isnull().sum())

print("\nCoordinate Reference System (CRS):")
print(combined_geo_data.crs)

print("\nInvalid Geometries:")
print(combined_geo_data[~combined_geo_data.is_valid])
print("\nEmpty Geometries:")
print(combined_geo_data[combined_geo_data.is_empty])

print("\nDuplicate Entries:")
# keep=False flags every member of a duplicate group, not just the repeats
duplicates = combined_geo_data.duplicated(subset=None, keep=False)
print(combined_geo_data[duplicates])

if "geometry" not in combined_geo_data.columns:
    print("\nError: No 'geometry' column found!")
elif combined_geo_data['geometry'].isnull().any():
    print("\nError: Null geometries found!")
else:
    print("\nGeometry column is valid.")

combined_geo_data.plot()

In [None]:
import matplotlib.pyplot as plt

# Data-quality report for the traffic data loaded earlier in the notebook.
# The loaded frame is bound to `traffic_data_and`; no `traffic_data` variable
# is defined anywhere above, so that name cannot be used here.

# 1. Check the structure of the dataset
print("Info:")
traffic_data_and.info()  # info() prints its own report and returns None

# 2. Check for missing values
print("\nMissing Values:")
print(traffic_data_and.isnull().sum())

# 3. Check for duplicate rows (keep=False flags every member of a duplicate group)
print("\nDuplicate Entries:")
duplicates = traffic_data_and.duplicated(keep=False)
print(traffic_data_and[duplicates])

# 4. Check for invalid or outlier values in numerical columns
numerical_columns = traffic_data_and.select_dtypes(include=["float64", "int64"]).columns
print("\nOutlier Statistics (5th and 95th percentiles):")
for col in numerical_columns:
    lower = traffic_data_and[col].quantile(0.05)
    upper = traffic_data_and[col].quantile(0.95)
    print(f"{col}: [{lower}, {upper}]")

# 5. Validate timestamps (convert to datetime and check for issues)
if "timestamp" in traffic_data_and.columns:
    try:
        # errors="coerce" turns unparseable values into NaT so they can be listed below.
        # Note: this converts the column of traffic_data_and in place.
        traffic_data_and["timestamp"] = pd.to_datetime(traffic_data_and["timestamp"], errors="coerce")
        print("\nInvalid Timestamps:")
        print(traffic_data_and[traffic_data_and["timestamp"].isnull()])
    except Exception as e:
        print(f"Error converting timestamps: {e}")
else:
    print("\nWarning: No 'timestamp' column found!")

# 6. Check for negative or zero values in critical columns (e.g., traffic_volume)
if "traffic_volume" in traffic_data_and.columns:
    invalid_traffic = traffic_data_and[traffic_data_and["traffic_volume"] <= 0]
    print("\nInvalid Traffic Volume:")
    print(invalid_traffic)
else:
    print("\nWarning: No 'traffic_volume' column found!")

# 7. Visualize distributions of key metrics (optional)
for col in ["traffic_volume", "average_speed"]:  # Adjust based on your data
    if col in traffic_data_and.columns:
        traffic_data_and[col].plot(kind="hist", bins=50, title=f"Distribution of {col}")
        plt.show()