In [1]:
import json
import os
from urllib.parse import quote_plus

import pandas as pd
import requests
from geoalchemy2 import Geometry
from sqlalchemy import create_engine
from sqlalchemy.types import Float, String, Integer, DateTime

# Database configuration.
# Each value is read from the environment so that real credentials never have
# to be stored in the notebook; the fallbacks are the original local defaults.
POSTGRES_USER = os.environ.get("POSTGRES_USER", "user")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "password")
POSTGRES_DB = os.environ.get("POSTGRES_DB", "urban_data")
POSTGRES_HOST = os.environ.get("POSTGRES_HOST", "postgres")
POSTGRES_PORT = os.environ.get("POSTGRES_PORT", "5432")

# Create a connection string.
# User and password are percent-encoded because characters such as "@", ":"
# or "/" inside them would otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(POSTGRES_USER)}:{quote_plus(POSTGRES_PASSWORD)}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)
engine = create_engine(connection_string)

# Load datasets.
# DATA_DIR can be overridden through the environment; its default reproduces
# the original absolute paths exactly.
DATA_DIR = os.environ.get("DATA_DIR", "/home/jovyan/work/data")
link_info_path = os.path.join(DATA_DIR, 'link_info.parquet.gz')
speed_data_path = os.path.join(DATA_DIR, 'duval_jan1_2024.parquet.gz')
link_info_url = 'https://cdn.urbansdk.com/data-engineering-interview/link_info.parquet.gz'
speed_data_url = 'https://cdn.urbansdk.com/data-engineering-interview/duval_jan1_2024.parquet.gz'

# Download dataset func
def download_file(url, local_path, timeout=60, chunk_size=1 << 20):
    """Download ``url`` and store the response body at ``local_path``.

    The body is streamed to disk in chunks instead of being held in memory as
    a whole. It is first written to a ``.part`` file next to the destination
    and renamed only once the transfer has finished, so a failed download does
    not leave a truncated file at ``local_path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        local_path: Destination file path; a missing parent directory is created.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled server would block the notebook indefinitely.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{local_path}.part"
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

    # Atomic on the same filesystem: readers see either the old file or the
    # complete new one, never a half-written download.
    os.replace(partial_path, local_path)

# Download the dataset files.
# A file that already exists locally is not fetched again, so re-running the
# notebook does not repeat the network transfer; delete the local file to
# force a fresh download.
for source_url, target_path in (
    (link_info_url, link_info_path),
    (speed_data_url, speed_data_path),
):
    if not os.path.exists(target_path):
        download_file(source_url, target_path)

# Read both parquet files into DataFrames.
link_info_df = pd.read_parquet(link_info_path)
speed_data_df = pd.read_parquet(speed_data_path)

In [2]:
# Define functions to transform data for insertion
def transform_link_info(df):
    """Return a copy of ``df`` with ``geo_json`` replaced by a ``geometry`` column.

    Every ``geo_json`` value that is a ``dict`` is serialized with
    ``json.dumps``; any other value is carried over unchanged. The input frame
    is not modified (previously a ``geometry`` column was added to the
    caller's frame as a side effect).

    Args:
        df: DataFrame containing a ``geo_json`` column.

    Returns:
        A new DataFrame without ``geo_json`` and with a ``geometry`` column,
        appended at the end when ``df`` did not already have one.

    Raises:
        KeyError: If ``df`` has no ``geo_json`` column.
    """
    geometry = df['geo_json'].apply(
        lambda value: json.dumps(value) if isinstance(value, dict) else value
    )
    return df.drop(columns=['geo_json']).assign(geometry=geometry)

def transform_speed_data(df):
    """Return a copy of ``df`` whose ``date_time`` column holds datetimes.

    The column is converted with ``pd.to_datetime``. The input frame is not
    modified (previously the conversion overwrote the caller's column).

    Args:
        df: DataFrame containing a ``date_time`` column.

    Returns:
        A new DataFrame with the same columns in the same order and
        ``date_time`` converted.

    Raises:
        KeyError: If ``df`` has no ``date_time`` column.
    """
    return df.assign(date_time=pd.to_datetime(df['date_time']))

# Transform the datasets.
# transform_link_info drops 'geo_json', so it is applied only while that
# column is still present; this lets the cell be re-run without a KeyError.
if 'geo_json' in link_info_df.columns:
    link_info_df = transform_link_info(link_info_df)
speed_data_df = transform_speed_data(speed_data_df)

In [None]:
# Insert data into the existing tables
def insert_data(df, table_name, chunksize=10_000):
    """Append the rows of ``df`` to the database table ``table_name``.

    The write goes through the module-level ``engine``. ``if_exists="append"``
    adds the rows to an existing table, and ``index=False`` keeps the
    DataFrame index out of the table.

    Args:
        df: DataFrame to write.
        table_name: Name of the target table.
        chunksize: Number of rows written per batch. Batching avoids building
            a single insert for the whole frame; pass ``None`` to restore the
            previous all-at-once behaviour.
    """
    df.to_sql(
        table_name,
        engine,
        if_exists="append",
        index=False,
        chunksize=chunksize,
    )

# Write each transformed frame to its table: link metadata first, then the
# speed records.
for frame, table in ((link_info_df, "link_info"), (speed_data_df, "speed_data")):
    insert_data(frame, table)

print("Data inserted successfully.")

In [None]:
# Read both tables back from the database.
link_info_df = pd.read_sql_table("link_info", engine)
speed_data_df = pd.read_sql_table("speed_data", engine)

# Preview the first rows of each table.
previews = (
    ("Link Info Data:", link_info_df),
    ("Speed Data:", speed_data_df),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())

# Descriptive statistics for each table.
summaries = (
    ("Summary Statistics - Link Info Data:", link_info_df),
    ("Summary Statistics - Speed Data:", speed_data_df),
)
for heading, frame in summaries:
    print(heading)
    display(frame.describe())

In [None]:
# Report the number of null entries per column for each table.
for heading, frame in (
    ("Missing values in Link Info Data:", link_info_df),
    ("Missing values in Speed Data:", speed_data_df),
):
    print(heading)
    print(frame.isna().sum())


In [None]:
# Average speed per link, period and day of the week, joined with the link's
# road name, length and geometry.
# day_of_week is part of the GROUP BY, so it is selected as well; without it,
# rows that differ only by day of the week cannot be told apart in the result.
query = """
SELECT
    s.link_id,
    AVG(s.average_speed) AS avg_speed,
    l.road_name,
    l._length AS length,
    l.geometry AS geo_json,
    s.period,
    s.day_of_week
FROM
    speed_data s
JOIN
    link_info l ON s.link_id = l.link_id
GROUP BY
    s.link_id, s.period, s.day_of_week, l.road_name, l._length, l.geometry
"""

# Fetch data into a DataFrame and preview the first rows.
transformed_df = pd.read_sql_query(query, engine)
transformed_df.head()

In [None]:
# Query the distinct (road_name, geometry-as-text) pairs of the links that
# match a fixed bounding box, leaving out rows whose road_name is the text
# 'None'.
# NOTE(review): `&&` is the PostGIS bounding-box overlap operator, so the
# filter compares bounding boxes rather than exact shapes - confirm that is
# the intended test.
# NOTE(review): the ST_MakeEnvelope arguments are presumably
# (xmin, ymin, xmax, ymax, srid), i.e. longitude/latitude in SRID 4326 - verify.
query = """
SELECT DISTINCT
    road_name,
    ST_AsText(geometry) AS geometry_text
FROM 
    link_info
WHERE 
    geometry && ST_MakeEnvelope(-81.7, 30.2, -81.5, 30.4, 4326)
AND
    road_name <> 'None';
"""
# Run the query into a DataFrame and show its last rows.
roads_df = pd.read_sql_query(query, engine)
roads_df.tail()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot: spread of the aggregated average speeds within each time period.
fig_period, ax_period = plt.subplots(figsize=(10, 6))
sns.boxplot(data=transformed_df, x="period", y="avg_speed", ax=ax_period)
ax_period.set_title("Distribution of Average Speeds by Time Period")
ax_period.set_xlabel("Period")
ax_period.set_ylabel("Average Speed")
plt.show()

# Histogram: one value per link, the mean of that link's avg_speed rows.
avg_speed_by_link = transformed_df.groupby("link_id")["avg_speed"].mean()
fig_links, ax_links = plt.subplots(figsize=(12, 6))
avg_speed_by_link.plot(kind="hist", bins=30, ax=ax_links)
ax_links.set_title("Distribution of Average Speeds across Links")
ax_links.set_xlabel("Average Speed")
ax_links.set_ylabel("Frequency")
plt.show()


In [None]:
from shapely import wkb
import json
from shapely.geometry import shape

def _geo_json_to_shape(geo_json):
    """Build a Shapely geometry from a GeoJSON string or dict.

    Returns ``None`` for empty values and for anything that is neither a
    string nor a dict (for example a null read from the database).
    """
    if not isinstance(geo_json, (str, dict)) or not geo_json:
        return None
    if isinstance(geo_json, str):
        geo_json = json.loads(geo_json)
    return shape(geo_json)


def _centroid_lon_lat(geom):
    """Return the first coordinate tuple of ``geom``'s centroid.

    Returns ``(None, None)`` when there is no geometry or it is empty.
    """
    if geom is None or geom.is_empty:
        return (None, None)
    return geom.centroid.coords[0]


# Convert GeoJSON values to Shapely geometries. Both text and already-decoded
# dicts are accepted, matching how the GeoJSON export cell treats this column.
transformed_df['geometry'] = transformed_df['geo_json'].apply(_geo_json_to_shape)

# Extract coordinates from Shapely geometries.
# For MultiLineString geometries, the centroid is used for simplicity.
transformed_df['coordinates'] = transformed_df['geometry'].apply(_centroid_lon_lat)

# Separate longitude and latitude.
transformed_df['longitude'] = transformed_df['coordinates'].apply(lambda x: x[0])
transformed_df['latitude'] = transformed_df['coordinates'].apply(lambda x: x[1])

# Display the resulting DataFrame. display() is required here: a bare
# expression is only rendered when it is the last statement of a cell.
display(transformed_df[['link_id', 'avg_speed', 'longitude', 'latitude']].head())

# Scatter plot of locations colored by average speed
plt.figure(figsize=(10, 8))
sns.scatterplot(data=transformed_df, x="longitude", y="latitude", hue="avg_speed", palette="coolwarm", size="avg_speed")
plt.title("Spatial Distribution of Links by Average Speed")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()


In [None]:
# Pairwise correlations between speed, link length and centroid position.
corr_features = transformed_df.loc[:, ['avg_speed', 'length', 'longitude', 'latitude']]
corr_matrix = corr_features.corr()

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale.
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, ax=ax_corr)
ax_corr.set_title("Correlation Matrix")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Select features for modeling.
# Incomplete rows are dropped first: links without a geometry end up with a
# null longitude/latitude, and LinearRegression does not accept missing values.
model_df = transformed_df[['length', 'longitude', 'latitude', 'period', 'avg_speed']].dropna()

# 'period' is used as a grouping label elsewhere in this notebook, so it is
# one-hot encoded instead of being fed to the model as a raw value. The first
# level is dropped because the model already fits an intercept.
# NOTE(review): confirm 'period' really is a category rather than a quantity.
X = pd.get_dummies(
    model_df[['length', 'longitude', 'latitude', 'period']],
    columns=['period'],
    drop_first=True,
).astype(float)
y = model_df['avg_speed'].astype(float)

# Split into train and test sets (fixed seed for reproducible results)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))


In [None]:
# Mean of avg_speed within each period, printed as a Series indexed by period.
period_key = transformed_df['period']
avg_speed_by_period = transformed_df['avg_speed'].groupby(period_key).mean()
print(avg_speed_by_period)

In [None]:
def _feature_from_row(row):
    """Build one GeoJSON Feature dict from a row of ``transformed_df``."""
    raw_geometry = row["geo_json"]
    # Geometry stored as text is decoded; any other value is passed through.
    if isinstance(raw_geometry, str):
        raw_geometry = json.loads(raw_geometry)
    return {
        "type": "Feature",
        "geometry": raw_geometry,
        "properties": {
            key: row[key]
            for key in ("link_id", "avg_speed", "road_name", "length")
        },
    }


# Convert the DataFrame to a GeoJSON FeatureCollection, one Feature per row.
geojson_data = {
    "type": "FeatureCollection",
    "features": [_feature_from_row(row) for _, row in transformed_df.iterrows()],
}


In [None]:
# Write the feature collection to disk, indented, so it can be inspected.
with open("test_geojson.json", "w") as out_file:
    out_file.write(json.dumps(geojson_data, indent=2))


In [None]:
from mapboxgl.utils import create_color_stops
from mapboxgl.viz import ChoroplethViz
from IPython.display import IFrame
import json

# Mapbox token.
# Read from the MAPBOX_ACCESS_TOKEN environment variable so the token does not
# have to be stored in the notebook. The previous literal is kept only as a
# fallback so existing setups keep working.
# NOTE(review): rotate this token and remove the fallback once the variable
# is set wherever the notebook runs.
mapbox_token = os.environ.get(
    "MAPBOX_ACCESS_TOKEN",
    "pk.eyJ1IjoiZ3VpYm9ydG9sYXNvIiwiYSI6ImNtMndoajR4czA2d2sybXB0enNhc2dtcjAifQ.PHvelUdPZ4tW7WQL0fvNmQ",
)

# Define color stops for visualization: avg_speed breakpoints mapped to colors
color_stops = create_color_stops([20, 40, 60, 80], colors=['blue', 'green', 'yellow', 'red'])

# Create the map visualization, colored by each feature's avg_speed property.
# NOTE(review): the link geometries are described above as MultiLineString;
# confirm ChoroplethViz renders line features as intended.
viz = ChoroplethViz(
    geojson_data,
    access_token=mapbox_token,
    color_property="avg_speed",
    color_stops=color_stops,
    center=(-81.6556, 30.3322),
    zoom=10,
    line_stroke="blue"
)

# Save the map visualization to HTML
viz.create_html('map_visualization.html')

# Display the HTML file in an IFrame
IFrame('map_visualization.html', width=700, height=500)
