In [15]:
from IPython.display import display
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Float, String, Integer, DateTime
from geoalchemy2 import Geometry
import requests
import json

# Database configuration
POSTGRES_USER = "user"
POSTGRES_PASSWORD = "password"
POSTGRES_DB = "urban_data"
POSTGRES_HOST = "postgres"
POSTGRES_PORT = "5432"

# Create a connection string
connection_string = f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
engine = create_engine(connection_string)

# Load datasets
link_info_path = '/home/jovyan/work/data/link_info.parquet.gz'
speed_data_path = '/home/jovyan/work/data/duval_jan1_2024.parquet.gz'
link_info_url = 'https://cdn.urbansdk.com/data-engineering-interview/link_info.parquet.gz'
speed_data_url = 'https://cdn.urbansdk.com/data-engineering-interview/duval_jan1_2024.parquet.gz'

# Download dataset func
def download_file(url, local_path):
    response = requests.get(url)
    response.raise_for_status() 
    with open(local_path, 'wb') as file:
        file.write(response.content)

# Fazer o download dos arquivos
download_file(link_info_url, link_info_path)
download_file(speed_data_url, speed_data_path)

link_info_df = pd.read_parquet(link_info_path)
speed_data_df = pd.read_parquet(speed_data_path)

In [16]:
# Define functions to transform data for insertion
def transform_link_info(df):
    # Ensure geo_json is a valid geometry type for PostGIS
    df['geometry'] = df['geo_json'].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
    return df.drop(columns=['geo_json'])  # Drop the geo_json column after transformation

def transform_speed_data(df):
    # Ensure date_time is in a DateTime format
    df['date_time'] = pd.to_datetime(df['date_time'])
    return df

# Transform the datasets
link_info_df = transform_link_info(link_info_df)
speed_data_df = transform_speed_data(speed_data_df)

In [None]:
# Insert data into the existing tables
def insert_data(df, table_name):
    df.to_sql(table_name, engine, if_exists="append", index=False)

# Insert data with transformations applied
insert_data(link_info_df, "link_info")
insert_data(speed_data_df, "speed_data")

print("Data inserted successfully.")

In [5]:
# Apply Coordinates System to DB
from sqlalchemy import text

query = """
UPDATE link_info
SET geometry = ST_SetSRID(geometry, 4326)
WHERE ST_SRID(geometry) IS DISTINCT FROM 4326;
"""

with engine.connect() as connection:
    connection.execute(text(query))
    connection.commit() 

In [17]:
# Load data into DataFrames
link_info_df = pd.read_sql_table("link_info", engine)
speed_data_df = pd.read_sql_table("speed_data", engine)

# Check the first few rows and data types
print("Link Info Data:")
display(link_info_df.head())
print("Speed Data:")
display(speed_data_df.head())

# Summary statistics
print("Summary Statistics - Link Info Data:")
display(link_info_df.describe())
print("Summary Statistics - Speed Data:")
display(speed_data_df.describe())

Link Info Data:


Unnamed: 0,link_id,_length,road_name,usdk_speed_category,funclass_id,speedcat,volume_value,volume_bin_id,volume_year,volumes_bin_description,geometry
0,1148855686,0.027340324,,40,4,2,800,1,2022,0-1999,0105000020E6100000010000000102000000020000000A...
1,1240632857,0.019262501,E 21st St,40,4,5,5000,3,2022,5000-7449,0105000020E610000001000000010200000002000000D8...
2,1240632858,0.021126614,E 21st St,40,4,5,5000,3,2022,5000-7449,0105000020E6100000010000000102000000030000008D...
3,1240474884,0.037903631,University Blvd W,40,4,5,18000,6,2022,15000-19999,0105000020E610000001000000010200000002000000C5...
4,1240959781,0.022369356,,40,4,5,3500,2,2022,2000-4999,0105000020E61000000100000001020000000400000055...


Speed Data:


Unnamed: 0,link_id,date_time,freeflow,count,std_dev,min,max,confidence,average_speed,average_pct_85,average_pct_95,day_of_week,period
0,1295292965,2024-01-01 10:00:00+00:00,44.739,63,9.3,27.962,57.788,40,42.75,50.331,52.817,2,4
1,23058981,2024-01-01 20:00:00+00:00,49.71,27,12.2,41.01,67.729,40,52.63,64.001,67.729,2,7
2,1202899217,2024-01-01 15:00:00+00:00,34.797,13,15.5,8.078,41.632,40,31.1,42.253,45.671,2,5
3,1241183256,2024-01-01 13:00:00+00:00,13.67,6,4.8,8.078,16.777,40,13.359,16.777,16.777,2,5
4,835679519,2024-01-01 15:00:00+00:00,29.826,13,20.8,3.728,46.603,30,32.032,42.564,46.292,2,5


Summary Statistics - Link Info Data:


Unnamed: 0,link_id,usdk_speed_category,funclass_id,speedcat,volume_value,volume_bin_id,volume_year
count,100924.0,100924.0,100924.0,100924.0,100924.0,100924.0,100924.0
mean,766371600.0,34.331576,4.528467,5.537394,4144.457215,2.104088,2022.0
std,556060100.0,7.371569,0.796229,1.024515,7963.697327,1.909518,0.0
min,16981050.0,20.0,1.0,2.0,100.0,1.0,2022.0
25%,23053490.0,30.0,4.0,5.0,200.0,1.0,2022.0
50%,1032972000.0,30.0,5.0,6.0,400.0,1.0,2022.0
75%,1293699000.0,40.0,5.0,6.0,5000.0,3.0,2022.0
max,1334783000.0,70.0,5.0,7.0,60000.0,9.0,2022.0


Summary Statistics - Speed Data:


Unnamed: 0,link_id,freeflow,count,std_dev,min,max,confidence,average_speed,average_pct_85,average_pct_95,day_of_week,period
count,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0,1239946.0
mean,881168600.0,31.18691,30.41918,9.096169,21.83279,41.47413,31.867,32.40514,38.23041,40.19489,2.0,4.352719
std,504509400.0,14.2776,80.93345,6.754173,14.69899,21.09085,10.23361,16.36159,17.98278,19.34773,0.0,2.092258
min,16981050.0,0.497,1.0,0.0,0.621,0.621,10.0,0.621,0.621,0.621,2.0,1.0
25%,721293000.0,22.369,2.0,3.8,9.942,24.233,30.0,19.884,24.233,24.233,2.0,3.0
50%,1188470000.0,29.826,6.0,8.9,19.884,41.632,40.0,32.063,39.146,41.01,2.0,5.0
75%,1282162000.0,39.768,25.0,13.0,32.311,55.923,40.0,43.123,50.331,54.059,2.0,6.0
max,1313710000.0,70.215,2603.0,104.9,154.721,158.45,40.0,154.721,155.343,158.45,2.0,7.0


In [7]:
# Check for missing values
print("Missing values in Link Info Data:")
print(link_info_df.isnull().sum())
print("Missing values in Speed Data:")
print(speed_data_df.isnull().sum())


Missing values in Link Info Data:
link_id                        0
_length                        0
road_name                  10986
usdk_speed_category            0
funclass_id                    0
speedcat                       0
volume_value                   0
volume_bin_id                  0
volume_year                    0
volumes_bin_description        0
geometry                       0
dtype: int64
Missing values in Speed Data:
link_id           0
date_time         0
freeflow          0
count             0
std_dev           0
min               0
max               0
confidence        0
average_speed     0
average_pct_85    0
average_pct_95    0
day_of_week       0
period            0
dtype: int64


In [9]:
# Query to calculate average speed by link_id, period, and day of the week
query = """
SELECT
    s.link_id,
    AVG(s.average_speed) AS avg_speed,
    l.road_name,
    l._length AS length,
    l.geometry AS geo_json,
    s.period
FROM
    speed_data s
JOIN
    link_info l ON s.link_id = l.link_id
GROUP BY
    s.link_id, s.period, s.day_of_week, l.road_name, l._length, l.geometry
"""

# Fetch data into a DataFrame
transformed_df = pd.read_sql_query(query, engine)
transformed_df

Unnamed: 0,link_id,avg_speed,road_name,length,geo_json,period
0,16981048,47.845500,Philips Hwy,0.009320565,0105000020E610000001000000010200000002000000E3...,1
1,16981048,47.804333,Philips Hwy,0.009320565,0105000020E610000001000000010200000002000000E3...,2
2,16981048,45.401333,Philips Hwy,0.009320565,0105000020E610000001000000010200000002000000E3...,3
3,16981048,47.265333,Philips Hwy,0.009320565,0105000020E610000001000000010200000002000000E3...,4
4,16981048,47.970000,Philips Hwy,0.009320565,0105000020E610000001000000010200000002000000E3...,5
...,...,...,...,...,...,...
467275,1313709937,13.981000,Walkers Ridge Dr,0.259733078,0105000020E610000001000000010200000002000000E3...,3
467276,1313709937,5.126500,Walkers Ridge Dr,0.259733078,0105000020E610000001000000010200000002000000E3...,4
467277,1313709937,16.010667,Walkers Ridge Dr,0.259733078,0105000020E610000001000000010200000002000000E3...,5
467278,1313709937,7.011000,Walkers Ridge Dr,0.259733078,0105000020E610000001000000010200000002000000E3...,6


In [10]:
# Query to find  the names of the roads in a given bounding box
query = """
SELECT DISTINCT
    road_name,
    ST_AsText(geometry) AS geometry_text
FROM 
    link_info
WHERE 
    geometry && ST_MakeEnvelope(-81.7, 30.2, -81.5, 30.4, 4326)
AND
    road_name <> 'None';
"""
# Fetch data into a DataFrame
roads_df = pd.read_sql_query(query, engine)
display(roads_df)


Unnamed: 0,road_name,geometry_text
0,10000-2 Gate Ap,"MULTILINESTRING((-81.54144 30.25629,-81.5413 3..."
1,10075-10 Gate Ap,"MULTILINESTRING((-81.53506 30.25922,-81.53525 ..."
2,10075-11 Gate Ap,"MULTILINESTRING((-81.53416 30.25972,-81.53438 ..."
3,10075-12 Gate Ap,"MULTILINESTRING((-81.5348 30.25989,-81.5344 30..."
4,10075-13 Gate Ap,"MULTILINESTRING((-81.53528 30.25988,-81.53491 ..."
...,...,...
35725,Zinnia Ave,"MULTILINESTRING((-81.70034 30.37884,-81.69999 ..."
35726,Zion Rd,"MULTILINESTRING((-81.62542 30.29006,-81.62451 ..."
35727,Zona Ave,"MULTILINESTRING((-81.58229 30.32044,-81.58099 ..."
35728,Zona Ave,"MULTILINESTRING((-81.58305 30.32036,-81.58229 ..."


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Average speed by day of the week
plt.figure(figsize=(10, 6))
sns.boxplot(data=transformed_df, x="period", y="avg_speed")
plt.title("Distribution of Average Speeds by Time Period")
plt.xlabel("Period")
plt.ylabel("Average Speed")
plt.show()

# Average speed by link and period
avg_speed_by_link = transformed_df.groupby("link_id")["avg_speed"].mean()
plt.figure(figsize=(12, 6))
avg_speed_by_link.plot(kind="hist", bins=30)
plt.title("Distribution of Average Speeds across Links")
plt.xlabel("Average Speed")
plt.ylabel("Frequency")
plt.show()


In [None]:
from shapely import wkb
import json
from shapely.geometry import shape

# Convert GeoJSON strings to Shapely geometries
transformed_df['geometry'] = transformed_df['geo_json'].apply(lambda x: shape(json.loads(x)) if x else None)

# Extract coordinates from Shapely geometries
# For MultiLineString geometries, we'll use the centroid for simplicity
transformed_df['coordinates'] = transformed_df['geometry'].apply(lambda geom: geom.centroid.coords[0] if geom else (None, None))

# Separate longitude and latitude
transformed_df['longitude'] = transformed_df['coordinates'].apply(lambda x: x[0])
transformed_df['latitude'] = transformed_df['coordinates'].apply(lambda x: x[1])

# Display the resulting DataFrame
transformed_df[['link_id', 'avg_speed', 'longitude', 'latitude']].head()

# Scatter plot of locations colored by average speed
plt.figure(figsize=(10, 8))
sns.scatterplot(data=transformed_df, x="longitude", y="latitude", hue="avg_speed", palette="coolwarm", size="avg_speed")
plt.title("Spatial Distribution of Links by Average Speed")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()


In [None]:
# Select numeric columns for correlation analysis
corr_features = transformed_df[['avg_speed', 'length', 'longitude', 'latitude']]
corr_matrix = corr_features.corr()

# Display correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Select features for modeling
X = transformed_df[['length', 'longitude', 'latitude', 'period']]
y = transformed_df['avg_speed']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))


In [None]:
# Average speed by period
avg_speed_by_period = transformed_df.groupby('period')['avg_speed'].mean()
print(avg_speed_by_period)

In [None]:
# Convert DataFrame to GeoJSON
geojson_data = {
    "type": "FeatureCollection",
    "features": []
}

for _, row in transformed_df.iterrows():
    # Convert geometry if it's stored as a string
    geometry = json.loads(row["geo_json"]) if isinstance(row["geo_json"], str) else row["geo_json"]

    feature = {
        "type": "Feature",
        "geometry": geometry,
        "properties": {
            "link_id": row["link_id"],
            "avg_speed": row["avg_speed"],
            "road_name": row["road_name"],
            "length": row["length"]
        }
    }
    geojson_data["features"].append(feature)


In [None]:
# Save GeoJSON data to a file for verification
with open("test_geojson.json", "w") as f:
    json.dump(geojson_data, f, indent=2)


In [None]:
from mapboxgl.utils import create_color_stops
from mapboxgl.viz import ChoroplethViz
from IPython.display import IFrame
import json

# Mapbox token
mapbox_token = "pk.eyJ1IjoiZ3VpYm9ydG9sYXNvIiwiYSI6ImNtMndoajR4czA2d2sybXB0enNhc2dtcjAifQ.PHvelUdPZ4tW7WQL0fvNmQ"

# Define color stops for visualization
color_stops = create_color_stops([20, 40, 60, 80], colors=['blue', 'green', 'yellow', 'red'])

# Create the map visualization
viz = ChoroplethViz(
    geojson_data,
    access_token=mapbox_token,
    color_property="avg_speed",
    color_stops=color_stops,
    center=(-81.6556, 30.3322), 
    zoom=10,
    line_stroke="blue"
)

# Save the map visualization to HTML
viz.create_html('map_visualization.html')

# Display the HTML file in an IFrame
IFrame('map_visualization.html', width=700, height=500)


# 1. ST_Distance: Calculate the distance between two geometries
Purpose: Measures the shortest distance between two geometries. This is useful for proximity analysis, such as finding nearby points of interest.

In [None]:

query = """
SELECT 
    a.link_id AS link_a,
    b.link_id AS link_b,
    ST_Distance(a.geometry::geography, b.geometry::geography) AS distance_meters
FROM 
    link_info a, 
    link_info b
WHERE 
    a.link_id < b.link_id
ORDER BY 
    distance_meters
LIMIT 10;
"""

distance_df = pd.read_sql_query(query, engine)
distance_df.head()

# 2. ST_Intersects: Check if two geometries intersect
Purpose: Determines if two geometries share any space. This is useful for filtering geometries that overlap or touch.

In [None]:
query = """
SELECT 
    a.link_id AS link_a,
    b.link_id AS link_b
FROM 
    link_info a, 
    link_info b
WHERE 
    a.link_id < b.link_id
    AND ST_Intersects(a.geometry, b.geometry);
"""

intersect_df = pd.read_sql_query(query, engine)
intersect_df.head()

# 3. ST_Within: Check if one geometry is within another
Purpose: Returns true if one geometry is entirely within another. This is useful for containment analysis, such as finding all features within a defined area.

In [None]:
query = """
SELECT 
    link_id,
    road_name
FROM 
    link_info
WHERE 
    ST_Within(geometry, ST_MakeEnvelope(-81.7, 30.2, -81.5, 30.4, 4326));
"""

geo_within_df = pd.read_sql_query(query, engine)

# 4. ST_Length: Calculate the length of a line geometry
Purpose: Calculates the length of a line geometry, useful for road or path length calculations.

In [18]:
query = """
SELECT 
    link_id,
    road_name,
    ST_Length(geometry::geography) AS length_meters
FROM 
    link_info
ORDER BY 
    length_meters DESC
LIMIT 10;
"""

lenght_df = pd.read_sql_query(query, engine)
display(lenght_df)

SyntaxError: incomplete input (795311176.py, line 1)