In [None]:
# Install required libraries
!pip install kagglehub pyspark scikit-learn plotly pandas matplotlib

# Step 1: Import Required Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import kagglehub

# Step 2: Download Dataset from KaggleHub
# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download("vaishalij/san-francisco-caltrain-uber-movement-data")
print("Path to dataset files:", path)

# Step 3: List All Files in the Dataset Directory
files = os.listdir(path)
print("Files in dataset directory:", files)

# Pick the CSV to load. os.listdir() returns entries in arbitrary order, so the
# candidates are sorted to make the choice deterministic, and an explicit error
# replaces the bare IndexError the script would otherwise hit when the
# directory contains no CSV at all.
csv_files = sorted(file for file in files if file.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
file_name = csv_files[0]
data_path = os.path.join(path, file_name)

# Step 4: Load the Dataset
data = pd.read_csv(data_path)

# Print dataset column names to debug
print("Column names in the dataset:")
print(data.columns)

# Debugging: Print first few rows to understand the dataset structure
print("First few rows of the dataset:")
print(data.head())

# Step 5: Data Preprocessing
# Keep only the columns used downstream, drop rows with missing values, and
# give the two verbose columns short snake_case names.
wanted_columns = [
    'Origin Movement ID', 'Origin Display Name', 'Destination Movement ID',
    'Destination Display Name', 'Date Range', 'Mean Travel Time (Seconds)',
]
short_names = {
    'Mean Travel Time (Seconds)': 'travel_time',
    'Date Range': 'date_range',
}

try:
    # Show what is actually available before selecting, to ease debugging
    print("Available columns:", data.columns.tolist())

    # Selecting a column that does not exist is what raises KeyError here
    data = data[wanted_columns].dropna().rename(columns=short_names)
except KeyError as e:
    print(f"KeyError: {e}")
    print("The specified columns do not exist in the dataset. Please update the column names based on the dataset structure.")
    raise
else:
    # date_range is left as the raw string; it is parsed later where needed
    print("Cleaned data:")
    print(data.head())

# Step 6: Load Data into PySpark
# Build (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("CAV Big Data Analytics")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Hand the cleaned pandas frame over to Spark and inspect the inferred schema
spark_df = spark.createDataFrame(data)
spark_df.printSchema()

# Example aggregate: mean travel time for each origin
trips_by_origin = spark_df.groupBy("Origin Display Name")
avg_travel_time = trips_by_origin.avg("travel_time")
print("Average travel time by origin:")
avg_travel_time.show()

# Step 7: Feature Engineering
# Take the text before the first ' - ' in date_range, parse it as a datetime,
# and keep its hour component as a feature.
# NOTE(review): if that leading part carries no time of day, 'hour' will be 0
# for every row and adds no signal -- confirm against the raw column values.
period_start = data['date_range'].str.split(' - ').str[0]
data['hour'] = pd.to_datetime(period_start).dt.hour

# Features: hour plus origin/destination IDs; label: travel_time
feature_columns = ['hour', 'Origin Movement ID', 'Destination Movement ID']
X = data[feature_columns]
y = data['travel_time']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Feature engineering completed.")

# Step 8: Train Machine Learning Model
# Fit an ordinary least-squares regressor on the training split
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Report error and explained variance on the test split
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Model evaluation results:")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Step 9: Visualization of Predictions
import joblib

try:
    # Attach predictions for plotting. assign() returns a new frame, so the
    # slice produced by train_test_split is not mutated in place (the in-place
    # column assignment is what triggers pandas' SettingWithCopyWarning).
    X_test = X_test.assign(predicted_travel_time=y_pred)

    # Plotting the predictions
    fig = px.scatter(X_test, x='hour', y='predicted_travel_time',
                     color='predicted_travel_time',
                     title="Predicted Travel Time by Hour",
                     labels={'predicted_travel_time': 'Predicted Travel Time (Seconds)', 'hour': 'Hour of Day'})
    fig.show()

    # Optional: Save the model for future use
    joblib.dump(model, 'traffic_model.pkl')
    print("Model saved successfully!")
finally:
    # Stop the Spark session even if plotting or saving fails, so the session
    # is not left running.
    spark.stop()

Downloading from https://www.kaggle.com/api/v1/datasets/download/vaishalij/san-francisco-caltrain-uber-movement-data?dataset_version_number=1...


100%|██████████| 35.6k/35.6k [00:00<00:00, 8.96MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/vaishalij/san-francisco-caltrain-uber-movement-data/versions/1
Files in dataset directory: ['Travel_Times.csv']
Column names in the dataset:
Index(['Origin Movement ID', 'Origin Display Name', 'Destination Movement ID',
       'Destination Display Name', 'Date Range', 'Mean Travel Time (Seconds)',
       'Range - Lower Bound Travel Time (Seconds)',
       'Range - Upper Bound Travel Time (Seconds)'],
      dtype='object')
First few rows of the dataset:
   Origin Movement ID                                Origin Display Name  \
0                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
1                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
2                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
3                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
4                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   

   Destinat




root
 |-- Origin Movement ID: long (nullable = true)
 |-- Origin Display Name: string (nullable = true)
 |-- Destination Movement ID: long (nullable = true)
 |-- Destination Display Name: string (nullable = true)
 |-- date_range: string (nullable = true)
 |-- travel_time: long (nullable = true)

Average travel time by origin:
+--------------------+-----------------+
| Origin Display Name| avg(travel_time)|
+--------------------+-----------------+
|CALTRAIN MAIN STA...|1702.336690647482|
+--------------------+-----------------+

Feature engineering completed.
Model evaluation results:
Mean Squared Error: 586222.6839711671
R^2 Score: 0.07239881340910104


Model saved successfully!


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px

# Load the dataset
# NOTE: this is an absolute path into the kagglehub cache of the machine the
# notebook was run on; it must exist before this cell is executed.
file_path = '/root/.cache/kagglehub/datasets/vaishalij/san-francisco-caltrain-uber-movement-data/versions/1/Travel_Times.csv'
data = pd.read_csv(file_path)

# Display the first few rows and available columns
print("Column names in the dataset:\n", data.columns)
print("First few rows of the dataset:\n", data.head())

# Feature extraction from the "Date Range" column
data['start_date'] = data['Date Range'].str.split(' - ').str[0]  # Extract start date
data['day_of_week'] = pd.to_datetime(data['start_date']).dt.dayofweek  # Day of the week

# Simulated hour feature. It is random noise, not derived from the data; the
# seed makes the simulated column (and therefore the model metrics) repeatable
# from run to run.
np.random.seed(42)
data['hour_of_day'] = np.random.randint(0, 24, data.shape[0])

# Select and rename the relevant columns, then drop incomplete rows. Chaining
# the non-inplace calls yields a fresh frame instead of mutating a
# column-subset slice in place (which pandas warns about).
data = (
    data[['Origin Movement ID', 'Destination Movement ID', 'day_of_week', 'hour_of_day', 'Mean Travel Time (Seconds)']]
    .rename(columns={'Mean Travel Time (Seconds)': 'travel_time'})
    .dropna()
)

# Display cleaned data (printed after dropna so it shows the rows actually used)
print("Cleaned data:\n", data.head())

# Split the data into features (X) and target (y)
feature_columns = ['Origin Movement ID', 'Destination Movement ID', 'day_of_week', 'hour_of_day']
X = data[feature_columns]
y = data['travel_time']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_model.predict(X_test)

# Report error and explained variance on the test split
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

# Add actual and predicted values to the test set for visualization.
# assign() builds a new frame rather than writing columns onto the slice
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
X_test = X_test.assign(
    actual_travel_time=y_test,
    predicted_travel_time=y_pred,
)

# Visualization: Predicted travel time by hour of day
fig = px.scatter(
    X_test,
    x='hour_of_day',
    y='predicted_travel_time',
    color='predicted_travel_time',
    title="Predicted Travel Time by Hour",
    labels={'hour_of_day': "Hour of Day", 'predicted_travel_time': "Predicted Travel Time (Seconds)"},
    color_continuous_scale='Viridis'
)
fig.show()

# Visualization: 3D scatter plot for additional insights
fig_3d = px.scatter_3d(
    X_test,
    x='hour_of_day',
    y='day_of_week',
    z='predicted_travel_time',
    color='predicted_travel_time',
    title="Predicted Travel Time by Hour and Day",
    labels={'hour_of_day': "Hour of Day", 'day_of_week': "Day of Week", 'predicted_travel_time': "Predicted Travel Time (Seconds)"},
    color_continuous_scale='Viridis'
)
fig_3d.show()

# Save the model (optional)
import joblib
joblib.dump(rf_model, "travel_time_model.pkl")
print("Model saved successfully!")

Column names in the dataset:
 Index(['Origin Movement ID', 'Origin Display Name', 'Destination Movement ID',
       'Destination Display Name', 'Date Range', 'Mean Travel Time (Seconds)',
       'Range - Lower Bound Travel Time (Seconds)',
       'Range - Upper Bound Travel Time (Seconds)'],
      dtype='object')
First few rows of the dataset:
    Origin Movement ID                                Origin Display Name  \
0                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
1                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
2                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
3                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
4                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   

   Destination Movement ID Destination Display Name  \
0                     1315            609692 (1315)   
1                     1320            609694 (1320)   
2                     1369     

Model saved successfully!


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import datetime

# Load the dataset
# NOTE: this is an absolute path into the kagglehub cache of the machine the
# notebook was run on; it must exist before this cell is executed.
file_path = '/root/.cache/kagglehub/datasets/vaishalij/san-francisco-caltrain-uber-movement-data/versions/1/Travel_Times.csv'
data = pd.read_csv(file_path)

# Display the first few rows and available columns
print("Column names in the dataset:\n", data.columns)
print("First few rows of the dataset:\n", data.head())

# Seed the global NumPy RNG BEFORE the first random draw so that every
# simulated column below, including hour_of_day, is reproducible.
np.random.seed(42)
n_rows = data.shape[0]

# Feature extraction from the "Date Range" column
data['start_date'] = data['Date Range'].str.split(' - ').str[0]  # Extract start date
data['day_of_week'] = pd.to_datetime(data['start_date']).dt.dayofweek  # Day of the week
data['hour_of_day'] = np.random.randint(0, 24, n_rows)  # Simulated hour feature

# Simulating additional features: Origin and Destination Coordinates
# These are random placeholders, not real locations. np.random.uniform requires
# low <= high (its behavior is documented as undefined otherwise), so the
# longitude bounds are given in ascending order: -123.0 is the lower bound.
data['origin_lat'] = np.random.uniform(37.5, 37.9, n_rows)  # Random latitudes for origin
data['origin_lon'] = np.random.uniform(-123.0, -122.5, n_rows)  # Random longitudes for origin
data['dest_lat'] = np.random.uniform(37.5, 37.9, n_rows)  # Random latitudes for destination
data['dest_lon'] = np.random.uniform(-123.0, -122.5, n_rows)  # Random longitudes for destination

# Simulating weather conditions (e.g., temperature, precipitation, and wind speed)
data['weather_temp'] = np.random.uniform(15, 30, n_rows)  # Temperature in Celsius
data['weather_precip'] = np.random.uniform(0, 10, n_rows)  # Precipitation in mm
data['weather_wind'] = np.random.uniform(0, 30, n_rows)  # Wind speed in km/h

# Simulating special events/holidays
# Simplified rule: dayofweek 5 and 6 (the weekend) count as holidays
data['is_holiday'] = data['day_of_week'].isin([5, 6]).astype(int)

# Select the relevant columns and drop incomplete rows. Using the non-inplace
# dropna() on the selection yields a fresh frame instead of mutating a
# column-subset slice in place (which pandas warns about).
data = data[['Origin Movement ID', 'Destination Movement ID', 'day_of_week', 'hour_of_day', 'origin_lat', 'origin_lon',
             'dest_lat', 'dest_lon', 'weather_temp', 'weather_precip', 'weather_wind', 'is_holiday', 'Mean Travel Time (Seconds)']].dropna()

# Split the data into features (X) and target (y)
target_column = 'Mean Travel Time (Seconds)'
X = data.drop(columns=[target_column])
y = data[target_column]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model 1: Random Forest Regressor for comparison
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Report error and explained variance for the Random Forest model
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)
print(f"Random Forest Model - Mean Squared Error: {mse_rf}")
print(f"Random Forest Model - R² Score: {r2_rf}")

# Visualization: Predicted travel time by hour of day
# Put the predictions in a named column of a separate plotting frame. The
# `labels` mapping is keyed by column name, so passing raw arrays for y/color
# left the 'predicted_travel_time' label unused. X_test itself is not modified
# because it is still needed as model input afterwards.
rf_plot_df = X_test.assign(predicted_travel_time=y_pred_rf)
fig = px.scatter(
    rf_plot_df,
    x='hour_of_day',
    y='predicted_travel_time',
    color='predicted_travel_time',
    title="Predicted Travel Time by Hour",
    labels={'hour_of_day': "Hour of Day", 'predicted_travel_time': "Predicted Travel Time (Seconds)"},
    color_continuous_scale='Viridis'
)
fig.show()

# Model 2: Neural Network (Keras/TensorFlow)
# Declare the input shape with an explicit Input object as the first entry.
# Passing `input_dim` to the first Dense layer is deprecated and makes Keras
# emit a "Do not pass an `input_shape`/`input_dim` argument" warning.
n_features = X_train.shape[1]
nn_model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])

# Compile the model
nn_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the Neural Network
# NOTE(review): the features are fed unscaled; consider standardizing them
# before training if the loss plateaus at a high value.
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Predict using the Neural Network model
y_pred_nn = nn_model.predict(X_test)

# Evaluate the Neural Network model
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)
print(f"Neural Network Model - Mean Squared Error: {mse_nn}")
print(f"Neural Network Model - R² Score: {r2_nn}")

# Visualize Neural Network Predictions
# Put the flattened (1D) predictions in a named column of a separate plotting
# frame. The `labels` mapping is keyed by column name, so passing raw arrays
# for y/color left the 'predicted_travel_time' label unused.
nn_plot_df = X_test.assign(predicted_travel_time=y_pred_nn.flatten())
fig_nn = px.scatter(
    nn_plot_df,
    x='hour_of_day',
    y='predicted_travel_time',
    color='predicted_travel_time',
    title="Neural Network Predicted Travel Time by Hour",
    labels={'hour_of_day': "Hour of Day", 'predicted_travel_time': "Predicted Travel Time (Seconds)"},
    color_continuous_scale='Viridis'
)
fig_nn.show()

# Save the models
joblib.dump(rf_model, "rf_travel_time_model.pkl")
# NOTE(review): recent Keras releases treat HDF5 (.h5) as a legacy format and
# recommend the native `.keras` extension; the file name is kept unchanged here
# so existing consumers of this artifact keep working -- confirm before switching.
nn_model.save('nn_travel_time_model.h5')
print("Models saved successfully!")

Column names in the dataset:
 Index(['Origin Movement ID', 'Origin Display Name', 'Destination Movement ID',
       'Destination Display Name', 'Date Range', 'Mean Travel Time (Seconds)',
       'Range - Lower Bound Travel Time (Seconds)',
       'Range - Upper Bound Travel Time (Seconds)'],
      dtype='object')
First few rows of the dataset:
    Origin Movement ID                                Origin Display Name  \
0                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
1                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
2                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
3                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   
4                3863  CALTRAIN MAIN STATION, SF, 700 4th St, San Fra...   

   Destination Movement ID Destination Display Name  \
0                     1315            609692 (1315)   
1                     1320            609694 (1320)   
2                     1369     


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Epoch 1/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 1927139.1250
Epoch 2/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 519998.5625
Epoch 3/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 388769.2188
Epoch 4/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 323328.3438
Epoch 5/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 274588.2500
Epoch 6/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 233193.4844
Epoch 7/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 213865.5469
Epoch 8/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 203343.8906
Epoch 9/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 199503.2656
Epoch 10/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[



Models saved successfully!
