In [1]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from data import *
from model import *

taxi_type = GREEN
train_sql = getSqlForModeling(taxi_type, isTrain=True)
test_sql = getSqlForModeling(taxi_type, isTrain=False)

scaler = StandardScaler()

model = Sequential([
    Dense(64, input_dim=len(FEATURES), activation="relu"),  # Input layer + first hidden layer
    Dropout(0.2),  # Dropout for regularization
    Dense(32, activation="relu"),  # Second hidden layer
    Dropout(0.2),  # Dropout for regularization
    Dense(16, activation="relu"),  # Third hidden layer
    Dense(1)  # Output layer for regression
])

model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Sample Test
with DR.engn.connect() as conn:
    for df in pd.read_sql(test_sql, conn, chunksize=CHUNK_SIZE):
        sample_X_test = df[FEATURES]
        sample_y_test = df[VARIABLE]
        break

# Training
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

with DR.engn.connect() as conn:
    processed = 0
    for df in pd.read_sql(train_sql, conn, chunksize=CHUNK_SIZE):
        processed += len(df)
        X = df[FEATURES]
        y = df[VARIABLE]

        X_train_scaled = scaler.fit_transform(X)
        X_test_scaled = scaler.transform(sample_X_test)

        model.fit(
            X_train_scaled, y,
            validation_data=(X_test_scaled, sample_y_test),
            epochs=100,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=1
        )
        O.out(f'Trained {processed} rows')

# Predictions
with DR.engn.connect() as conn:
    processed = 0
    y_test = []
    y_pred = []
    for df in pd.read_sql(test_sql, conn, chunksize=CHUNK_SIZE):
        processed += len(df)
        X = df[FEATURES]
        y_test.append(df[VARIABLE])

        X_test_scaled = scaler.transform(X)

        y_p = model.predict(X_test_scaled)
        y_pred.append(y_p.flatten())
        O.out(f'Predicted {processed} rows')

# Performance
merged_y_test = pd.concat(y_test, ignore_index=True)
merged_y_pred = np.concatenate(y_pred)

showPerformance(merged_y_test, merged_y_pred)



sql engine ready
Epoch 1/100
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1ms/step - loss: 8.7965 - mae: 1.2926 - val_loss: 8187.0884 - val_mae: 89.2745
Epoch 2/100
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - loss: 3.8295 - mae: 0.7071 - val_loss: 15065.6787 - val_mae: 121.4119
Epoch 3/100
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - loss: 3.5588 - mae: 0.6553 - val_loss: 12750.2393 - val_mae: 111.5926
Epoch 4/100
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 1ms/step - loss: 3.3866 - mae: 0.6234 - val_loss: 15435.0518 - val_mae: 122.9528
Epoch 5/100
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - loss: 3.2955 - mae: 0.6089 - val_loss: 11832.7666 - val_mae: 107.6758
Epoch 6/100
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - loss: 3.3851 - mae: 0.6058 - val_loss: 17664.1992 - val_mae: 131.7482
E

In [2]:
from model import *

data = readData(taxi_type=GREEN)

sql engine ready
1675331
1011017
802356
757206


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

X = data[FEATURES_SET1]
y = data[VARIABLE]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# X_train_scaled.shape[1]
len(FEATURES_SET1)

8

In [None]:
# Define the neural network architecture
model = Sequential([
    Dense(64, input_dim=X_train_scaled.shape[1], activation="relu"),  # Input layer + first hidden layer
    Dropout(0.2),  # Dropout for regularization
    Dense(32, activation="relu"),  # Second hidden layer
    Dropout(0.2),  # Dropout for regularization
    Dense(16, activation="relu"),  # Third hidden layer
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer="adam", loss="mse", metrics=["mae"])


In [5]:
# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/100
[1m106148/106148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 2ms/step - loss: 26.6261 - mae: 2.7059 - val_loss: 23.3239 - val_mae: 2.3512
Epoch 2/100
[1m106148/106148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 2ms/step - loss: 20.2680 - mae: 2.1882 - val_loss: 22.0766 - val_mae: 2.2526
Epoch 3/100
[1m106148/106148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 2ms/step - loss: 19.8041 - mae: 2.1372 - val_loss: 22.9811 - val_mae: 2.3695
Epoch 4/100
[1m106148/106148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 2ms/step - loss: 19.4772 - mae: 2.1123 - val_loss: 24.6057 - val_mae: 2.4823
Epoch 5/100
[1m106148/106148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 2ms/step - loss: 19.3902 - mae: 2.0917 - val_loss: 24.6416 - val_mae: 2.5117
Epoch 6/100
[1m106148/106148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 2ms/step - loss: 19.2847 - mae: 2.0823 - val_loss: 22.2330 - val_mae: 2.4166
Epoch 7/100
[1m106148

In [None]:
# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=1)
print(f"Test Mean Absolute Error: {test_mae:.2f}")

# Predict on the test set
y_pred = model.predict(X_test_scaled)

[1m26537/26537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 1ms/step - loss: 22.1325 - mae: 2.2476
Test Mean Absolute Error: 2.25
[1m26537/26537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import plotly.graph_objects as go

y_pred = y_pred.flatten()

# Calculate Model Accuracy Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Model Accuracy Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (Accuracy): {r2:.2f}")

# Limit data to 150 samples for visualization
samples_to_plot = 150
y_test_limited = y_test[:samples_to_plot].reset_index(drop=True)
y_pred_limited = y_pred[:samples_to_plot]

# Create Line Chart with Plotly
fig = go.Figure()

# Actual Values
fig.add_trace(go.Scatter(
    x=y_test_limited.index, 
    y=y_test_limited, 
    mode="lines+markers", 
    name="Actual",
    line=dict(color="blue")
))

# Predicted Values
fig.add_trace(go.Scatter(
    x=y_test_limited.index, 
    y=y_pred_limited, 
    mode="lines+markers", 
    name="Predicted",
    line=dict(color="orange")
))

# Customize Layout
fig.update_layout(
    title="Actual vs Predicted Fare Amount (Limited to 150 Samples)",
    xaxis_title="Sample Index",
    yaxis_title="Fare Amount ($)",
    legend=dict(x=0.5, y=1.15, xanchor="center", yanchor="top"),
    template="plotly_white"
)

# Show the plot
fig.show()


Model Accuracy Metrics:
Mean Absolute Error (MAE): 2.25
Root Mean Squared Error (RMSE): 4.70
R-squared (Accuracy): 0.85
