In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Load the dataset
# Assuming 'data.csv' is the name of your dataset
df = pd.read_csv('dataset.csv')

# Feature engineering
df['start_datetime'] = pd.to_datetime(df['start_date'] + ' ' + df['start_time_hour'].astype(str) + ':' + df['start_time_minute'].astype(str))
df['end_datetime'] = pd.to_datetime(df['end_date'] + ' ' + df['end_time_hour'].astype(str) + ':' + df['end_time_minute'].astype(str))
df['trip_duration_minutes'] = (df['end_datetime'] - df['start_datetime']).dt.total_seconds() / 60.0

# Aggregating demand by time
demand_data = df.groupby('start_datetime')['tripDistance'].count().reset_index()
demand_data.columns = ['timestamp', 'demand']

# Resample to 15-minute intervals
demand_data = demand_data.set_index('timestamp').resample('15T').sum().reset_index()

# Feature selection
demand_data['hour'] = demand_data['timestamp'].dt.hour
demand_data['minute'] = demand_data['timestamp'].dt.minute
X = demand_data[['hour', 'minute']]
y = demand_data['demand']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Linear Regression model
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# XGBoost model
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

# ARIMA model
model_arima = ARIMA(y_train, order=(5, 1, 0))
model_arima_fit = model_arima.fit()

# Deep Neural Network model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_dnn = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500)
model_dnn.fit(X_train_scaled, y_train)

# Random Forest model
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

mse_lr = evaluate_model(model_lr, X_test, y_test)
mse_xgb = evaluate_model(model_xgb, X_test, y_test)
mse_arima = mean_squared_error(y_test, model_arima_fit.forecast(steps=len(X_test)))
mse_dnn = evaluate_model(model_dnn, X_test_scaled, y_test)
mse_rf = evaluate_model(model_rf, X_test, y_test)

# Compare model performances
print(f'Mean Squared Error - Linear Regression: {mse_lr}')
print(f'Mean Squared Error - XGBoost: {mse_xgb}')
print(f'Mean Squared Error - ARIMA: {mse_arima}')
print(f'Mean Squared Error - Deep Neural Network: {mse_dnn}')
print(f'Mean Squared Error - Random Forest: {mse_rf}')


Mean Squared Error - Linear Regression: 36.47475102148569
Mean Squared Error - XGBoost: 20.600286109514986
Mean Squared Error - ARIMA: 511.47982120952094
Mean Squared Error - Deep Neural Network: 21.631000567806247
Mean Squared Error - Random Forest: 20.597555570732894


In [10]:
# Sample dataset for prediction
sample_data = pd.DataFrame({
    'start_date': ['04-01-2022', '04-01-2022'],
    'start_time_hour': [8, 12],
    'start_time_minute': [0, 30],
})

# Feature engineering for the sample dataset
sample_data['start_datetime'] = pd.to_datetime(sample_data['start_date'] + ' ' + sample_data['start_time_hour'].astype(str) + ':' + sample_data['start_time_minute'].astype(str))
sample_data['hour'] = sample_data['start_datetime'].dt.hour
sample_data['minute'] = sample_data['start_datetime'].dt.minute

# Select relevant features
sample_features = sample_data[['hour', 'minute']]

# Predict using the trained XGBoost model
sample_prediction_xgb = model_xgb.predict(sample_features)

# Predict using the trained Linear Regression model
sample_prediction_lr = model_lr.predict(sample_features)

# Predict using the trained ARIMA model
sample_prediction_arima = model_arima_fit.forecast(steps=len(sample_features))

# Predict using the trained Deep Neural Network model
sample_features_scaled = scaler.transform(sample_features)
sample_prediction_dnn = model_dnn.predict(sample_features_scaled)

# Predict using the trained Random Forest model
sample_prediction_rf = model_rf.predict(sample_features)

# Display the predictions
sample_data['prediction_xgb'] = sample_prediction_xgb
sample_data['prediction_lr'] = sample_prediction_lr
sample_data['prediction_arima'] = sample_prediction_arima
sample_data['prediction_dnn'] = sample_prediction_dnn
sample_data['prediction_rf'] = sample_prediction_rf

print("Sample Predictions:")
print(sample_data[['start_datetime', 'prediction_xgb', 'prediction_lr', 'prediction_arima', 'prediction_dnn', 'prediction_rf']])


Sample Predictions:
       start_datetime  prediction_xgb  prediction_lr  prediction_arima  \
0 2022-04-01 08:00:00        1.695130       0.332314               NaN   
1 2022-04-01 12:30:00        0.225307       1.687168               NaN   

   prediction_dnn  prediction_rf  
0        1.804121       1.703766  
1        0.092288       0.226951  
