# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Load the transit CSV and derive time and weather features.

    Args:
        file_path: Path or buffer passed straight to ``pandas.read_csv``.
            The data must provide ``timestamp``, ``route_id`` and
            ``weather_condition`` columns.

    Returns:
        A DataFrame in which ``timestamp`` is parsed to datetime, ``hour`` and
        ``day_of_week`` are derived from it, ``route_id_original`` is a copy
        of ``route_id``, and ``weather_condition`` is replaced by its one-hot
        indicator columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    data = pd.read_csv(file_path)

    # Parse timestamps so the .dt accessor can be used below.
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data['hour'] = data['timestamp'].dt.hour
    data['day_of_week'] = data['timestamp'].dt.dayofweek  # Monday == 0

    # Keep a copy of route_id under a separate name; feature engineering
    # groups by this column.
    data['route_id_original'] = data['route_id']

    # Only weather_condition is one-hot encoded; route_id is deliberately
    # left out of the encoding.
    data = pd.get_dummies(data, columns=['weather_condition'])

    return data


# In [None]:

# Feature engineering
def engineer_features(data):
    """Add per-route lag and rolling-mean demand features.

    The features are computed per ``route_id_original`` group:

    * ``demand_lag_1h``: demand of the previous row of the same route.
    * ``demand_lag_1d``: demand 24 rows earlier on the same route.
    * ``demand_rolling_mean_6h``: mean demand over the current and the five
      preceding rows of the same route.

    The "1h"/"1d"/"6h" names assume one row per route per hour; this is not
    checked here.

    Args:
        data: DataFrame with ``route_id_original`` and ``demand`` columns and
            a unique index. If a ``timestamp`` column is present, rows are
            ordered by it before the lags are computed.

    Returns:
        A new DataFrame with the three feature columns added and every row
        containing a NaN in any column removed. Row order follows the input.
        The input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()

    # Lags are positional, so compute them in chronological order. A stable
    # sort keeps the original order among equal timestamps. Results are
    # assigned back by index label, so the row order of `data` is unchanged.
    if 'timestamp' in data.columns:
        ordered = data.sort_values('timestamp', kind='mergesort')
    else:
        ordered = data
    demand_by_route = ordered.groupby('route_id_original')['demand']

    data['demand_lag_1h'] = demand_by_route.shift(1)
    data['demand_lag_1d'] = demand_by_route.shift(24)

    # Grouped rolling returns a (route, original index) MultiIndex; dropping
    # the route level restores labels that align with `data`.
    data['demand_rolling_mean_6h'] = (
        demand_by_route.rolling(window=6).mean().reset_index(level=0, drop=True)
    )

    # The first rows of every route have no history, so their lag/rolling
    # values are NaN. dropna() also removes rows with NaN in other columns.
    # route_id_original is kept in the result.
    return data.dropna()



# Split features and target variable
def split_data(data):
    """Separate features from the ``fare`` target and split train/test.

    The non-feature columns ``fare``, ``timestamp`` and ``route_id``, plus
    every column whose name starts with ``route_id_`` (``route_id_original``
    or one-hot encoded route columns), are removed from the feature matrix.
    Of these, only ``fare`` is required to exist; the others are dropped when
    present, so the function works whether or not ``route_id`` was one-hot
    encoded upstream.

    Args:
        data: DataFrame containing a ``fare`` column and the feature columns.

    Returns:
        The ``train_test_split`` result for the feature frame and the target,
        unpacked by callers as ``X_train, X_test, y_train, y_test``. The split
        holds out 20% of the rows and uses a fixed seed (42).

    Raises:
        KeyError: If ``data`` has no ``fare`` column.
    """
    # Look up the target first so a missing 'fare' fails before any dropping.
    y = data['fare']

    non_features = {'fare', 'timestamp', 'route_id'}
    columns_to_drop = [
        col for col in data.columns
        if col in non_features
        or (isinstance(col, str) and col.startswith('route_id_'))
    ]
    X = data.drop(columns=columns_to_drop)

    return train_test_split(X, y, test_size=0.2, random_state=42)



# Train the model
def train_model(X_train, y_train):
    """Fit and return a random-forest regressor on the training data.

    The forest uses 100 trees and a fixed seed (42).
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    # fit() returns the estimator itself, so it can be returned directly.
    return forest.fit(X_train, y_train)

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print the mean squared error and R-squared of ``model`` on test data.

    Both metrics are computed from ``model.predict(X_test)`` against
    ``y_test`` before anything is printed. Nothing is returned.
    """
    predictions = model.predict(X_test)
    error, fit_quality = (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )
    print(f"Mean Squared Error: {error}")
    print(f"R-squared Score: {fit_quality}")

# Main function to run the entire process
def main():
    """Run the full pipeline on ``transit_data.csv``.

    Steps: load and preprocess, engineer features, split, standardise the
    features, train the model, and print its test metrics.

    Returns:
        The trained model and the fitted ``StandardScaler``, as a tuple.
    """
    prepared = engineer_features(load_and_preprocess_data('transit_data.csv'))
    X_train, X_test, y_train, y_test = split_data(prepared)

    # The scaler is fitted on the training split only and then reused for
    # the test split.
    scaler = StandardScaler()
    train_matrix = scaler.fit_transform(X_train)
    test_matrix = scaler.transform(X_test)

    model = train_model(train_matrix, y_train)
    evaluate_model(model, test_matrix, y_test)

    return model, scaler

# Run the main function only when this file is executed as a script, not when
# it is imported. The trained model and fitted scaler returned by main() are
# bound to the module-level names `model` and `scaler`.
if __name__ == "__main__":
    model, scaler = main()

# Function to predict fare for new data
def predict_fare(model, scaler, new_data):
    """Return the model's predictions for ``new_data``.

    ``new_data`` is first passed through ``scaler.transform`` and the result
    is handed to ``model.predict``.
    """
    return model.predict(scaler.transform(new_data))