# In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# 1. Load & inspect data
df = pd.read_csv('dataset.csv')
print(df.head())

# 2. Feature engineering
# Reference price that every demand multiplier is applied to.
BASE_PRICE = 10.0

# Fraction of the lot in use.
df['occupancy_ratio'] = df['Occupancy'].div(df['Capacity'])

# Parse the timestamp columns once, then pull out hour-of-day and weekday
# (pandas numbers weekdays Monday=0 .. Sunday=6).
update_time = pd.to_datetime(df['LastUpdatedTime'], format='%H:%M:%S')
update_date = pd.to_datetime(df['LastUpdatedDate'], format='%d-%m-%Y')
df['Hour'] = update_time.dt.hour
df['DayOfWeek'] = update_date.dt.dayofweek

# Ordinal codes for the categorical columns. Labels missing from a lookup
# become NaN after .map() and are then replaced by the fallback code.
traffic_codes = {'low': 0, 'average': 1, 'high': 2}
df['TrafficCondition_encoded'] = (
    df['TrafficConditionNearby'].map(traffic_codes).fillna(1)
)

vehicle_codes = {'car': 0, 'bike': 1, 'cycle': 1, 'truck': 2}
df['VehicleType_encoded'] = df['VehicleType'].map(vehicle_codes).fillna(0)

print("Data preprocessing complete.")

# 3. Demand factor function
def calculate_demand_factor(
    occupancy_ratio, queue_length, traffic_level,
    special_day, vehicle_type, hour, day_of_week,
    *, noise_mean=0.1, noise_std=0.05, rng=None
):
    """Return a demand multiplier in [0.6, 2.0] for a single observation.

    The multiplier combines occupancy, queue length, traffic, a special-day
    surge, a vehicle-type adjustment, a weekday/weekend factor and a
    sinusoidal hour-of-day factor, adds Gaussian noise, and squashes the
    result through a sigmoid.

    Args:
        occupancy_ratio: Occupied fraction of the lot; clipped to [0, 1].
        queue_length: Number of queued vehicles; negatives are treated as 0
            and the contribution saturates at 12.
        traffic_level: Numeric traffic code (the file uses 0, 1, 2).
        special_day: 0/1 flag; adds a 0.25 surge when set.
        vehicle_type: Numeric vehicle code; 0, 1, 2 map to multipliers
            1.0, 0.7, 1.3 and any other code falls back to 1.0.
        hour: Hour of day fed to the 24-hour sinusoid.
        day_of_week: Weekday number; values >= 5 are treated as weekend.
        noise_mean: Mean of the additive Gaussian noise (default 0.1).
        noise_std: Standard deviation of the noise (default 0.05). Pass 0.0
            for a deterministic result.
        rng: Optional object with a ``normal(loc, scale)`` method, e.g. a
            ``numpy.random.Generator``. When omitted, the global
            ``np.random`` state is used, exactly as before.

    Returns:
        1.0 when ``occupancy_ratio`` or ``queue_length`` is missing,
        otherwise a value between 0.6 and 2.0.
    """
    # Missing core inputs: fall back to a neutral multiplier.
    if pd.isna(occupancy_ratio) or pd.isna(queue_length):
        return 1.0

    occupancy_ratio = np.clip(occupancy_ratio, 0, 1)
    queue_length = max(0, queue_length)

    # 24-hour sinusoid ranging over [0.6, 1.0], peaking at hour 6.
    time_factor = np.sin(hour * np.pi / 12) * 0.2 + 0.8
    weekend_factor = 1.15 if day_of_week >= 5 else 0.95

    occupancy_component = (occupancy_ratio ** 1.5) * 0.6
    # Queue contribution saturates at 12 vehicles and is amplified by traffic.
    queue_component = (
        (min(queue_length / 12.0, 1.0) ** 1.2) * 0.3 * (1 + traffic_level * 0.15)
    )

    vehicle_multipliers = {0: 1.0, 1: 0.7, 2: 1.3}
    vehicle_adjustment = vehicle_multipliers.get(vehicle_type, 1.0)

    special_surge = special_day * 0.25
    traffic_surge = traffic_level * 0.1

    # The noise source is injectable so callers can get reproducible output;
    # the default path draws from the global NumPy RNG like the original code.
    sampler = np.random if rng is None else rng
    base_variation = sampler.normal(noise_mean, noise_std)

    raw_demand = (
        (0.4 + occupancy_component + queue_component + special_surge + traffic_surge)
        * vehicle_adjustment * weekend_factor * time_factor
    ) + base_variation

    # Sigmoid centred on raw_demand == 0.7, rescaled onto (0.6, 2.0).
    sigmoid_input = (raw_demand - 0.7) * 3
    normalized_demand = 1 / (1 + np.exp(-sigmoid_input))
    demand_factor = 0.6 + 1.4 * normalized_demand

    return np.clip(demand_factor, 0.6, 2.0)

# Apply demand factor
def _row_demand_factor(row):
    """Pass one row's engineered columns to calculate_demand_factor."""
    return calculate_demand_factor(
        row['occupancy_ratio'],
        row['QueueLength'],
        row['TrafficCondition_encoded'],
        row['IsSpecialDay'],
        row['VehicleType_encoded'],
        row['Hour'],
        row['DayOfWeek'],
    )


# Row-wise evaluation: one demand factor (including its random noise draw)
# per record, in DataFrame order.
df['demand_factor'] = df.apply(_row_demand_factor, axis=1)

# The training target is the base price scaled by the demand factor.
df['target_price'] = BASE_PRICE * df['demand_factor']

lowest_price = df['target_price'].min()
highest_price = df['target_price'].max()
print(f"Price range: ${lowest_price:.2f} - ${highest_price:.2f}")

# 4. Train XGBoost model
# NOTE(review): 'demand_factor' is a model input, and target_price is derived
# from demand_factor earlier in this script, so the target looks recoverable
# from that single column -- confirm this is intended before relying on the
# RMSE reported below.
features = [
    'occupancy_ratio',
    'QueueLength',
    'TrafficCondition_encoded',
    'IsSpecialDay',
    'VehicleType_encoded',
    'Hour',
    'DayOfWeek',
    'demand_factor',
]

X, y = df[features], df['target_price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_params = dict(
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**model_params)
xgb_model.fit(X_train, y_train)

# Keep predictions inside the same 0.6x-2.0x band the target was built with.
raw_predictions = xgb_model.predict(X_test)
y_pred = np.clip(raw_predictions, BASE_PRICE * 0.6, BASE_PRICE * 2.0)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"XGBoost Model RMSE: {rmse:.4f}")

# 5. Feature importance
# Pair each feature name with the fitted model's importance score and rank
# the table from most to least important.
importance_table = pd.DataFrame(
    {'feature': features, 'importance': xgb_model.feature_importances_}
)
importance_df = importance_table.sort_values(by='importance', ascending=False)

# 6. Visualizations
# One 2x2 figure: prediction quality, feature importance, hourly pattern and
# the overall price distribution.
plt.figure(figsize=(12, 8))

# Top-left: predicted vs. actual prices with the ideal y = x reference line.
plt.subplot(2, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.6)
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted (XGBoost)')

# Top-right: the five highest-ranked features.
plt.subplot(2, 2, 2)
top_features = importance_df.head(5)
plt.barh(top_features['feature'], top_features['importance'])
plt.title('Top 5 Feature Importance (XGBoost)')

# Bottom-left: mean target price for each hour of the day.
plt.subplot(2, 2, 3)
hourly_avg = df.groupby('Hour')['target_price'].mean()
plt.plot(hourly_avg.index, hourly_avg.values, marker='o')
plt.xlabel('Hour')
plt.ylabel('Avg Price')
plt.title('Hourly Pricing Pattern')

# Bottom-right: price histogram with the floor, cap and base price marked.
plt.subplot(2, 2, 4)
plt.hist(df['target_price'], bins=30, color='green', alpha=0.7, edgecolor='black')
price_markers = [
    (BASE_PRICE * 0.6, dict(color='red', linestyle='--', label='0.6x Base')),
    (BASE_PRICE * 2.0, dict(color='red', linestyle='--', label='2.0x Base')),
    (BASE_PRICE, dict(color='blue', linestyle='-', linewidth=2, label='Base Price')),
]
for marker_x, marker_style in price_markers:
    plt.axvline(marker_x, **marker_style)
plt.xlabel('Price ($)')
plt.ylabel('Frequency')
plt.title('Price Distribution')
plt.legend()

plt.tight_layout()
plt.show()

summary_low = df['target_price'].min()
summary_high = df['target_price'].max()
print(f"Model Summary | Price Range: ${summary_low:.2f} - ${summary_high:.2f}")

# 7. Real-time pricing function
def dynamic_pricing_function(
    occupancy, capacity, queue_length,
    traffic_condition, special_day, vehicle_type, hour, day_of_week
):
    """Predict a price for one live observation using the trained model.

    The raw inputs are encoded the same way as the training columns, a demand
    factor is computed with ``calculate_demand_factor``, and the resulting
    feature row is scored by ``xgb_model``.

    Args:
        occupancy: Number of occupied spaces.
        capacity: Total number of spaces (divisor of the occupancy ratio).
        queue_length: Number of queued vehicles.
        traffic_condition: Traffic label, matched case-insensitively:
            'low', 'medium' or 'average', 'high'. Unknown labels map to 1.
        special_day: Truthy when the day is a special day.
        vehicle_type: Vehicle label, matched case-insensitively:
            'car', 'bike', 'cycle', 'truck'. Unknown labels map to 0.
        hour: Hour of day.
        day_of_week: Weekday number as used by the training data.

    Returns:
        The model's predicted price, clipped to
        [0.6 * BASE_PRICE, 2.0 * BASE_PRICE].
    """
    # Accept both the training-data vocabulary ('average', 'cycle') and the
    # capitalised 'Low'/'Medium'/'High' labels, so no valid label silently
    # falls through to the default code.
    traffic_codes = {'low': 0, 'medium': 1, 'average': 1, 'high': 2}
    vehicle_codes = {'car': 0, 'bike': 1, 'cycle': 1, 'truck': 2}

    occupancy_ratio = occupancy / capacity
    traffic_encoded = traffic_codes.get(str(traffic_condition).strip().lower(), 1)
    vehicle_encoded = vehicle_codes.get(str(vehicle_type).strip().lower(), 0)
    special_flag = int(special_day)

    demand_factor = calculate_demand_factor(
        occupancy_ratio, queue_length, traffic_encoded,
        special_flag, vehicle_encoded, hour, day_of_week
    )

    # Column order must match the feature list the model was trained on.
    # A distinct local name avoids shadowing the module-level `features`.
    feature_row = np.array([[
        occupancy_ratio, queue_length, traffic_encoded,
        special_flag, vehicle_encoded, hour, day_of_week, demand_factor
    ]])

    predicted_price = xgb_model.predict(feature_row)[0]
    return np.clip(predicted_price, BASE_PRICE * 0.6, BASE_PRICE * 2.0)

# 8. Test cases
# Each tuple is (occupancy, capacity, queue_length, traffic_condition,
# special_day, vehicle_type, hour, day_of_week).
test_cases = [
    (20, 100, 0, 'Low', False, 'bike', 9, 1),
    (60, 100, 3, 'Medium', False, 'car', 14, 3),
    (85, 100, 8, 'High', False, 'car', 17, 1),
    (95, 100, 12, 'High', True, 'truck', 19, 6),
]

for case_no, scenario in enumerate(test_cases, start=1):
    occupied, total_spaces, queued, traffic_label = scenario[:4]
    price = dynamic_pricing_function(*scenario)
    print(f"Case {case_no}: ${price:.2f} | Conditions: {occupied}/{total_spaces} occupancy, "
          f"{queued} queue, {traffic_label} traffic")
