# Exploratory Data Analysis

In [1]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Location of the ride-pricing dataset, relative to the notebook's working directory.
DATA_PATH = './dataset/dynamic_pricing.csv'

df = pd.read_csv(DATA_PATH)

# Leave the preview as the cell's last expression so Jupyter renders it as a
# table; print() falls back to a plain-text dump that wraps wide frames across
# several chunks.
df.head()

   Number_of_Riders  Number_of_Drivers Location_Category   
0                90                 45             Urban  \
1                58                 39          Suburban   
2                42                 31             Rural   
3                89                 28             Rural   
4                78                 22             Rural   

  Customer_Loyalty_Status  Number_of_Past_Rides  Average_Ratings   
0                  Silver                    13             4.47  \
1                  Silver                    72             4.06   
2                  Silver                     0             3.99   
3                 Regular                    67             4.31   
4                 Regular                    74             3.77   

  Time_of_Booking Vehicle_Type  Expected_Ride_Duration   
0           Night      Premium                      90  \
1         Evening      Economy                      43   
2       Afternoon      Premium                      76  

In [2]:
# Descriptive statistics for the numeric columns.
# Kept as the cell's last expression so Jupyter renders the full table instead
# of the wrapped plain-text output that print() produces.
df.describe()

       Number_of_Riders  Number_of_Drivers  Number_of_Past_Rides   
count       1000.000000        1000.000000           1000.000000  \
mean          60.372000          27.076000             50.031000   
std           23.701506          19.068346             29.313774   
min           20.000000           5.000000              0.000000   
25%           40.000000          11.000000             25.000000   
50%           60.000000          22.000000             51.000000   
75%           81.000000          38.000000             75.000000   
max          100.000000          89.000000            100.000000   

       Average_Ratings  Expected_Ride_Duration  Historical_Cost_of_Ride  
count      1000.000000              1000.00000              1000.000000  
mean          4.257220                99.58800               372.502623  
std           0.435781                49.16545               187.158756  
min           3.500000                10.00000                25.993449  
25%           3.8

In [3]:
# Scatter of expected ride duration against historical ride cost, with an
# ordinary-least-squares trendline overlaid by plotly.
fig = px.scatter(
    df,
    x='Expected_Ride_Duration',
    y='Historical_Cost_of_Ride',
    title='Expected Ride Duration vs. Historical Cost of Ride',
    trendline='ols',
)
fig.show()

# write_html does not create missing parent directories, so make sure the
# output folder exists before exporting (a fresh checkout may not have it).
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/Expected_Ride_Duration_vs_Historical_Cost_of_Ride.html')



In [4]:
# Distribution of the historical ride cost, one box per vehicle type.
fig = px.box(
    df,
    x='Vehicle_Type',
    y='Historical_Cost_of_Ride',
    title='Distribution of Historical Cost of Rides by Vehicle Type',
)
fig.show()

# write_html does not create missing parent directories, so make sure the
# output folder exists before exporting (a fresh checkout may not have it).
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/Distribution_of_Historical_Cost_of_Rides_by_Vehicle_Type.html')

 

In [None]:
# Correlation matrix used for feature selection.
# NOTE(review): `Data` is not referenced anywhere in this cell; the import is
# kept only in case another cell relies on it -- confirm and drop if unused.
from plotly.graph_objs import Data


# Select every numeric column regardless of width (int32, float32, ...), not
# just the int64/float64 columns.
df_numeric = df.select_dtypes(include='number')

# Pairwise correlation between the numeric columns.
correlation_matrix = df_numeric.corr()
feature_names = correlation_matrix.columns

# One text annotation per heatmap cell showing the coefficient to two decimals.
# Weak correlations (|r| < 0.5) get black text, all other cells get white text.
annotations = [
    dict(
        x=feature_names[j],
        y=feature_names[i],
        text=f"{value:.2f}",
        showarrow=False,
        font=dict(color="black" if abs(value) < 0.5 else "white"),
    )
    for i, row in enumerate(correlation_matrix.values)
    for j, value in enumerate(row)
]

# Pin the colour range to [-1, 1], the full range of a correlation coefficient,
# so the diverging RdBu scale is centred on zero instead of being stretched to
# whatever min/max happens to be present in the data.
fig = go.Figure(
    data=go.Heatmap(
        z=correlation_matrix.values,
        x=feature_names,
        y=feature_names,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    )
)

# Attach the per-cell values and the title.
fig.update_layout(
    title='Correlation Matrix of Features',
    annotations=annotations,
)
fig.show()

# Save as an HTML file. write_html does not create missing parent directories,
# so make sure the output folder exists first.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/Correlation_Matrix_of_Features.html')


## Implement the Dynamic Pricing 

In [11]:
import numpy as np

# Demand multiplier: each row's rider count divided by a percentile of the
# rider-count distribution.
high_demand_percentile = 75  # rows strictly above this percentile are treated as high demand
low_demand_percentile = 25   # percentile used as the divisor for every other row

rider_counts = df['Number_of_Riders']
riders_high_cutoff = np.percentile(rider_counts, high_demand_percentile)
riders_low_cutoff = np.percentile(rider_counts, low_demand_percentile)

# NOTE(review): rows at or below the high cutoff are divided by the (smaller)
# low cutoff, so a row just under the high-demand cutoff can end up with a
# larger multiplier than a row just over it -- confirm this is intended.
df['demand_multiplier'] = np.where(
    rider_counts > riders_high_cutoff,
    rider_counts / riders_high_cutoff,
    rider_counts / riders_low_cutoff,
)
explan1 = """
Code Explanation:
np.where(condition, value_if_true, value_if_false) is like an if else statement for arrays:
   - condition: compare each row of number of riders with the 75th percentile threshold
   - value_if_true: if the condition is true, then the value of demand_multiplier is the number of riders divided by the 75th percentile
   - value_if_false: if the condition is false, then the value of demand_multiplier is the number of riders divided by the 25th percentile
"""

# Supply multiplier: a percentile of the driver-count distribution divided by
# each row's driver count (fewer drivers -> larger multiplier within a branch).
high_supply_percentile = 75
low_supply_percentile = 25

driver_counts = df['Number_of_Drivers']
drivers_high_cutoff = np.percentile(driver_counts, high_supply_percentile)
drivers_low_cutoff = np.percentile(driver_counts, low_supply_percentile)

# NOTE(review): the condition tests against the LOW (25th) percentile, whereas
# the demand multiplier tests against its HIGH (75th) percentile, so this is
# not a mirror image of the demand logic despite what explan2 says. As written,
# rows just above the low cutoff are scaled by the high cutoff and rows at or
# below it by the low cutoff -- confirm which comparison was intended.
df['supply_multiplier'] = np.where(
    driver_counts > drivers_low_cutoff,
    drivers_high_cutoff / driver_counts,
    drivers_low_cutoff / driver_counts,
)
explan2 = """The idea is same with demand_multiplier"""

# Price adjustment bounds for the demand and supply multipliers.
# NOTE(review): only the two 0.8 values are used in this cell;
# demand_threshold_high and supply_threshold_low are defined but never applied
# here, so no upper cap is enforced on the adjusted cost.
demand_threshold_high = 1.2
demand_threshold_low = 0.8   # floor applied to demand_multiplier below
supply_threshold_high = 0.8  # floor applied to supply_multiplier below
supply_threshold_low = 1.2

# Floor each multiplier at its lower bound, then scale the historical cost by
# the product of the two floored factors.
demand_factor = df['demand_multiplier'].clip(lower=demand_threshold_low)
supply_factor = df['supply_multiplier'].clip(lower=supply_threshold_high)
df['adjusted_ride_cost'] = df['Historical_Cost_of_Ride'] * (demand_factor * supply_factor)

# NOTE(review): "20% max price" in the text below is misleading -- a floor of
# 0.8 means each factor can lower the price by at most 20%; it does not limit
# how far the price can rise.
explan3 = """ Code Explanation:
    The idea of this code is to adjust the ride cost based on the demand and supply. So first it will take the higher
    value between demand_multiplier and demand_threshold_low (Ensures demand multiplier never goes below 0.8, which is 20% max price),
    same idea for supply_multiplier and supply_threshold_high (Ensures supply multiplier never goes below 0.8, which is 20% max price).
    Then it will multiply with the historical cost of ride to get the adjusted ride cost.
"""
explan4 = """
    Why we need to do this?
    - To ensure the adjusted ride cost is not too high or too low.
    - To ensure the ride cost is profitable for the company.
"""



## Results After Implementing the Dynamic Pricing Strategy

In [None]:
# Visualize the profit percentage after implementing this dynamic pricing strategy
## calculate


## visualize

## Training a Predictive Model

In [13]:
# Data Preprocessing

In [14]:
# Model training
## split the data into training and testing sets

## Reshape y to 1D array

## Training a random forest regression model

In [None]:
# Model Testing and Evaluation