In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder



In [13]:
data = pd.read_csv('dynamic_pricing.csv')
print(data.head())

   Number_of_Riders  Number_of_Drivers Location_Category  \
0                90                 45             Urban   
1                58                 39          Suburban   
2                42                 31             Rural   
3                89                 28             Rural   
4                78                 22             Rural   

  Customer_Loyalty_Status  Number_of_Past_Rides  Average_Ratings  \
0                  Silver                    13             4.47   
1                  Silver                    72             4.06   
2                  Silver                     0             3.99   
3                 Regular                    67             4.31   
4                 Regular                    74             3.77   

  Time_of_Booking Vehicle_Type  Expected_Ride_Duration  \
0           Night      Premium                      90   
1         Evening      Economy                      43   
2       Afternoon      Premium                      76  

In [14]:
data.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


In [15]:
# Produce the data profiling report
original_report = ProfileReport(data, title='Original Data')
original_report
#original_report.to_file("original_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [18]:
# Converting all categorical columns to numeric
le = LabelEncoder()
df=data.copy()

# Using .fit_transform function to fit label and transform
df[['Location_Category_num','Customer_Loyalty_Status_num','Time_of_Booking_num','Vehicle_Type_num']]=df[['Location_Category','Customer_Loyalty_Status','Time_of_Booking','Vehicle_Type']].apply(LabelEncoder().fit_transform)
df.drop(['Location_Category','Customer_Loyalty_Status','Time_of_Booking','Vehicle_Type'],axis=1,inplace=True)
df

 

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride,Location_Category_num,Customer_Loyalty_Status_num,Time_of_Booking_num,Vehicle_Type_num
0,90,45,13,4.47,90,284.257273,2,2,3,1
1,58,39,72,4.06,43,173.874753,1,2,1,0
2,42,31,0,3.99,76,329.795469,0,2,0,1
3,89,28,67,4.31,134,470.201232,0,1,0,1
4,78,22,74,3.77,149,579.681422,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
995,33,23,24,4.21,11,91.389526,2,0,2,1
996,84,29,92,4.55,94,424.155987,2,1,2,1
997,44,6,80,4.13,40,157.364830,1,0,3,1
998,53,27,78,3.63,58,279.095048,1,1,3,1


In [20]:
# Correlation matrix to check correlation between all variables
#df.drop(['Location_Category','Customer_Loyalty_Status','Time_of_Booking','Vehicle_Type'],axis='columns',inplace=True)

corr_matrix = df.corr()
#corr_matrix
fig = go.Figure(data=go.Heatmap(z=corr_matrix.values, 
                                x=corr_matrix.columns, 
                                y=corr_matrix.columns,
                                colorscale='Viridis'))
fig.update_layout(title='Correlation Matrix')
fig.show()


In [21]:
# Get columns where absolute correlation value is greater than 0.5
highly_correlated_columns = corr_matrix[abs(corr_matrix) > 0.5].stack().index.tolist()
# Filter out duplicate and self-correlated pairs
highly_correlated_columns = [(col1, col2) for col1, col2 in highly_correlated_columns if col1 != col2]

print("Pairs of columns with correlation > 0.5:")
for col1, col2 in highly_correlated_columns:
    correlation_value = corr_matrix.loc[col1, col2]
    print(f"{col1} - {col2}: {correlation_value}")



Pairs of columns with correlation > 0.5:
Number_of_Riders - Number_of_Drivers: 0.6270163439391535
Number_of_Drivers - Number_of_Riders: 0.6270163439391535
Expected_Ride_Duration - Historical_Cost_of_Ride: 0.9275471833882497
Historical_Cost_of_Ride - Expected_Ride_Duration: 0.9275471833882497


##### Based on the data, Expected Ride duration has a strong correlation with historical cost of ride.

### Goal is to implement dynamic pricing strategy using Time of Booking and Supply/Demand levels.

1. Implementing dynamic pricing using <b>Time of Booking</b> the ride.

In [23]:
# Define pricing tiers based on time of booking
pricing_tiers = {
    "Morning": 1.2,  # 20% increase in price
    "Afternoon": 1.0,  # Standard price
    "Evening": 1.5,  # 50% increase in price
    "Night": 1.3,  # 30% increase in price
}

# Function to calculate dynamic price based on time of booking
def calculate_dynamic_price(row):
    time_of_booking = row['Time_of_Booking']
    base_price = row['Historical_Cost_of_Ride']
    if time_of_booking in pricing_tiers:
        return base_price * pricing_tiers[time_of_booking]
    else:
        return base_price

# Apply dynamic pricing to the dataset
data['adjusted_ride_cost_timing'] = data.apply(calculate_dynamic_price, axis=1)

# Compare original price with dynamic price
data[['Time_of_Booking', 'Historical_Cost_of_Ride', 'adjusted_ride_cost_timing']]

Unnamed: 0,Time_of_Booking,Historical_Cost_of_Ride,adjusted_ride_cost_timing
0,Night,284.257273,369.534455
1,Evening,173.874753,260.812129
2,Afternoon,329.795469,329.795469
3,Afternoon,470.201232,470.201232
4,Afternoon,579.681422,579.681422
...,...,...,...
995,Morning,91.389526,109.667431
996,Morning,424.155987,508.987184
997,Night,157.364830,204.574279
998,Night,279.095048,362.823563


2. Calculating adjusted dynamic pricing based on Supply and Demand levels along with Timing of Booking

In [24]:
import numpy as np

# Load the dataset
# Assuming data is your DataFrame containing the dataset

# Calculate demand_multiplier based on percentile for high and low demand
high_demand_percentile = 75
low_demand_percentile = 25

data['demand_multiplier'] = np.where(data['Number_of_Riders'] > np.percentile(data['Number_of_Riders'], high_demand_percentile),
                                    data['Number_of_Riders'] / np.percentile(data['Number_of_Riders'], high_demand_percentile),
                                    data['Number_of_Riders'] / np.percentile(data['Number_of_Riders'], low_demand_percentile))

# Calculate supply_multiplier based on percentile for high and low supply
high_supply_percentile = 75
low_supply_percentile = 25

data['supply_multiplier'] = np.where(data['Number_of_Drivers'] > np.percentile(data['Number_of_Drivers'], low_supply_percentile),
                                    np.percentile(data['Number_of_Drivers'], high_supply_percentile) / data['Number_of_Drivers'],
                                    np.percentile(data['Number_of_Drivers'], low_supply_percentile) / data['Number_of_Drivers'])

# Define price adjustment factors for high and low demand/supply
demand_threshold_high = 1.2  # Higher demand threshold
demand_threshold_low = 0.8  # Lower demand threshold
supply_threshold_high = 0.8  # Higher supply threshold
supply_threshold_low = 1.2  # Lower supply threshold

# Function to calculate dynamic price based on time of booking and demand/supply multipliers
def calculate_dynamic_price(row):
    time_of_booking = row['Time_of_Booking']
    base_price = row['Historical_Cost_of_Ride']
    demand_multiplier = row['demand_multiplier']
    supply_multiplier = row['supply_multiplier']
    
    # Adjust price based on timing of booking
    if time_of_booking == 'Morning':
        time_adjustment_factor = 1.2  # 10% increase in price for morning bookings
    elif time_of_booking == 'Evening':
        time_adjustment_factor = 1.5  # 30% increase in price for evening bookings
    else:
        time_adjustment_factor = 1.0  # No adjustment for other times
    
    # Calculate adjusted price considering demand/supply multipliers and timing adjustment
    adjusted_price = base_price * demand_multiplier * supply_multiplier * time_adjustment_factor
    
    # Ensure the adjusted price is not too high or low from the historical price
    adjusted_price = min(max(adjusted_price, base_price * 0.8), base_price * 1.2)  # Adjusted price capped between 80% and 120% of historical price
    
    return adjusted_price

# Apply dynamic pricing to the dataset
data['adjusted_ride_cost'] = data.apply(calculate_dynamic_price, axis=1)

# Compare original price with dynamic price
data[['Time_of_Booking','demand_multiplier','supply_multiplier', 'Historical_Cost_of_Ride', 'adjusted_ride_cost']]


Unnamed: 0,Time_of_Booking,demand_multiplier,supply_multiplier,Historical_Cost_of_Ride,adjusted_ride_cost
0,Night,1.111111,0.844444,284.257273,266.710528
1,Evening,1.450000,0.974359,173.874753,208.649703
2,Afternoon,1.050000,1.225806,329.795469,395.754563
3,Afternoon,1.098765,1.357143,470.201232,564.241478
4,Afternoon,1.950000,1.727273,579.681422,695.617707
...,...,...,...,...,...
995,Morning,0.825000,1.652174,91.389526,109.667431
996,Morning,1.037037,1.310345,424.155987,508.987184
997,Night,1.100000,1.833333,157.364830,188.837796
998,Night,1.325000,1.407407,279.095048,334.914058


### Calculating profit percentage for each ride while identifying profitable and non-profitable rides

In [25]:
# Calculate the profit percentage for each ride
data['profit_percentage'] = ((data['adjusted_ride_cost'] - data['Historical_Cost_of_Ride']) / data['Historical_Cost_of_Ride']) * 100
# Identify profitable rides where profit percentage is positive
profitable_rides = data[data['profit_percentage'] > 0]

# Identify loss rides where profit percentage is negative
loss_rides = data[data['profit_percentage'] < 0]


import plotly.graph_objects as go

# Calculate the count of profitable and loss rides
profitable_count = len(profitable_rides)
loss_count = len(loss_rides)

# Create a donut chart to show the distribution of profitable and loss rides
labels = ['Profitable Rides', 'Loss Rides']
values = [profitable_count, loss_count]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.4)])
fig.update_layout(title='Profitability of Rides (Dynamic Pricing vs. Historical Pricing)')
fig.show()

#### Relationship between the expected ride duration and the cost of the ride based on the dynamic pricing strategy:

In [26]:
fig = px.scatter(data, 
                 x='Expected_Ride_Duration', 
                 y='adjusted_ride_cost',
                 title='Expected Ride Duration vs. Cost of Ride', 
                 trendline='ols')
fig.show()

## Training Predictive Model

### Data Preprocessing Pipeline

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def data_preprocessing_pipeline(data):
    #Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    #Handle missing values in numeric features
    data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

    #Detect and handle outliers in numeric features using IQR
    for feature in numeric_features:
        Q1 = data[feature].quantile(0.25)
        Q3 = data[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)
        data[feature] = np.where((data[feature] < lower_bound) | (data[feature] > upper_bound),
                                 data[feature].mean(), data[feature])

    #Handle missing values in categorical features
    data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])

    return data

#### Converting categorical variable Vehicle Type to numeric

In [28]:
data["Vehicle_Type"] = data["Vehicle_Type"].map({"Premium": 1, 
                                           "Economy": 0})

### Splitting the data and fitting the model

In [38]:
#splitting data
from sklearn.model_selection import train_test_split
x = np.array(data[["Number_of_Riders", "Number_of_Drivers", "Vehicle_Type", "Expected_Ride_Duration"]])
y = np.array(data[["adjusted_ride_cost"]])

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# Reshape y to 1D array
y_train = y_train.ravel()
y_test = y_test.ravel()

# Training a random forest regression model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(oob_score=True)
model.fit(x_train, y_train)

### Predicting using Random Forest Regressor

In [39]:
def get_vehicle_type_numeric(vehicle_type):
    vehicle_type_mapping = {
        "Premium": 1,
        "Economy": 0
    }
    vehicle_type_numeric = vehicle_type_mapping.get(vehicle_type)
    return vehicle_type_numeric
  
# Predicting using user input values
def predict_price(number_of_riders, number_of_drivers, vehicle_type, Expected_Ride_Duration):
    vehicle_type_numeric = get_vehicle_type_numeric(vehicle_type)
    if vehicle_type_numeric is None:
        raise ValueError("Invalid vehicle type")
    
    input_data = np.array([[number_of_riders, number_of_drivers, vehicle_type_numeric, Expected_Ride_Duration]])
    predicted_price = model.predict(input_data)
    return predicted_price

# Example prediction using user input values
user_number_of_riders = 50
user_number_of_drivers = 25
user_vehicle_type = "Economy"
Expected_Ride_Duration = 30
predicted_price = predict_price(user_number_of_riders, user_number_of_drivers, user_vehicle_type, Expected_Ride_Duration)
print("Predicted price:", predicted_price)

Predicted price: [114.83608529]


###  Comparison of the actual and predicted results

In [40]:
import plotly.graph_objects as go

# Predict on the test set
y_pred = model.predict(x_test)

# Create a scatter plot with actual vs predicted values
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=y_test.flatten(),
    y=y_pred,
    mode='markers',
    name='Actual vs Predicted'
))

# Add a line representing the ideal case
fig.add_trace(go.Scatter(
    x=[min(y_test.flatten()), max(y_test.flatten())],
    y=[min(y_test.flatten()), max(y_test.flatten())],
    mode='lines',
    name='Ideal',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    showlegend=True,
)

fig.show()

### Evaluating the model

In [41]:
 
# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score
 
# Access the OOB Score
oob_score = model.oob_score_
print(f'Out-of-Bag Score: {oob_score}')
 
# Making predictions on the same data or new data
predictions = model.predict(x)
 
# Evaluating the model
mse = mean_squared_error(y, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(y, predictions)
print(f'R-squared: {r2}')

Out-of-Bag Score: 0.8332023875096721
Mean Squared Error: 2566.7792930627165
R-squared: 0.946255568562012
