## Read the CSV file

In [2]:
import pandas as pd

# Load the taxi trip pricing dataset; the path is relative, so it resolves
# against the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/taxi_trip_pricing.csv')

# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


## EDA

In [3]:
# Summarize each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [4]:
# Count the missing values in each column (isnull is an alias of isna).
df.isnull().sum()

Trip_Distance_km         50
Time_of_Day              50
Day_of_Week              50
Passenger_Count          50
Traffic_Conditions       50
Weather                  50
Base_Fare                50
Per_Km_Rate              50
Per_Minute_Rate          50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64

In [5]:
# Show every row that has at least one missing value in any column.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.6180
6,3.85,Afternoon,Weekday,4.0,High,Rain,3.51,1.66,,5.05,11.2645
7,43.44,Evening,Weekend,3.0,,Clear,2.97,1.87,0.23,,101.1216
...,...,...,...,...,...,...,...,...,...,...,...
991,35.04,Morning,Weekend,4.0,Medium,Rain,2.90,1.10,0.15,9.99,
993,,Morning,Weekday,3.0,Medium,Clear,2.65,1.35,,25.61,55.3348
996,45.95,Night,Weekday,4.0,Medium,Clear,3.12,0.61,,61.96,62.1295
997,7.70,Morning,Weekday,3.0,Low,Rain,2.08,1.78,,54.18,33.1236


## Drop the `Passenger_Count` column because it does not significantly affect the target


In [6]:
# Remove the Passenger_Count feature; drop() returns a new frame, so rebind df.
df = df.drop(columns=['Passenger_Count'])

# Preview the frame to confirm the column is gone.
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,High,Clear,2.93,0.63,0.32,22.64,15.618


## Drop rows where `Trip_Distance_km` and other key values are missing
These rows are missing `Trip_Distance_km` together with at least one other essential value, so the distance cannot be recovered from the remaining fields and the rows cannot be used reliably for machine learning.


In [7]:
# List the rows whose trip distance is missing.
df.loc[df['Trip_Distance_km'].isnull()]

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
4,,Evening,Weekday,High,Clear,2.93,0.63,0.32,22.64,15.618
10,,Morning,Weekday,,Clear,2.4,0.58,0.43,26.34,14.892
84,,Night,Weekday,Medium,Clear,3.38,1.5,0.31,58.31,45.8161
87,,,Weekday,Medium,Clear,3.41,0.57,0.39,92.72,57.9476
95,,Morning,Weekend,Medium,Clear,3.43,0.91,0.5,67.43,60.077
111,,Morning,Weekday,Low,Clear,4.01,1.65,0.17,109.77,37.3559
134,,Afternoon,Weekend,Low,Clear,2.55,1.46,0.37,38.26,85.6182
137,,,,Low,Clear,4.52,1.38,0.35,57.56,73.587
142,,Afternoon,Weekend,Medium,Rain,2.98,1.24,0.28,36.68,44.7092
166,,Evening,Weekday,High,Clear,4.11,1.0,0.36,86.31,51.9016


In [8]:
# Row labels picked by manual inspection: each is missing Trip_Distance_km.
# NOTE(review): per the section heading these rows presumably also lack another
# value needed to reconstruct the distance — confirm against the raw data.
rows_to_remove = [366, 424, 770, 813, 993]

# Guard against silently deleting the wrong rows if the CSV (and therefore the
# row labels) ever changes: every listed row must really be missing its distance.
assert df.loc[rows_to_remove, 'Trip_Distance_km'].isna().all(), (
    "rows_to_remove no longer matches rows with a missing Trip_Distance_km"
)

df = df.drop(index=rows_to_remove)

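## Fill in missing `Trip_Distance_km` values by calculating them from the trip price, base fare, trip duration, and rates

The calculation inverts the fare formula $\text{Trip\_Price} = \text{Base\_Fare} + \text{Per\_Km\_Rate} \times \text{Trip\_Distance\_km} + \text{Per\_Minute\_Rate} \times \text{Trip\_Duration\_Minutes}$.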

In [9]:
# Count the gaps before imputation so the effect of this cell is visible.
missing_count = df['Trip_Distance_km'].isna().sum()
print(f"Number of missing Trip_Distance_km values: {missing_count}")

# Recover the distance by solving the fare relationship for it:
#   distance = (Trip_Price - Base_Fare - Per_Minute_Rate * Trip_Duration_Minutes) / Per_Km_Rate
# The expression is evaluated on whole columns (vectorized) instead of a
# row-wise apply; fillna then uses it only where the distance is NaN, so
# existing distances are left untouched. If any input is itself NaN the
# estimate is NaN and the gap remains.
# NOTE(review): this uses the target column Trip_Price to fill in a feature —
# confirm that is acceptable for the downstream model (possible target leakage).
estimated_distance = (
    df['Trip_Price']
    - df['Base_Fare']
    - df['Per_Minute_Rate'] * df['Trip_Duration_Minutes']
) / df['Per_Km_Rate']
df['Trip_Distance_km'] = df['Trip_Distance_km'].fillna(estimated_distance)

print(f"Number of missing Trip_Distance_km values after calculation: {df['Trip_Distance_km'].isna().sum()}")


Number of missing Trip_Distance_km values: 45
Number of missing Trip_Distance_km values after calculation: 1


In [10]:
# Inspect the rows whose distance could not be reconstructed.
df.loc[df['Trip_Distance_km'].isna()]

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
734,,Morning,Weekend,Low,Clear,3.14,,0.14,89.47,34.4316


In [11]:
# Drop whatever rows still lack a distance after the reconstruction step
# (row 734 in the current data). Selecting by condition rather than by a
# hard-coded label keeps the cell safe to re-run and robust to data changes.
df = df.dropna(subset=['Trip_Distance_km'])

In [12]:
# Re-check dtypes and non-null counts now that Trip_Distance_km has no gaps left.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 994 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       994 non-null    float64
 1   Time_of_Day            944 non-null    object 
 2   Day_of_Week            944 non-null    object 
 3   Traffic_Conditions     944 non-null    object 
 4   Weather                944 non-null    object 
 5   Base_Fare              945 non-null    float64
 6   Per_Km_Rate            945 non-null    float64
 7   Per_Minute_Rate        945 non-null    float64
 8   Trip_Duration_Minutes  947 non-null    float64
 9   Trip_Price             945 non-null    float64
dtypes: float64(6), object(4)
memory usage: 85.4+ KB


## Drop rows where `Base_Fare` and other key values are missing

In [13]:
# List the rows whose base fare is missing.
df.loc[df['Base_Fare'].isna()]

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
1,47.59,Afternoon,Weekday,High,Clear,,0.62,0.43,40.57,
25,39.47,Afternoon,Weekday,Low,Clear,,,0.35,7.59,83.6965
43,45.56,Afternoon,Weekday,Low,Clear,,0.9,0.5,80.8,85.884
50,48.51,Evening,Weekday,High,Rain,,1.0,0.48,110.18,104.1764
92,38.28,Afternoon,Weekday,Medium,Clear,,1.08,0.39,51.33,63.5611
107,38.02,Evening,,,Clear,,1.31,0.35,33.73,66.2817
109,4.77,Morning,Weekend,High,Clear,,0.96,0.18,105.91,
116,40.38,Morning,Weekday,Medium,Rain,,0.8,0.23,115.21,63.4323
156,33.93,Morning,Weekday,Low,Rain,,0.88,0.39,66.77,59.9287
158,12.64,Evening,Weekday,Medium,Clear,,0.7,0.29,22.91,20.3819


In [14]:
# Hard-coded row labels selected by manual inspection.
# NOTE(review): per the section heading these are presumably rows where
# Base_Fare is missing along with another required value — confirm.
base_fare_rows_to_drop = [1, 25, 109, 173, 201, 328, 397, 547, 728, 871, 937]
df = df.drop(index=base_fare_rows_to_drop)