In [42]:
import pandas as pd
import numpy  as np

In [43]:
data = pd.read_csv("/content/My_Uber_Drives- 2016.csv")

In [44]:
data.head(5)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [45]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   START_DATE*  1156 non-null   object 
 1   END_DATE*    1155 non-null   object 
 2   CATEGORY*    1155 non-null   object 
 3   START*       1155 non-null   object 
 4   STOP*        1155 non-null   object 
 5   MILES*       1156 non-null   float64
 6   PURPOSE*     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB
None


In [48]:
data.isnull().sum()

Unnamed: 0,0
START_DATE*,0
END_DATE*,0
CATEGORY*,0
START*,0
STOP*,0
MILES*,0
PURPOSE*,0


In [47]:
data.dropna(inplace=True)

In [55]:
data['START_DATE*'] = pd.to_datetime(data['START_DATE*'])

In [56]:
data["hour"] = data["START_DATE*"].dt.hour
data["day_of_week"] = data["START_DATE*"].dt.dayofweek
data["month"] = data["START_DATE*"].dt.month

In [57]:
data.head(5)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*,hour,day_of_week,month
0,2016-01-01 21:11:00,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,21,4,1
2,2016-01-02 20:25:00,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,20,5,1
3,2016-01-05 17:31:00,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting,17,1,1
4,2016-01-06 14:42:00,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,14,2,1
5,2016-01-06 17:15:00,1/6/2016 17:19,Business,West Palm Beach,West Palm Beach,4.3,Meal/Entertain,17,2,1


In [59]:
data = data[data["MILES*"] > 0]

In [61]:
data["distance_km"] = data["MILES*"] * 1.6

In [62]:
data["fare"] = 25 + (data["distance_km"] * 15)

In [63]:
traffic_levels = ["Low", "Medium", "High"]
data["traffic_level"] = np.random.choice(traffic_levels, len(data))

In [67]:
print("\nâœ… After Feature Engineering:")
print(data.head())


âœ… After Feature Engineering:
          START_DATE*       END_DATE* CATEGORY*           START*  \
0 2016-01-01 21:11:00  1/1/2016 21:17  Business      Fort Pierce   
2 2016-01-02 20:25:00  1/2/2016 20:38  Business      Fort Pierce   
3 2016-01-05 17:31:00  1/5/2016 17:45  Business      Fort Pierce   
4 2016-01-06 14:42:00  1/6/2016 15:49  Business      Fort Pierce   
5 2016-01-06 17:15:00  1/6/2016 17:19  Business  West Palm Beach   

             STOP*  MILES*         PURPOSE*  hour  day_of_week  month  \
0      Fort Pierce     5.1   Meal/Entertain    21            4      1   
2      Fort Pierce     4.8  Errand/Supplies    20            5      1   
3      Fort Pierce     4.7          Meeting    17            1      1   
4  West Palm Beach    63.7   Customer Visit    14            2      1   
5  West Palm Beach     4.3   Meal/Entertain    17            2      1   

   distance_km    fare traffic_level  
0         8.16   147.4           Low  
2         7.68   140.2        Medium  
3  

In [68]:
from sklearn.preprocessing import LabelEncoder

In [69]:
le = LabelEncoder()
data["traffic_level"] = le.fit_transform(data["traffic_level"])

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [74]:
x =data[['distance_km','hour','traffic_level']]
y =data['fare']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [76]:
model = LinearRegression()
model.fit(X_train, y_train)

In [77]:
y_pred = model.predict(X_test)

In [78]:
print("\nðŸš– Model Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RÂ² Score:", r2_score(y_test, y_pred))


ðŸš– Model Performance:
MAE: 1.5187172978078477e-14
RÂ² Score: 1.0


In [82]:
import joblib

In [89]:
joblib.dump(model, "ride_price_model.pkl")
print("âœ… Model saved successfully!")

âœ… Model saved successfully!


In [90]:
model = joblib.load("ride_price_model.pkl")
print("âœ… Model loaded successfully!")

âœ… Model loaded successfully!


In [91]:
sample = pd.DataFrame({"distance_km": [8],"hour": [21],"traffic_level": [le.transform(["High"])[0]]})

In [92]:
predicted_fare = model.predict(sample)
print(f"\nðŸ’° Predicted Fare for Ride: â‚¹{predicted_fare[0]:.2f}")


ðŸ’° Predicted Fare for Ride: â‚¹145.00
