# Uber Fares Dataset - Data Preparation with Python

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
pd.set_option('display.max_columns', None)


## Load the Dataset

In [3]:
# Replace with the actual file name if different
df = pd.read_csv("uber.csv")
print("Dataset loaded successfully.")
df.head()


Dataset loaded successfully.


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


## Explore the Dataset

In [4]:
print("Shape of the dataset:", df.shape)
print("\nData Types and Missing Values:")
print(df.info())
print("\nSummary Statistics:")
print(df.describe())
print("\nMissing Values Count:")
print(df.isnull().sum())


Shape of the dataset: (200000, 9)

Data Types and Missing Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
None

Summary Statistics:
         Unnamed: 0    fare_amount  pickup_longitude  pickup_latitude  \
count  2.000000e+05  200000.000000     200000.000000    200000.000000   
mean   2.771250e+07      11.359955        -72.5276

## Clean the Data

In [5]:
# Drop rows with missing values
df_cleaned = df.dropna()

# Filter out unrealistic fare amounts and coordinates
df_cleaned = df_cleaned[
    (df_cleaned['fare_amount'] > 0) &
    (df_cleaned['fare_amount'] < 200) &
    (df_cleaned['pickup_latitude'].between(-90, 90)) &
    (df_cleaned['pickup_longitude'].between(-180, 180))
]

print("Cleaned dataset shape:", df_cleaned.shape)
df_cleaned.head()


Cleaned dataset shape: (199959, 9)


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


## Feature Engineering

In [6]:
# Convert pickup_datetime to datetime type
df_cleaned['pickup_datetime'] = pd.to_datetime(df_cleaned['pickup_datetime'])

# Extract time features
df_cleaned['hour'] = df_cleaned['pickup_datetime'].dt.hour
df_cleaned['day'] = df_cleaned['pickup_datetime'].dt.day
df_cleaned['month'] = df_cleaned['pickup_datetime'].dt.month
df_cleaned['day_of_week'] = df_cleaned['pickup_datetime'].dt.dayofweek

# Add peak/off-peak label
def is_peak(hour):
    return 'Peak' if 7 <= hour <= 9 or 17 <= hour <= 19 else 'Off-Peak'

df_cleaned['peak'] = df_cleaned['hour'].apply(is_peak)

df_cleaned.head()


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,day_of_week,peak
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,19,7,5,3,Peak
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,20,17,7,4,Off-Peak
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,21,24,8,0,Off-Peak
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,8,26,6,4,Peak
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,17,28,8,3,Peak


## Export the Cleaned Dataset

In [9]:
df_cleaned.to_csv("uber.csv", index=False)
print("Cleaned dataset saved as 'uber.csv'")


Cleaned dataset saved as 'uber.csv'


In [10]:
print(df_cleaned.describe())


         Unnamed: 0    fare_amount  pickup_longitude  pickup_latitude  \
count  1.999590e+05  199959.000000     199959.000000    199959.000000   
mean   2.771242e+07      11.351877        -72.503064        39.918635   
std    1.601400e+07       9.733290         10.445643         6.128234   
min    1.000000e+00       0.010000        -93.824668       -74.015515   
25%    1.382523e+07       6.000000        -73.992065        40.734796   
50%    2.774524e+07       8.500000        -73.981823        40.752591   
75%    4.155554e+07      12.500000        -73.967157        40.767157   
max    5.542357e+07     196.000000         40.808425        48.018760   

       dropoff_longitude  dropoff_latitude  passenger_count           hour  \
count      199959.000000     199959.000000    199959.000000  199959.000000   
mean          -72.516941         39.925401         1.684515      13.491726   
std            10.511192          6.197760         1.385999       6.515207   
min          -737.916665      