# Load dataset

In [20]:
import pandas as pd

# Read the raw trip records from the CSV file in the working directory.
data = pd.read_csv('uber.csv')

# Preview the first five rows to confirm the columns loaded as expected.
print(data.head(n=5))

   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-05-07 19:52:06.0000003          7.5   
1    27835199    2009-07-17 20:04:56.0000002          7.7   
2    44984355   2009-08-24 21:45:00.00000061         12.9   
3    25894730    2009-06-26 08:22:21.0000001          5.3   
4    17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  
2         -73.962565         40.772647                1  
3         

# Clean dataset

In [21]:
# Drop the leftover CSV index column (ignored if it is already gone, so the
# cell can be re-run) and discard every row that has a missing value.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data = data.dropna()

# Inclusive bounds, in degrees, for coordinates that can exist on the globe.
# NOTE(review): these bounds are global, so rows whose latitude/longitude look
# swapped (e.g. a latitude near -74 or a longitude near 40) still pass this
# filter -- consider tighter, region-specific bounds; confirm with the data owner.
valid_longitude_range = (-180, 180)
valid_latitude_range = (-90, 90)

# True for trips whose pickup AND dropoff coordinates all fall within bounds.
coordinates_in_range = (
    data['pickup_longitude'].between(*valid_longitude_range)
    & data['pickup_latitude'].between(*valid_latitude_range)
    & data['dropoff_longitude'].between(*valid_longitude_range)
    & data['dropoff_latitude'].between(*valid_latitude_range)
)

# True for fares between 0 and 1000, both ends inclusive.
fare_in_range = data['fare_amount'].between(0, 1000)

# Boolean-mask selection yields a frame that pandas may treat as derived from
# the original; the explicit .copy() makes `data` an independent frame so that
# adding columns to it later does not raise SettingWithCopyWarning.
data = data[coordinates_in_range & fare_in_range].copy()

# Display dataset information

In [22]:
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line.
print("Dataset Info:")
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 199970 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                199970 non-null  object 
 1   fare_amount        199970 non-null  float64
 2   pickup_datetime    199970 non-null  object 
 3   pickup_longitude   199970 non-null  float64
 4   pickup_latitude    199970 non-null  float64
 5   dropoff_longitude  199970 non-null  float64
 6   dropoff_latitude   199970 non-null  float64
 7   passenger_count    199970 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB
None


# Summary statistics

In [23]:
# Print a heading followed by describe()'s summary of the numeric columns
# (count, mean, std, min, quartiles, max), each on its own line.
print("\nSummary Statistics:", data.describe(), sep="\n")


Summary Statistics:
         fare_amount  pickup_longitude  pickup_latitude  dropoff_longitude  \
count  199970.000000     199970.000000    199970.000000      199970.000000   
mean       11.362259        -72.501661        39.917866         -72.511852   
std         9.897210         10.450391         6.130667          10.411364   
min         0.000000        -93.824668       -74.015515         -75.458979   
25%         6.000000        -73.992065        40.734793         -73.991407   
50%         8.500000        -73.981822        40.752592         -73.980092   
75%        12.500000        -73.967154        40.767157         -73.963657   
max       499.000000         40.808425        48.018760          40.831932   

       dropoff_latitude  passenger_count  
count     199970.000000    199970.000000  
mean          39.922164         1.684503  
std            6.117273         1.385981  
min          -74.015750         0.000000  
25%           40.733824         1.000000  
50%           40.7

# Check columns of interest

In [24]:
# Coordinate, passenger and fare columns to summarise together.
columns_of_interest = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'fare_amount',
]

# Only summarise when every listed column is present; otherwise say so
# instead of failing with a KeyError on the column selection.
if not set(columns_of_interest).issubset(data.columns):
    print("\nSome columns of interest are missing in the dataset.")
else:
    print(data[columns_of_interest].describe())

       pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
count     199970.000000    199970.000000      199970.000000     199970.000000   
mean         -72.501661        39.917866         -72.511852         39.922164   
std           10.450391         6.130667          10.411364          6.117273   
min          -93.824668       -74.015515         -75.458979        -74.015750   
25%          -73.992065        40.734793         -73.991407         40.733824   
50%          -73.981822        40.752592         -73.980092         40.753042   
75%          -73.967154        40.767157         -73.963657         40.768001   
max           40.808425        48.018760          40.831932         45.031598   

       passenger_count    fare_amount  
count    199970.000000  199970.000000  
mean          1.684503      11.362259  
std           1.385981       9.897210  
min           0.000000       0.000000  
25%           1.000000       6.000000  
50%           1.000000       8

# Processing time feature

In [15]:
# Guard against a missing 'pickup_datetime' column before previewing it:
# report its absence, otherwise show the first few raw values.
if 'pickup_datetime' not in data.columns:
    print("The column 'pickup_datetime' is not present in the dataset.")
else:
    print(data['pickup_datetime'].head())

0    2015-05-07 19:52:06 UTC
1    2009-07-17 20:04:56 UTC
2    2009-08-24 21:45:00 UTC
3    2009-06-26 08:22:21 UTC
4    2014-08-28 17:47:00 UTC
Name: pickup_datetime, dtype: object


In [16]:
# Parse the pickup timestamps. With errors='coerce', any value that cannot be
# parsed becomes NaT instead of raising.
pickup_ts = pd.to_datetime(data['pickup_datetime'], errors='coerce')

# Surface coerced values rather than letting them pass silently: a NaT
# timestamp yields NaN for both derived features below.
n_unparsed = int(pickup_ts.isna().sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} pickup_datetime value(s) could not be parsed (NaT).")

# Build the result with assign() instead of setting columns on `data` in place:
# `data` may be a filtered selection of an earlier frame, and column assignment
# on such a frame can raise SettingWithCopyWarning. Column order is unchanged
# (pickup_datetime is replaced in position; hour and day_of_week are appended).
# NOTE(review): the sample timestamps carry a "UTC" suffix, so `hour` is
# presumably the UTC hour rather than local time -- confirm before using it as
# a time-of-day feature.
data = data.assign(
    pickup_datetime=pickup_ts,
    hour=pickup_ts.dt.hour,              # hour of day, 0-23
    day_of_week=pickup_ts.dt.dayofweek,  # Monday=0 ... Sunday=6
)

# Quick sanity check of the two derived features.
print(data[['hour', 'day_of_week']].head())
print(data['hour'].describe())
print(data['day_of_week'].describe())

   hour  day_of_week
0    19            3
1    20            4
2    21            0
3     8            4
4    17            3
count    199970.000000
mean         13.491504
std           6.515409
min           0.000000
25%           9.000000
50%          14.000000
75%          19.000000
max          23.000000
Name: hour, dtype: float64
count    199970.000000
mean          3.048422
std           1.946984
min           0.000000
25%           1.000000
50%           3.000000
75%           5.000000
max           6.000000
Name: day_of_week, dtype: float64
