In [1]:
import pandas as pd

# Load the ride records from a CSV file located relative to the working directory.
uber_dataset = pd.read_csv("uber_dataset.csv")

In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
uber_dataset.shape

(200000, 8)

In [3]:
# Names of the columns whose dtype is selected by the 'int' specifier.
col = uber_dataset.select_dtypes(include='int').columns

In [4]:
# Number of column labels held in `col`.
len(col)

2

In [5]:
# Count the rows whose drop-off longitude is missing.
dropoff_longitude_missing = uber_dataset['dropoff_longitude'].isna()
missing_values = dropoff_longitude_missing.sum()
print(missing_values)

1


In [6]:
# Look up the storage dtype of the pickup timestamp column as loaded.
pickup_datetime_dtype = uber_dataset.dtypes['pickup_datetime']
print(pickup_datetime_dtype)

object


In [7]:
# Remove rows with no recorded fare. This mutates uber_dataset in place, so
# every later cell sees the reduced frame.
uber_dataset.dropna(subset=['fare_amount'], inplace=True)

# Mean fare over the remaining rides.
fare_amounts = uber_dataset['fare_amount']
average_fare = fare_amounts.mean()

print("Average Fare Amount:", average_fare)

Average Fare Amount: 11.359955250000626


In [8]:
import pandas as pd
import numpy as np

def haversine(lat1, lon1, lat2, lon2, radius_of_earth=6371):
    """Return the great-circle distance between two points on a sphere.

    The inputs may be scalars or array-likes (NumPy arrays, pandas Series);
    every step uses NumPy ufuncs, so array inputs are processed element-wise.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    radius_of_earth : float, optional
        Sphere radius; the result is expressed in the same unit.
        Defaults to 6371 (Earth's radius in kilometers).

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, in the unit of `radius_of_earth`.
        NaN inputs propagate to NaN outputs.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Mathematically a == (1 - cos(central_angle)) / 2, so it lies in [0, 1].
    # Floating-point rounding can push it marginally outside that interval
    # (e.g. for antipodal points), which would make a square root below NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = radius_of_earth * c

    return distance

# Calculate the Haversine distance for every ride in a single vectorized call.
# haversine() is built entirely from NumPy ufuncs, so it accepts whole columns;
# this avoids the per-row Python overhead of DataFrame.apply(axis=1).
uber_dataset['haversine_distance'] = haversine(
    uber_dataset['pickup_latitude'],
    uber_dataset['pickup_longitude'],
    uber_dataset['dropoff_latitude'],
    uber_dataset['dropoff_longitude'],
)

# Calculate the median Haversine distance (rows with missing coordinates
# yield NaN distances, which median() skips by default)
median_distance = uber_dataset['haversine_distance'].median()

# Print the median Haversine distance
print("Median Haversine Distance:", median_distance)


Median Haversine Distance: 2.1209923961833708


In [9]:
# Longest pickup-to-drop-off distance present in the data.
ride_distances = uber_dataset['haversine_distance']
max_distance = ride_distances.max()

print("Maximum Haversine Distance:", max_distance)

Maximum Haversine Distance: 16409.239135313168


In [10]:
# Flag the rides whose computed distance is exactly zero, then count the flags.
is_zero_distance = uber_dataset['haversine_distance'].eq(0.0)
rides_with_zero_distance = is_zero_distance.sum()

print("Number of rides with 0.0 Haversine Distance:", rides_with_zero_distance)

Number of rides with 0.0 Haversine Distance: 5632


In [16]:
# Keep only the rides whose computed distance is exactly zero.
zero_distance_mask = uber_dataset['haversine_distance'].eq(0.0)
zero_distance_rides = uber_dataset.loc[zero_distance_mask]

# Average fare charged on those rides.
zero_distance_fares = zero_distance_rides['fare_amount']
mean_fare_for_zero_distance_rides = zero_distance_fares.mean()

print("Mean 'fare_amount' for rides with 0.0 Haversine Distance:", mean_fare_for_zero_distance_rides)

Mean 'fare_amount' for rides with 0.0 Haversine Distance: 11.585317826704578


In [11]:
# Highest fare recorded for any single ride.
all_fares = uber_dataset['fare_amount']
max_fare_amount = all_fares.max()

print("Maximum 'fare_amount' for a ride:", max_fare_amount)

Maximum 'fare_amount' for a ride: 499.0


In [13]:
# Find the row with the maximum 'fare_amount'
# Select every row that carries the highest fare.
highest_fare = uber_dataset['fare_amount'].max()
costliest_ride = uber_dataset[uber_dataset['fare_amount'] == highest_fare]

# Take the coordinates of the first such row, in the argument order that
# haversine() expects, and compute its distance.
coordinate_columns = ['pickup_latitude', 'pickup_longitude',
                      'dropoff_latitude', 'dropoff_longitude']
costliest_coordinates = [costliest_ride[name].values[0] for name in coordinate_columns]
costliest_ride_distance = haversine(*costliest_coordinates)

print("Haversine Distance for the Costliest Ride:", costliest_ride_distance)


Haversine Distance for the Costliest Ride: 0.0007899213191009994


In [14]:
# Find the row with the maximum 'fare_amount'
# NOTE(review): this cell repeats the preceding cell's computation verbatim.

# Rows whose fare equals the dataset-wide maximum.
is_costliest = uber_dataset['fare_amount'] == uber_dataset['fare_amount'].max()
costliest_ride = uber_dataset[is_costliest]

# Distance of the first matching row.
pickup_lat = costliest_ride['pickup_latitude'].values[0]
pickup_lon = costliest_ride['pickup_longitude'].values[0]
dropoff_lat = costliest_ride['dropoff_latitude'].values[0]
dropoff_lon = costliest_ride['dropoff_longitude'].values[0]
costliest_ride_distance = haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

print("Haversine Distance for the Costliest Ride:", costliest_ride_distance)


Haversine Distance for the Costliest Ride: 0.0007899213191009994


In [15]:
# Parse the pickup timestamps of uber_dataset into datetimes.
uber_dataset['pickup_datetime'] = pd.to_datetime(uber_dataset['pickup_datetime'])

# Store the calendar year of each pickup as its own column.
pickup_times = uber_dataset['pickup_datetime']
uber_dataset['pickup_year'] = pickup_times.dt.year

# Count the rides whose pickup falls in 2014.
in_2014 = uber_dataset['pickup_year'].eq(2014)
rides_in_2014 = in_2014.sum()

print("Number of rides recorded in the year 2014:", rides_in_2014)


Number of rides recorded in the year 2014: 29968


In [16]:
# Parse the pickup timestamps of uber_dataset into datetimes.
uber_dataset['pickup_datetime'] = pd.to_datetime(uber_dataset['pickup_datetime'])

# Store the calendar quarter (1-4) of each pickup as its own column.
pickup_times = uber_dataset['pickup_datetime']
uber_dataset['pickup_quarter'] = pickup_times.dt.quarter

# Count the rides in Q1 of 2014. This relies on the 'pickup_year' column
# already being present on uber_dataset.
in_2014 = uber_dataset['pickup_year'].eq(2014)
in_first_quarter = uber_dataset['pickup_quarter'].eq(1)
rides_in_first_quarter_2014 = (in_2014 & in_first_quarter).sum()

print("Number of rides recorded in the first quarter of 2014:", rides_in_first_quarter_2014)


Number of rides recorded in the first quarter of 2014: 7687


In [17]:
# Parse the pickup timestamps of uber_dataset into datetimes.
uber_dataset['pickup_datetime'] = pd.to_datetime(uber_dataset['pickup_datetime'])

# Store the calendar year and month of each pickup as their own columns.
pickup_times = uber_dataset['pickup_datetime']
uber_dataset['pickup_year'] = pickup_times.dt.year
uber_dataset['pickup_month'] = pickup_times.dt.month

# Keep only the rides picked up in September 2010.
in_september_2010 = uber_dataset['pickup_year'].eq(2010) & uber_dataset['pickup_month'].eq(9)
september_2010_rides = uber_dataset[in_september_2010]

# Tally the rides per weekday name and pick the weekday with the most rides.
september_weekdays = september_2010_rides['pickup_datetime'].dt.day_name()
day_of_week_counts = september_weekdays.value_counts()
max_rides_day = day_of_week_counts.idxmax()

print("Day of the week in September 2010 with maximum rides:", max_rides_day)


Day of the week in September 2010 with maximum rides: Thursday
