# Data Analyst Intern with Uber

In [1]:
import pandas as pd

In [2]:
# Load the sample ride data into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks specific to one runtime
# environment — confirm it exists before re-running elsewhere.
df = pd.read_csv("/content/uber_rides_data.xlsx - sample_train.csv")
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


What is the shape of the given dataset?


In [3]:
# (rows, columns) of the frame as loaded
df.shape

(200000, 8)

How many integer columns (by default) are there in the dataset?


In [4]:
# Per-column dtypes as inferred on load; the int64 entries are the integer columns
df.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

How many missing values exist in the 'dropoff_longitude' column?


In [5]:
# Number of missing entries in the drop-off longitude column
df['dropoff_longitude'].isna().sum()


1

What is the data type of the 'pickup_datetime' feature in your data?


In [8]:
# dtype of the raw pickup timestamp column (before any datetime conversion)
df['pickup_datetime'].dtype

dtype('O')

Which of the following is the correct syntax to convert 'pickup_datetime' to datetime datatype?


Which function can be used to remove null values from the dataframe?


In [10]:
# Discard every row that has at least one missing value (this modifies df itself),
# then re-count the nulls per column to confirm none are left.
df.dropna(axis=0, how='any', inplace=True)
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

What is the average fare amount?


In [11]:
# Mean fare over all remaining rides, reported to two decimals
average_fare = df.loc[:, 'fare_amount'].mean()
print(f'Average Fare Amount: ${average_fare:.2f}')


Average Fare Amount: $11.36


Calculate distance between each pickup and dropoff points using Haversine formula.
What is the median haversine distance between pickup and dropoff location according to the given dataset?

Read about Haversine Distance here: https://en.wikipedia.org/wiki/Haversine_formula


In [12]:
import numpy as np


# Define the Haversine distance calculation function
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points via the Haversine formula.

    Works on scalars as well as array-likes (NumPy arrays, pandas Series);
    array inputs are processed element-wise and may be broadcast against
    scalars.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    radius_km : float, optional
        Radius of the sphere. Defaults to 6371.0, the Earth's radius in
        kilometers.

    Returns
    -------
    float or array-like
        Distance(s) in the same unit as ``radius_km``.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN. Only the upper bound is clamped so
    # that out-of-range latitudes (which can drive `a` negative) still surface as NaN.
    a = np.minimum(a, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c

# Calculate the Haversine distance for every ride in one vectorized call.
# haversine_distance is built from NumPy ufuncs, so whole columns can be passed
# directly instead of invoking it once per row through DataFrame.apply(axis=1).
df['haversine_distance'] = haversine_distance(
    df['pickup_latitude'],
    df['pickup_longitude'],
    df['dropoff_latitude'],
    df['dropoff_longitude'],
)

# Find the median Haversine distance
median_haversine_distance = df['haversine_distance'].median()

print(f'Median Haversine Distance between Pickup and Dropoff Locations: {median_haversine_distance:.2f} kilometers')


Median Haversine Distance between Pickup and Dropoff Locations: 2.12 kilometers


What is the maximum haversine distance between pickup and dropoff location according to the given dataset?


In [13]:
# Largest pickup-to-dropoff distance present in the data
max_haversine_distance = df.loc[:, 'haversine_distance'].max()

print(f'Maximum Haversine Distance between Pickup and Dropoff Locations: {max_haversine_distance:.2f} kilometers')


Maximum Haversine Distance between Pickup and Dropoff Locations: 16409.24 kilometers


How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset?


In [14]:
# Rides whose pickup and drop-off coordinates give a distance of exactly zero
rides_with_zero_distance = df.loc[df['haversine_distance'].eq(0.0)]
num_rides_with_zero_distance = rides_with_zero_distance.shape[0]

print(f'Number of rides with 0.0 Haversine Distance: {num_rides_with_zero_distance}')


Number of rides with 0.0 Haversine Distance: 5632


What is the mean 'fare_amount' for rides with 0 haversine distance?


In [15]:
# Average fare charged on the zero-distance rides selected above
mean_fare_for_zero_distance = rides_with_zero_distance.loc[:, 'fare_amount'].mean()

print(f'Mean Fare Amount for Rides with 0.0 Haversine Distance: ${mean_fare_for_zero_distance:.2f}')


Mean Fare Amount for Rides with 0.0 Haversine Distance: $11.59


What is the maximum 'fare_amount' for a ride?


In [16]:
# Highest fare charged for any single ride
max_fare_amount = df.loc[:, 'fare_amount'].max()

print(f'Maximum Fare Amount for a Ride: ${max_fare_amount:.2f}')


Maximum Fare Amount for a Ride: $499.00


What is the haversine distance between pickup and dropoff location for the costliest ride?


In [18]:
# First ride whose fare equals the maximum fare found earlier
costliest_ride = df.loc[df['fare_amount'].eq(max_fare_amount)].iloc[0]

# Recompute the distance for that single ride from its own coordinates
haversine_distance_costliest_ride = haversine_distance(
    lat1=costliest_ride['pickup_latitude'],
    lon1=costliest_ride['pickup_longitude'],
    lat2=costliest_ride['dropoff_latitude'],
    lon2=costliest_ride['dropoff_longitude'],
)

print(f'Haversine Distance for the Costliest Ride: {haversine_distance_costliest_ride:} kilometers')


Haversine Distance for the Costliest Ride: 0.0007899213191009993 kilometers


How many rides were recorded in the year 2014?


In [19]:
# Parse the pickup timestamps into pandas datetimes
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Keep the calendar year of each pickup as its own column
df['pickup_year'] = df['pickup_datetime'].dt.year

# Tally the rides whose pickup year is 2014
rides_in_2014 = int(df['pickup_year'].eq(2014).sum())

print(f'Number of Rides Recorded in the Year 2014: {rides_in_2014}')


Number of Rides Recorded in the Year 2014: 29968


How many rides were recorded in the first quarter of 2014?


In [20]:
# Rides picked up in 2014 whose pickup date falls in calendar quarter 1 (January-March)
q1_2014_rides = df.loc[df['pickup_year'].eq(2014) & df['pickup_datetime'].dt.quarter.eq(1)]

# Size of that subset
num_q1_2014_rides = q1_2014_rides.shape[0]

print(f'Number of Rides Recorded in the First Quarter of 2014: {num_q1_2014_rides}')


Number of Rides Recorded in the First Quarter of 2014: 7687


On which day of the week in September 2010 was the maximum number of rides recorded?


In [21]:
# Derive the month number and the weekday name from 'pickup_datetime'
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_day_of_week'] = df['pickup_datetime'].dt.day_name()

# Keep only the rides picked up in September 2010
september_2010_rides = df.loc[df['pickup_year'].eq(2010) & df['pickup_month'].eq(9)]

# Ride count per weekday name
rides_by_day = september_2010_rides['pickup_day_of_week'].value_counts()

# Weekday carrying the highest count
max_rides_day = rides_by_day.idxmax()

print(f'Day in September 2010 with Maximum Rides: {max_rides_day}')


Day in September 2010 with Maximum Rides: Thursday


Apply a machine learning algorithm to predict the fare amount given the following input features:
passenger_count, distance and ride_week_day.

Perform a 70-30 split of data.

Which algorithm gives the least adjusted R-squared value?

In [23]:
# Preview the frame again, now including the derived distance and date-part columns
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance,pickup_year,pickup_month,pickup_day_of_week
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,2015,5,Thursday
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,2009,7,Friday
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,2009,8,Monday
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,2009,6,Friday
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,2014,8,Thursday


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Feature matrix (passenger count, trip distance, weekday name) and target (fare)
X = df[['passenger_count', 'haversine_distance', 'pickup_day_of_week']]
y = df['fare_amount']

# One-hot encode the categorical 'pickup_day_of_week' column; drop_first removes
# one indicator column since it is implied by the others
X_encoded = pd.get_dummies(X, columns=['pickup_day_of_week'], drop_first=True)

# Split the data into a 70-30 train-test split (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Initialize the models. The tree-based estimators are randomized, so they are
# seeded as well; otherwise the scores (and potentially the ranking) can change
# from one run to the next.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(random_state=42),
    'KNN Regression': KNeighborsRegressor()
}

adjusted_r2_scores = {}

# Train each model and score it on the held-out 30%
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    n = len(y_test)        # number of evaluation samples
    p = X_test.shape[1]    # number of predictors after encoding
    # Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
    adjusted_r2_scores[model_name] = adjusted_r2

# Find the model with the least adjusted R-squared value
worst_model = min(adjusted_r2_scores, key=adjusted_r2_scores.get)

print(f"The model with the least adjusted R-squared value is: {worst_model}")


The model with the least adjusted R-squared value is: Linear Regression
