# Importing libraries

In [2]:
import numpy as np
import pandas as pd

# Reading the data set

In [6]:
# Load the raw ride records; the last expression renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer="uber_rides_data.xlsx - sample_train.csv")
df

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695416,1


# What is the shape of the given dataset?

In [11]:
# (number of rows, number of columns) of the loaded frame.
shape = (len(df.index), len(df.columns))
print(shape)

(200000, 8)


# How many integer columns (by default) are given in the dataset?


In [12]:
# Keep only the columns whose dtype is int64, then count how many there are.
integer_columns = df.select_dtypes(include='int64')
num_integer_columns = len(integer_columns.columns)

print("Number of integer columns:", num_integer_columns)



Number of integer columns: 2


# How many missing values exist in the 'dropoff_longitude' column?


In [13]:
# Flag each missing entry in the column, then total the flags.
missing_values_count = pd.isna(df['dropoff_longitude']).sum()

print("Number of missing values in 'dropoff_longitude' column:", missing_values_count)


Number of missing values in 'dropoff_longitude' column: 1


# What is the data type of the 'pickup_datetime' feature in your data?

In [15]:
# Look the column up in the frame's per-column dtype table.
data_type = df.dtypes['pickup_datetime']

print("Data type of 'pickup_datetime' feature:", data_type)


Data type of 'pickup_datetime' feature: object


# Which of the following is the correct syntax to convert 'pickup_datetime' to datetime datatype?

In [16]:
# Parse the timestamp column and write the parsed values back over the
# original column of the same frame.
parsed_pickup_datetime = pd.to_datetime(df['pickup_datetime'])
df['pickup_datetime'] = parsed_pickup_datetime

print("Data type of 'pickup_datetime' after conversion:", df['pickup_datetime'].dtype)


Data type of 'pickup_datetime' after conversion: datetime64[ns, UTC]


# Which function can be used to remove null values from the dataframe?

In [19]:
# Drop every row that has at least one missing value; `df` itself is left untouched.
df_cleaned = df.dropna(axis=0, how='any')

df_cleaned

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695416,1


# What is the average fare amount?

In [20]:

# Mean fare over the rows that survived the null-removal step.
average_fare = df_cleaned.loc[:, 'fare_amount'].mean()

print("Average fare amount:", average_fare)


Average fare amount: 11.359891549457748


# Calculate distance between each pickup and dropoff points using Haversine formula.

In [21]:
def haversine_distance(lat1, lon1, lat2, lon2, r=6371.0):
    """Great-circle distance between two points using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    r : float, optional
        Sphere radius; the result is expressed in the same unit. Defaults to
        6371.0, the mean Earth radius in kilometres.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) along the sphere's surface. Scalar inputs yield a scalar,
        array-like inputs yield an array (inputs broadcast against each other).
        NaN coordinates propagate to a NaN distance.
    """
    # Convert each argument on its own so scalars and arrays can be mixed.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(angle, dtype=float))
        for angle in (lat1, lon1, lat2, lon2)
    )
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt(1 - a) NaN for near-antipodal points.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # The radius is an explicit parameter so the function does not depend on
    # a global `r` having been defined elsewhere in the kernel.
    distance = r * c

    return distance


# Pass whole coordinate columns at once: haversine_distance is built from
# NumPy ufuncs, so it evaluates all rows in a single vectorised call instead
# of invoking a Python lambda once per row through df.apply(axis=1).
df['haversine_distance'] = haversine_distance(
    df['pickup_latitude'].to_numpy(),
    df['pickup_longitude'].to_numpy(),
    df['dropoff_latitude'].to_numpy(),
    df['dropoff_longitude'].to_numpy(),
)

# Series.median skips NaN distances (rows with a missing coordinate).
median_distance = df['haversine_distance'].median()

print("Median Haversine distance between pickup and dropoff locations:", median_distance, "kilometers")


Median Haversine distance between pickup and dropoff locations: 2.1209923961833708 kilometers


# What is the maximum haversine distance between pickup and dropoff location according to the given dataset?

In [22]:
# Largest pickup-to-dropoff distance found in the computed column.
max_distance = df['haversine_distance'].agg('max')

print("Maximum Haversine distance between pickup and dropoff locations:", max_distance, "kilometers")


Maximum Haversine distance between pickup and dropoff locations: 16409.239135313168 kilometers


# How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset?

In [23]:
# Count the rows whose computed distance is exactly zero.
zero_distance_rides = df['haversine_distance'].eq(0.0).sum()

print("Number of rides with 0.0 Haversine distance:", zero_distance_rides)


Number of rides with 0.0 Haversine distance: 5632


# What is the mean 'fare_amount' for rides with 0 haversine distance?

In [24]:
# Subset of rides whose computed distance is exactly zero.
# NOTE(review): this rebinds `zero_distance_rides` to a DataFrame; the cell
# before it binds the same name to an integer count.
zero_distance_rides = df.loc[df['haversine_distance'].eq(0.0)]

mean_fare_zero_distance = zero_distance_rides.loc[:, 'fare_amount'].mean()

print("Mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_zero_distance)


Mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704546


# What is the maximum 'fare_amount' for a ride?

In [25]:
# Highest fare recorded across all rides.
max_fare_amount = df.loc[:, 'fare_amount'].max()

print("Maximum 'fare_amount' for a ride:", max_fare_amount)


Maximum 'fare_amount' for a ride: 499.0


# What is the haversine distance between pickup and dropoff location for the costliest ride?

In [26]:
# Select every ride whose fare equals the maximum fare, then report the
# distance of the first such ride.
is_costliest = df['fare_amount'].eq(df['fare_amount'].max())
costliest_ride = df.loc[is_costliest]

haversine_distance_costliest_ride = costliest_ride['haversine_distance'].to_numpy()[0]

print("Haversine distance for the costliest ride:", haversine_distance_costliest_ride, "kilometers")


Haversine distance for the costliest ride: 0.0007899213191009994 kilometers


# How many rides were recorded in the year 2014?

In [27]:
# Store the calendar year of each pickup, then count the rows equal to 2014.
df['pickup_year'] = df['pickup_datetime'].dt.year

rides_2014 = df['pickup_year'].eq(2014).sum()

print("Number of rides recorded in the year 2014:", rides_2014)


Number of rides recorded in the year 2014: 29968


# How many rides were recorded in the first quarter of 2014?

In [28]:
# Store the calendar quarter (1-4) of each pickup.
df['pickup_quarter'] = df['pickup_datetime'].dt.quarter

# A ride counts when it falls in 2014 and in quarter 1.
in_2014 = df['pickup_year'].eq(2014)
in_first_quarter = df['pickup_quarter'].eq(1)
rides_first_quarter_2014 = (in_2014 & in_first_quarter).sum()

print("Number of rides recorded in the first quarter of 2014:", rides_first_quarter_2014)


Number of rides recorded in the first quarter of 2014: 7687


# On which day of the week in September 2010 were the most rides recorded?

In [29]:
# Re-apply the datetime conversion so this cell also works on an unparsed column.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Weekday name of every pickup.
pickup_dt = df['pickup_datetime'].dt
df['pickup_day_of_week'] = pickup_dt.day_name()

# Restrict to pickups in September 2010.
in_september_2010 = pickup_dt.year.eq(2010) & pickup_dt.month.eq(9)
rides_september_2010 = df.loc[in_september_2010]

# mode() returns the most frequent value(s) in sorted order; take the first.
weekday_modes = rides_september_2010['pickup_day_of_week'].mode()
max_day_of_week = weekday_modes[0]

print("Day of the week with the maximum rides in September 2010:", max_day_of_week)


Day of the week with the maximum rides in September 2010: Thursday


# Which algorithm gives the lowest adjusted R-squared value?

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Single-feature setup: predict the fare from the passenger count only.
X = df[['passenger_count']]
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# The tree-based estimators get the same fixed seed as the split so the
# results table is reproducible across runs; LinearRegression and
# KNeighborsRegressor take no random_state.
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    KNeighborsRegressor(),
]
results = []

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows and
# p features. The correction factor is the same for every model, so it is
# computed once outside the loop.
n_samples, n_features = X_test.shape
adj_r2_factor = (n_samples - 1) / (n_samples - n_features - 1)

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    adj_r2 = 1 - (1 - r2) * adj_r2_factor
    results.append({'Model': model.__class__.__name__, 'MSE': mse, 'R-squared': r2, 'Adjusted R-squared': adj_r2})


results_df = pd.DataFrame(results)
# Keep every row that ties for the minimum adjusted R^2.
least_adj_r2_model = results_df[results_df['Adjusted R-squared'] == results_df['Adjusted R-squared'].min()]

print("Results:\n", results_df)
print("\nThe model with the least Adjusted R-squared value is:\n", least_adj_r2_model)


Results:
                    Model         MSE  R-squared  Adjusted R-squared
0       LinearRegression  103.942956   0.000116            0.000099
1  DecisionTreeRegressor  103.914414   0.000390            0.000373
2  RandomForestRegressor  103.914238   0.000392            0.000375
3    KNeighborsRegressor  119.406831  -0.148640           -0.148659

The model with the least Adjusted R-squared value is:
                  Model         MSE  R-squared  Adjusted R-squared
3  KNeighborsRegressor  119.406831   -0.14864           -0.148659
