## Imports

In [1]:
import getpass
import json
import os

import pandas as pd
import pymongo
from bson.json_util import dumps
from pymongo import MongoClient

def get_path(dataset_name,env_name='colab'):
    """Return where a dataset file can be read from in the given environment.

    Args:
        dataset_name: File name of the dataset, e.g. 'cleaned_taxi_trip.csv'.
        env_name: 'colab' (the default) selects the raw GitHub URL of the
            course repository; any other value selects a path relative to
            the local '../Datasets/' directory.

    Returns:
        The URL or relative path as a string.
    """
    # Anything that is not 'colab' is treated as a local checkout.
    if env_name != 'colab':
        return f'../Datasets/{dataset_name}'

    remote_root = 'https://raw.githubusercontent.com/John-Ghaly88/Big_Data_and_NoSQL/main/Datasets/Assessment/'
    return remote_root + dataset_name

## Connecting to MongoDB

In [2]:
# Connecting to the MongoDB cluster.
# The connection URI contains the database user's password, so it must never be
# written into the notebook: it is read from the MONGODB_URI environment
# variable, and prompted for (without echo) when that variable is not set.
# NOTE(review): a password was previously committed in this cell; it should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI") or getpass.getpass("MongoDB connection URI: ")
cluster = MongoClient(mongodb_uri)
# Selecting the database.
# NOTE(review): "Assignemnt_2" looks misspelled, but the name has to match the
# database the trips were inserted into — confirm before renaming.
db = cluster["Assignemnt_2"]
# Selecting the collection that the queries below run against
collection = db["taxis"]

## Reading data

In [3]:
# Reading the cleaned taxi-trip dataset into a DataFrame.
# pandas is imported in the imports cell at the top of the notebook, so the
# notebook's dependencies are all declared in one place.
# get_path() is called with its default env_name ('colab'), so the CSV is
# fetched from the raw GitHub URL.
# NOTE(review): the first CSV column is loaded as "Unnamed: 0" (see the
# displayed frame); it looks like a saved row index — consider
# pd.read_csv(..., index_col=0) if it should not be kept as a data column.
df = pd.read_csv(get_path('cleaned_taxi_trip.csv'))

# Number of rows (i) and columns (j) of the loaded frame
i,j = df.shape

df

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,zone_id,dropoff_location_id,zone_name,borough,duration,time_of_day
0,1,"05/11/2018, 17:40:00","05/11/2018, 17:55:00",1,1.60,1,11.5,1.0,0.5,0.00,0.00,0.3,48,68,Clinton East,Manhattan,15.0,evening
1,1,"08/26/2018, 10:24:00","08/26/2018, 10:32:00",2,1.20,2,7.5,0.0,0.5,0.00,0.00,0.3,48,43,Clinton East,Manhattan,8.0,morning
2,2,"11/21/2018, 22:25:00","11/21/2018, 22:42:00",1,2.40,2,12.5,0.5,0.5,0.00,0.00,0.3,48,137,Clinton East,Manhattan,17.0,night
3,1,"06/15/2018, 06:26:00","06/15/2018, 06:34:00",1,1.10,1,7.0,0.0,0.5,2.30,0.00,0.3,48,162,Clinton East,Manhattan,8.0,morning
4,2,"12/05/2018, 18:41:00","12/05/2018, 18:53:00",2,1.35,1,9.0,1.0,0.5,3.00,0.00,0.3,48,68,Clinton East,Manhattan,12.0,evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19215,1,"12/24/2018, 22:28:00","12/24/2018, 22:46:00",1,7.40,1,23.5,0.5,0.5,0.00,0.00,0.3,124,28,Howard Beach,Queens,18.0,night
19216,2,"04/24/2018, 07:54:00","04/24/2018, 08:02:00",1,1.41,2,7.0,0.0,0.5,0.00,0.00,0.3,196,82,Rego Park,Queens,8.0,morning
19217,2,"08/21/2018, 20:33:00","08/21/2018, 20:42:00",2,1.38,1,8.0,0.5,0.5,0.00,0.00,0.3,178,26,Ocean Parkway South,Brooklyn,9.0,evening
19218,2,"05/03/2018, 22:25:00","05/03/2018, 22:43:00",6,9.30,1,52.0,0.0,0.5,11.71,5.76,0.3,93,233,Flushing Meadows-Corona Park,Queens,18.0,night


## Insertion

In [5]:
# Inserting the data into the DB: every DataFrame row is converted to a dict
# and all rows are written with a single insert_many call.
# NOTE(review): presumably left commented out so that re-running the notebook
# does not insert the same rows a second time — confirm, and only uncomment it
# for the initial load into an empty collection.
# collection.insert_many(df.apply(lambda x: x.to_dict(), axis=1).to_list())

## Total Trip cost

In [6]:
# Computing the total cost of every trip on the server: the pipeline projects
# the six charge fields of each document and adds their sum as "total_trip_cost".
charge_fields = ["fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "imp_surcharge"]

# Pass each charge field through unchanged ("name": "$name") ...
projection = {field: "$" + field for field in charge_fields}
# ... and append the computed total as the last projected field.
projection["total_trip_cost"] = {"$sum": ["$" + field for field in charge_fields]}

total_trip_cost = collection.aggregate([{"$project": projection}])

# Drain the cursor into a list so the per-trip results can be iterated below
result_list = list(total_trip_cost)

# Writing the total back to the DB as a new field (disabled; issues one update per document):
# for i in result_list:
#     collection.update_one({'_id': i.get("_id")}, {"$set": {'total_trip_cost': i.get("total_trip_cost")}})

## Most common payment type

In [7]:
# The most common payment type used per time of day
result = collection.aggregate([
    {
        # Only consider trips that carry a time_of_day value
        "$match":{
            "time_of_day": {"$exists": True}
        }
    },
    {
        # Count the trips of every (time_of_day, payment_type) combination
        "$group": {"_id": {"time_of_day": "$time_of_day", "payment_type": "$payment_type"}, "count": {"$count":{}}}
    },
    {
        # -1 sorts descending, so for each time of day its most used payment type comes first
        "$sort": {"count": -1}
    },
    {
        # Keep only the first (= highest count) combination of each time of day.
        # A plain "$limit": 4 would return the 4 largest combinations overall, which
        # may contain the same time of day twice and miss another one entirely.
        "$group": {"_id": "$_id.time_of_day", "top": {"$first": "$$ROOT"}}
    },
    {
        # Restore the document shape {_id: {time_of_day, payment_type}, count}
        "$replaceRoot": {"newRoot": "$top"}
    },
    {
        # $group does not guarantee output order, so sort the winners again for a stable listing
        "$sort": {"count": -1}
    }
])

print(list(result))

[{'_id': {'time_of_day': 'morning', 'payment_type': 1}, 'count': 6858}, {'_id': {'time_of_day': 'afternoon', 'payment_type': 1}, 'count': 6672}, {'_id': {'time_of_day': 'night', 'payment_type': 1}, 'count': 6580}, {'_id': {'time_of_day': 'evening', 'payment_type': 1}, 'count': 6578}]


## Average tip amount

In [8]:
# The average tip amount per passenger count
result = collection.aggregate([
    {
        "$match":{
            "passenger_count": {"$exists": True}
        }
    },
    {
        "$group": {"_id": {"passenger_count": "$passenger_count"}, "avg": {"$avg": "$tip_amount"}}
    },
    {
        "$sort": {"avg": -1}
    }
])

print(list(result))

[{'_id': {'passenger_count': 4}, 'avg': 1.9235492957746478}, {'_id': {'passenger_count': 6}, 'avg': 1.8951905626134302}, {'_id': {'passenger_count': 5}, 'avg': 1.8295862068965518}, {'_id': {'passenger_count': 1}, 'avg': 1.824198250728863}, {'_id': {'passenger_count': 2}, 'avg': 1.7997718631178707}, {'_id': {'passenger_count': 3}, 'avg': 1.7691456077015644}]


## Best 5 pick up locations

In [9]:
# The best 5 locations for drivers to pick up passengers from
result = collection.aggregate([
    {
        "$match":{
            "zone_name": {"$exists": True}
        }
    },
    {
        "$group": {"_id": {"zone_name": "$zone_name"}, "count": {"$count":{}}}
    },
    {
        "$sort": {"count": -1}
    },
    {
        "$limit": 5
    }
])
        
print(list(result))

[{'_id': {'zone_name': 'Upper East Side South'}, 'count': 1582}, {'_id': {'zone_name': 'Midtown Center'}, 'count': 1488}, {'_id': {'zone_name': 'Upper East Side North'}, 'count': 1426}, {'_id': {'zone_name': 'Times Sq/Theatre District'}, 'count': 1384}, {'_id': {'zone_name': 'Midtown East'}, 'count': 1334}]
