## MongoDB

In [5]:
import csv
from datetime import datetime

import pandas as pd
from pymongo import MongoClient

class TaxiDB:
    """Import taxi CSV files into the MongoDB database ``taxi_db`` and query it.

    Two collections are used: ``location`` (one document per zone-lookup row)
    and ``tripdata`` (one document per trip row).
    """

    # Field names of a ``location`` document, in CSV column order.
    columns_location = ['LocationID', 'Borough', 'Zone', 'service_zone']

    # Field names of a ``tripdata`` document, in CSV column order.
    columns_tripdata = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
                        'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID',
                        'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax',
                        'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']

    # Positions within ``columns_tripdata`` and the type each one is parsed to;
    # the one remaining position (store_and_fwd_flag) stays a string.
    _TRIPDATA_INT_POSITIONS = (0, 3, 5, 7, 8, 9)
    _TRIPDATA_FLOAT_POSITIONS = (4, 10, 11, 12, 13, 14, 15, 16)
    _TRIPDATA_DATETIME_POSITIONS = (1, 2)
    _DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self):
        """Create a client for ``localhost:27017`` and keep a handle to ``taxi_db``."""
        self.connect_db = MongoClient('localhost', 27017)['taxi_db']

    @staticmethod
    def _csv_rows(csv_file, start_line):
        """Yield the fields of every non-blank line from index *start_line* on.

        *csv_file* is an open text file; *start_line* is the 0-based index of
        the first physical line to parse (earlier lines are discarded).
        """
        for _ in range(start_line):
            if next(csv_file, None) is None:
                return  # file is shorter than start_line: nothing to yield
        # csv.reader (rather than slicing and splitting by hand) keeps the
        # last character of a final line that has no trailing newline and
        # keeps commas that sit inside quoted fields.
        for fields in csv.reader(csv_file):
            if fields:  # csv.reader yields [] for a blank line
                yield fields

    def _location_document(self, fields):
        """Build a ``location`` document from the fields of one CSV line."""
        values = list(fields)
        values[0] = int(values[0])  # LocationID is the only numeric column
        # Indexing (not zip) so a line with too few fields still raises.
        return {name: values[pos] for pos, name in enumerate(self.columns_location)}

    def _tripdata_document(self, fields):
        """Build a ``tripdata`` document from the fields of one CSV line."""
        values = list(fields)
        for pos in self._TRIPDATA_INT_POSITIONS:
            values[pos] = int(values[pos])
        for pos in self._TRIPDATA_FLOAT_POSITIONS:
            values[pos] = float(values[pos])
        for pos in self._TRIPDATA_DATETIME_POSITIONS:
            values[pos] = datetime.strptime(values[pos], self._DATETIME_FORMAT)
        # Indexing (not zip) so a line with too few fields still raises.
        return {name: values[pos] for pos, name in enumerate(self.columns_tripdata)}

    def insert_location_in_file(self, path, start_line=1):
        """Insert every zone-lookup line of the CSV file *path* into ``location``.

        Args:
            path: Path of the CSV file; columns follow ``columns_location``.
            start_line: 0-based index of the first line to import. The
                default of 1 skips only the first line (typically the header).

        Raises:
            ValueError: If a LocationID field is not an integer.
            IndexError: If a line has fewer fields than ``columns_location``.
        """
        location = self.connect_db['location']
        with open(path, 'r', newline='') as csv_file:
            # One insert per line: if a later line fails to parse, all
            # earlier lines are already stored.
            for fields in self._csv_rows(csv_file, start_line):
                location.insert_one(self._location_document(fields))

    def insert_tripdata_in_file(self, path, start_line=1):
        """Insert every trip line of the CSV file *path* into ``tripdata``.

        Args:
            path: Path of the CSV file; columns follow ``columns_tripdata``.
            start_line: 0-based index of the first line to import. The
                default of 1 skips only the first line (typically the header).

        Raises:
            ValueError: If a numeric or datetime field cannot be parsed.
            IndexError: If a line has fewer fields than ``columns_tripdata``.
        """
        tripdata = self.connect_db['tripdata']
        with open(path, 'r', newline='') as csv_file:
            # One insert per line: if a later line fails to parse, all
            # earlier lines are already stored.
            for fields in self._csv_rows(csv_file, start_line):
                tripdata.insert_one(self._tripdata_document(fields))

    def get_report_1(self, begin_d, end_d):
        """Return the trips picked up strictly between *begin_d* and *end_d*.

        Args:
            begin_d: Exclusive lower bound for ``tpep_pickup_datetime``.
            end_d: Exclusive upper bound for ``tpep_pickup_datetime``.

        Returns:
            A pandas DataFrame with one row per matching ``tripdata`` document.
        """
        matching = self.connect_db.tripdata.find(
            {'tpep_pickup_datetime': {'$gt': begin_d, '$lt': end_d}})
        return pd.DataFrame(list(matching))

    def get_report_2(self):
        """Return the number of trips per pickup location.

        Returns:
            A pandas DataFrame with the columns ``Zone``, ``Borough`` and
            ``Count``, one row per distinct ``PULocationID``, ordered by that
            id. ``Zone`` and ``Borough`` are empty (None) for an id that has
            no document in ``location``.
        """
        # A single $group pass counts every pickup location at once instead
        # of issuing one count query per location id.
        pickups = self.connect_db.tripdata.aggregate([
            {'$group': {'_id': '$PULocationID', 'Count': {'$sum': 1}}},
            {'$sort': {'_id': 1}},
        ])
        rows = []
        for pickup in pickups:
            zone = self.connect_db.location.find_one({'LocationID': pickup['_id']}) or {}
            rows.append({'Zone': zone.get('Zone'),
                         'Borough': zone.get('Borough'),
                         'Count': pickup['Count']})
        return pd.DataFrame(rows, columns=['Zone', 'Borough', 'Count'])

In [28]:
# Create a TaxiDB handle and set the path of the taxi zone lookup CSV.
obj_taxi = TaxiDB()
path = '..//..//003_DB_data//taxi+_zone_lookup.csv'
# The import call below is commented out, so running this cell inserts nothing.
# obj_taxi.insert_location_in_file(path)

In [29]:
# Create a TaxiDB handle and set the path of the January 2018 yellow trip data CSV.
obj_taxi = TaxiDB()
path = '..//..//003_DB_data//yellow_tripdata_2018-01.csv'
# The import call below is commented out, so running this cell inserts nothing.
# It passes start_line=2, i.e. the first two lines of the file are skipped
# (presumably a header plus a blank line; confirm against the file).
# obj_taxi.insert_tripdata_in_file(path, start_line=2)

In [44]:
# Report 1: trips whose pickup time is after 2018-01-01 10:00 and before
# 2018-01-01 12:00 (both bounds exclusive); show the first rows.
obj_taxi = TaxiDB()
df = obj_taxi.get_report_1(datetime(2018, 1, 1, 10), datetime(2018, 1, 1, 12))
df.head()

Unnamed: 0,DOLocationID,PULocationID,RatecodeID,VendorID,_id,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,payment_type,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance
0,237,163,1,2,5beae2a03b11d7119ce0eb3d,0.0,7.5,0.3,0.5,1,1,N,1.66,0.0,9.96,2018-01-01 10:08:31,2018-01-01 10:00:03,1.24
1,100,132,2,2,5beae2a03b11d7119ce0ec5d,0.0,52.0,0.3,0.5,2,2,N,0.0,5.76,58.56,2018-01-01 10:32:50,2018-01-01 10:01:24,18.2
2,261,232,1,2,5beae2a03b11d7119ce0ec63,0.0,11.0,0.3,0.5,3,1,N,2.95,0.0,14.75,2018-01-01 10:09:39,2018-01-01 10:00:42,3.07
3,188,97,1,2,5beae2a03b11d7119ce0ec6f,0.0,11.5,0.3,0.5,5,1,N,3.08,0.0,15.38,2018-01-01 10:10:19,2018-01-01 10:00:16,2.89
4,100,163,1,2,5beae2a03b11d7119ce0ec75,0.0,4.5,0.3,0.5,2,2,N,0.0,0.0,5.3,2018-01-01 10:04:27,2018-01-01 10:01:37,0.93


In [None]:
# Report 2: trip count per pickup location, with its zone and borough names.
obj_taxi = TaxiDB()
df = obj_taxi.get_report_2()
df

