In [1]:
#This is dev branch
import datetime
import math
import copy
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
import logging

NUM_TIME_SLOTS = 144

In [2]:
def readMultipleData(path,fileNamePrefix,headerNames,dataTypes):
    filesToExplore = []
    for file in os.listdir(path):
        if file.startswith(fileNamePrefix):
            filesToExplore.append(file)
            # print(f"{file} read")
        else:
            continue

    print(f"{len(filesToExplore)} files read")
    
    readData = []
    for files in filesToExplore:
        fileRead = pd.read_csv(path + files, sep='\t', names=headerNames,dtype=dataTypes)
        readData.append(fileRead)

    readData = pd.concat(readData, ignore_index=True)
    return readData

In [3]:
# now region Data
regionData = pd.read_csv('./training_data/cluster_map/cluster_map', sep='\t', names=['region_hash', 'region_id'],dtype={'region_hash': 'str', 'region_id': 'int'})
print(regionData.head())

                        region_hash  region_id
0  90c5a34f06ac86aee0fd70e2adce7d8a          1
1  f2c8c4bb99e6377d21de71275afd6cd2          2
2  58c7a4888306d8ff3a641d1c0feccbe3          3
3  b26a240205c852804ff8758628c0a86a          4
4  4b9e4cf2fbdc8281b8a1f9f12b80ce4d          5


In [4]:
# read order data
dataTypes = {'order_id':'str', 'driver_id':'str', 'passenger_id':'str', 'start_region_hash':'str', 'dest_region_hash':'str', 'price':'double', 'time':'str'}
orderDataPath = './training_data/order_data/'
orderData = readMultipleData(orderDataPath,'order', ['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'price', 'time'], dataTypes)
print("printing order data")
print(orderData.head())

# read weather data
dataTypes={'time':'str', 'weather':'int', 'temperature':'double', 'PM2.5':'double'}
weatherDataPath = './training_data/weather_data/'
weatherData = readMultipleData(weatherDataPath,'weather', ['time', 'weather', 'temperature', 'PM2.5'], dataTypes)
print("printing weather data")
print(weatherData.head())

21 files read
printing order data
                           order_id                         driver_id  \
0  97ebd0c6680f7c0535dbfdead6e51b4b  dd65fa250fca2833a3a8c16d2cf0457c   
1  92c3ac9251cc9b5aab90b114a1e363be  c077e0297639edcb1df6189e8cda2c3d   
2  abeefc3e2aec952468e2fd42a1649640  86dbc1b68de435957c61b5a523854b69   
3  cb31d0be64cda3cc66b46617bf49a05c  4fadfa6eeaa694742de036dddf02b0c4   
4  139d492189ae5a933122c098f63252b3                               NaN   

                       passenger_id                 start_region_hash  \
0  ed180d7daf639d936f1aeae4f7fb482f  4725c39a5e5f4c188d382da3910b3f3f   
1  191a180f0a262aff3267775c4fac8972  82cc4851f9e4faa4e54309f8bb73fd7c   
2  7029e813bb3de8cc73a8615e2785070c  fff4e8465d1e12621bc361276b6217cf   
3  21dc133ac68e4c07803d1c2f48988a83  4b7f6f4e2bf237b6cc58f57142bea5c0   
4  26963cc76da2d8450d8f23fc357db987  fc34648599753c9e74ab238e9a4a07ad   

                   dest_region_hash  price                 time  
0  3e12208dd0be281c92a

In [5]:
# read POI Data
poiDataStr = {
    'region_hash':[],
    'poi_class':[]
}
with open('./training_data/poi_data/poi_data','r') as fileToRead:
    for line in fileToRead:
        line = line.strip()
        columns = line.split('\t')
        poiDataStr['region_hash'].append(columns[0])
        remData = columns[1:]
        poiDataStr['poi_class'].append(remData)
        
poiData = pd.DataFrame(poiDataStr,columns=['region_hash','poi_class'])
print("printing poi data")
print(poiData.head())
print("Printing poi data line 1")
print(f"region_hash: {poiData['region_hash'][0]} poi_class: {poiData['poi_class'][0]}")

printing poi data
                        region_hash  \
0  74c1c25f4b283fa74a5514307b0d0278   
1  08f5b445ec6b29deba62e6fd8b0325a6   
2  4b7f6f4e2bf237b6cc58f57142bea5c0   
3  a814069db8d32f0fa6e188f41059c6e1   
4  8316146a6f78cc6d9f113f0390859417   

                                           poi_class  
0  [1#11:2241, 1#10:249, 24:1245, 25:3652, 20:334...  
1  [20#7:249, 20#5:83, 2#7:166, 20#2:747, 20#1:99...  
2  [4#16:249, 24:913, 25:332, 20:4316, 22:415, 4:...  
3  [1#11:498, 24:332, 25:581, 20:5810, 22:2407, 4...  
4  [20#7:581, 20#5:83, 20#4:415, 20#2:166, 20#1:6...  
Printing poi data line 1
region_hash: 74c1c25f4b283fa74a5514307b0d0278 poi_class: ['1#11:2241', '1#10:249', '24:1245', '25:3652', '20:33449', '22:2324', '23:913', '4:13031', '8:166', '5#4:83', '5#3:3569', '5#2:83', '5#1:4731', '8#2:8798', '8#3:5229', '8#1:664', '8#4:7387', '8#5:83', '1#3:498', '1#2:2822', '1#1:415', '1#7:166', '1#6:83', '1#5:12367', '1#4:249', '1#9:166', '1#8:4316', '14#10:664', '7:6640', '15#7:14

In [6]:
# now to calculate gap(i,j) = req(i,j) - supply(i,j)
# req(i,j) is for region i and timeslot j 
# ith region will be from from start_region_hash and jth timeslot will be calculated from time
def getRegionID(regionHash):
    regionID = -1
    for i in range(len(regionData)):
        if regionHash == regionData['region_hash'][i]:
            regionID = regionData['region_id'][i]
    return regionID

print("Printing region id for 1st row")
print(getRegionID(regionData['region_hash'][0]))

Printing region id for 1st row
1


In [7]:
# 24 hours is divided into 144 slots where each slot is 10 mins long
def calculateTimeSlot(time,printValue=True):
    global NUM_TIME_SLOTS
    dateTime = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    timePart = dateTime.time()
    timeInMinutes = (timePart.hour * 60) + timePart.minute + (timePart.second/60) + 1
    timeSlot = timeInMinutes/10
    roundedTimeSlot = math.ceil(timeSlot)
    if roundedTimeSlot > NUM_TIME_SLOTS:
        roundedTimeSlot -= 1
    if printValue==True:
        print(f"time: {time} timeInMinutes: {timeInMinutes} timeSlot: {roundedTimeSlot}")
    return int(roundedTimeSlot)

print("Printing time slot for 1st row")
print(calculateTimeSlot(orderData['time'][627]))

Printing time slot for 1st row
time: 2016-01-01 23:54:27 timeInMinutes: 1435.45 timeSlot: 144
144


In [None]:
# now to get req(i,j) we can do that by counting the number of orders for region i and timeslot j
def getRequest(i,j): # i is region id and j is timeslot
    global orderData
    numberOfIterations = len(orderData)
    print(f"Number of lines of data: {numberOfIterations}")
    progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
    requests = 0
    for row in range(len(orderData)):
        currentRegionID = getRegionID(orderData['start_region_hash'][row])
        currentTimeSlot = calculateTimeSlot(orderData['time'][row],False)
        if currentRegionID == i and currentTimeSlot == j:
            requests += 1
        progressBarInit.update(1)
    progressBarInit.close()
    return requests

print("Printing request for 1st region and 1st timeslot")
print(getRequest(1,1))

In [8]:
def getAllRequestAndSupply():
    global orderData
    global regionData
    global NUM_TIME_SLOTS
    numberOfRegions = len(regionData)
    numberOfIterations = len(orderData)
    print(f"Number of lines of data: {numberOfIterations}")
    progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
    requests = [[0 for j in range(NUM_TIME_SLOTS)] for i in range(numberOfRegions)]
    supply = [[0 for j in range(NUM_TIME_SLOTS)] for i in range(numberOfRegions)]
    for row in range(len(orderData)):
        currentRegionID = getRegionID(orderData['start_region_hash'][row])
        currentTimeSlot = calculateTimeSlot(orderData['time'][row],False)
        if currentRegionID < 0:
            print(f"Region not found for {orderData['start_region_hash'][row]}")
            continue
            # return (None,None)
        if currentTimeSlot < 0:
            print(f"Time slot not found for {orderData['time'][row]}")
            continue
            # return (None,None)
        if currentRegionID > numberOfRegions:
            print(f"Region id {currentRegionID} is greater than number of regions {numberOfRegions}")
            continue
            # return (None,None)
        if currentTimeSlot > NUM_TIME_SLOTS:
            print(f"Time slot {currentTimeSlot} is greater than number of time slots {NUM_TIME_SLOTS}")
            print(f"Time: {orderData['time'][row]}")
            print(f"Row: {row}")
            continue
            # return (None,None)
        if currentTimeSlot == 0:
            print(f"Time slot is 0 for {orderData['time'][row]}")
            print(f"Row: {row}")
            continue
            # return (None,None)
        requests[currentRegionID-1][currentTimeSlot-1] += 1
        if orderData['driver_id'][row] != 'NULL':
            supply[currentRegionID-1][currentTimeSlot-1] += 1
        progressBarInit.update(1)
    progressBarInit.close()
    return (requests,supply)

print("Printing request and supply regions d(i) and timeslots t(j)")
(request,supply) = getAllRequestAndSupply()
print(request)
print(supply) 


Printing request and supply regions d(i) and timeslots t(j)
Number of lines of data: 8540614


Calculating requests: 100%|██████████| 8540614/8540614 [33:32<00:00, 4243.24 lines/s]

[[1479, 1426, 1264, 1172, 1106, 957, 923, 853, 784, 794, 706, 593, 612, 592, 509, 469, 393, 431, 381, 319, 328, 289, 239, 220, 184, 167, 180, 183, 167, 173, 193, 222, 198, 254, 292, 463, 705, 752, 811, 1026, 1090, 1190, 1655, 2086, 2634, 3783, 4450, 4442, 5777, 7342, 6508, 6681, 7792, 5408, 4118, 3880, 3589, 3239, 3331, 3097, 3015, 2745, 2643, 2770, 2715, 2776, 3068, 3108, 3059, 3522, 3635, 3667, 3727, 3637, 3411, 3530, 3747, 3702, 3737, 3861, 3680, 3684, 3656, 3656, 3694, 3604, 3596, 3801, 3754, 3755, 3882, 3924, 3892, 4028, 3854, 3939, 4353, 4286, 4494, 4775, 4765, 4690, 5813, 5401, 5289, 6264, 5904, 5552, 6098, 5770, 5513, 5465, 5085, 4887, 4811, 4643, 4489, 4620, 4586, 4768, 4971, 5041, 4925, 5147, 5018, 5118, 5421, 4965, 4833, 4668, 4426, 4575, 5017, 4283, 3584, 2996, 2705, 2670, 2077, 1933, 1832, 1583, 1425, 1496], [165, 161, 164, 160, 137, 123, 131, 88, 81, 72, 92, 54, 56, 58, 58, 47, 37, 41, 21, 22, 18, 18, 22, 10, 13, 9, 9, 12, 14, 7, 11, 7, 16, 24, 19, 18, 22, 31, 62, 174, 17




In [9]:
npRequest = np.array(request)
npSupply = np.array(supply)
np.savetxt('request.csv', npRequest, delimiter=',')
np.savetxt('supply.csv', npSupply, delimiter=',')


In [None]:
# # now to get req(i,j) we can do that by counting the number of orders for region i and timeslot j
# # concurrentI =0
# # concurrentJ =0
# def getRequestMulti(data, i, j,lowerIndex,upperIndex):
#     # (orderData, i, j,lowerIndex,upperIndex) = arguments
#     numberOfIterations = upperIndex - lowerIndex
#     currentPID = mp.current_process()._identity[0]-1
#     # logging.info(f"process {currentPID}")
#     print(f"Number of lines of data: {numberOfIterations} for process {currentPID}")
#     # progressBarInit = tqdm(total=numberOfIterations, desc=f"Calculating requests {currentPID}", unit=" lines")
#     lowerIndex = lowerIndex[currentPID]
#     upperIndex = upperIndex[currentPID]
#     requests = 0
#     for row in range(lowerIndex,upperIndex):
#         currentRegionID = getRegionID(data['start_region_hash'][row])
#         currentTimeSlot = calculateTimeSlot(data['time'][row],False)
#         if currentRegionID == i and currentTimeSlot == j:
#             requests += 1
#         # progressBarInit.update(1)
#     # progressBarInit.close()
#     return requests

# def getRequestHelper(i,j): # i is region id and j is timeslot
#     global orderData
#     # global concurrentI
#     # global concurrentJ
#     # concurrentJ = j
#     # concurrentI = i
#     numberOfIterations = len(orderData)
#     # logging.basicConfig(level=logging.INFO,filename='worker.log', filemode='w')
#     # console_handler = logging.StreamHandler()
#     # logging.getLogger().addHandler(console_handler)
#     # print(f"Number of lines of data: {numberOfIterations}")
#     # progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
    
#     numberOfProcessesToRun = mp.cpu_count()
#     print(f"CPUs: {numberOfProcessesToRun}")
#     multiProcessingPool = mp.Pool(numberOfProcessesToRun)
#     upperIndex = []
#     lowerIndex = []
#     for i in range(numberOfProcessesToRun):
#         lowerval = i*numberOfIterations//numberOfProcessesToRun
#         upperVal = (i+1)*numberOfIterations//numberOfProcessesToRun
#         lowerIndex.append(lowerval)
#         upperIndex.append(upperVal)
#     print("Here")
#     # argumentsToPass = (orderData, i, j,lowerIndex, upperIndex)
#     requests = multiProcessingPool.starmap(getRequestMulti, [(orderData, i, j,lowerIndex, upperIndex)])
#     requests = sum(requests)
#     multiProcessingPool.close()
#     multiProcessingPool.join()
#     return requests



# print("Printing request for 1st region and 1st timeslot")
# print(getRequestHelper(1,1))