In [None]:
#This is dev branch
import datetime
import math
import copy
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
import logging

# Number of 10-minute time slots in one 24-hour day (24 * 60 / 10 = 144).
NUM_TIME_SLOTS = (24 * 60) // 10

In [None]:
def readMultipleData(path,fileNamePrefix,headerNames,dataTypes):
    """Load every tab-separated file in a directory whose name starts with a prefix.

    Parameters
    ----------
    path : str
        Directory to scan. A trailing path separator is optional.
    fileNamePrefix : str
        Only directory entries whose name starts with this prefix are read.
    headerNames : list of str
        Column names applied to every file (the files are read without a
        header row).
    dataTypes : dict
        Column name -> dtype mapping forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated in file-name order with a fresh
        0..n-1 index. When no file matches, an empty frame that only carries
        the ``headerNames`` columns is returned.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the concatenated frame reproducible between runs and machines.
    filesToExplore = sorted(
        file for file in os.listdir(path) if file.startswith(fileNamePrefix)
    )

    print(f"{len(filesToExplore)} files read")

    # pd.concat raises on an empty list, so handle "nothing matched" explicitly.
    if not filesToExplore:
        return pd.DataFrame(columns=headerNames)

    # os.path.join works whether or not `path` ends with a separator.
    readData = [
        pd.read_csv(os.path.join(path, file), sep='\t', names=headerNames, dtype=dataTypes)
        for file in filesToExplore
    ]
    return pd.concat(readData, ignore_index=True)

In [None]:
# Load the cluster map: a headerless, tab-separated file with one region per
# row, giving the region hash and the numeric region id it maps to.
regionColumnTypes = {'region_hash': 'str', 'region_id': 'int'}
regionData = pd.read_csv(
    './training_data/cluster_map/cluster_map',
    sep='\t',
    names=list(regionColumnTypes),
    dtype=regionColumnTypes,
)
print(regionData.head())

In [None]:
# Order logs: every file under order_data/ whose name starts with "order".
# The column order of the files is taken from the key order of the dtype map.
# NOTE(review): with pandas' default NA handling a literal "NULL" in these
# files is presumably loaded as NaN even for 'str' columns - confirm before
# comparing driver_id against the string 'NULL'.
orderDataPath = './training_data/order_data/'
dataTypes = {
    'order_id': 'str',
    'driver_id': 'str',
    'passenger_id': 'str',
    'start_region_hash': 'str',
    'dest_region_hash': 'str',
    'price': 'double',
    'time': 'str',
}
orderData = readMultipleData(orderDataPath, 'order', list(dataTypes), dataTypes)
print("printing order data")
print(orderData.head())

# Weather logs: every file under weather_data/ whose name starts with "weather".
weatherDataPath = './training_data/weather_data/'
dataTypes = {
    'time': 'str',
    'weather': 'int',
    'temperature': 'double',
    'PM2.5': 'double',
}
weatherData = readMultipleData(weatherDataPath, 'weather', list(dataTypes), dataTypes)
print("printing weather data")
print(weatherData.head())

In [None]:
# Parse the POI file by hand because its rows are ragged: the first
# tab-separated field is the region hash and every remaining field on the
# line is kept, as a list of raw strings, in the poi_class column.
poiDataStr = {'region_hash': [], 'poi_class': []}
with open('./training_data/poi_data/poi_data','r') as fileToRead:
    for rawLine in fileToRead:
        # str.split always yields at least one item, so the hash is always bound.
        regionHash, *poiFields = rawLine.strip().split('\t')
        poiDataStr['region_hash'].append(regionHash)
        poiDataStr['poi_class'].append(poiFields)

poiData = pd.DataFrame(poiDataStr,columns=['region_hash','poi_class'])
print("printing poi data")
print(poiData.head())
# Show the first parsed record in full, since head() truncates the list column.
print("Printing poi data line 1")
print(f"region_hash: {poiData['region_hash'][0]} poi_class: {poiData['poi_class'][0]}")

In [None]:
# Goal: gap(i, j) = req(i, j) - supply(i, j)
# req(i, j) is the demand for region i in time slot j.
# The region i comes from start_region_hash and the time slot j is derived from time.
def getRegionID(regionHash):
    """Translate a region hash into its numeric region id.

    Looks the hash up in the module-level ``regionData`` frame (columns
    ``region_hash`` and ``region_id``).

    Parameters
    ----------
    regionHash : str
        Hash to look up.

    Returns
    -------
    int
        The ``region_id`` of the matching row, or -1 when the hash is not
        present. If the hash occurs more than once, the id of the last
        matching row is returned.
    """
    # A vectorised comparison replaces the Python-level row loop; selecting by
    # boolean mask also works when regionData does not have a 0..n-1 index.
    matchingIDs = regionData.loc[regionData['region_hash'] == regionHash, 'region_id']
    if matchingIDs.empty:
        return -1
    return matchingIDs.iloc[-1]

# Smoke test: resolve the hash stored in the first cluster-map row.
print("Printing region id for 1st row")
firstRegionHash = regionData['region_hash'][0]
print(getRegionID(firstRegionHash))

In [None]:
# A 24-hour day is divided into 144 slots, each 10 minutes long.
def calculateTimeSlot(time,printValue=True):
    """Return the 1-based 10-minute slot of the day that a timestamp falls in.

    Slot 1 covers 00:00:00-00:09:59, slot 2 covers 00:10:00-00:19:59, and so
    on up to slot 144 for 23:50:00-23:59:59. The date part is ignored.

    Parameters
    ----------
    time : str
        Timestamp formatted as ``'%Y-%m-%d %H:%M:%S'``.
    printValue : bool, default True
        When true, print the timestamp, the minutes elapsed since midnight
        and the resulting slot.

    Returns
    -------
    int
        Slot number in the range 1..144.

    Raises
    ------
    ValueError
        If ``time`` does not match the expected format.
    """
    dateTime = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    wholeMinutes = dateTime.hour * 60 + dateTime.minute
    timeInMinutes = wholeMinutes + dateTime.second / 60
    # Integer floor division keeps every second of a 10-minute window in the
    # same slot. The previous ceil((minutes + 1) / 10) pushed times such as
    # 00:09:30 into slot 2 and 23:59:30 into a non-existent slot 145.
    timeSlot = wholeMinutes // 10 + 1
    if printValue:
        print(f"time: {time} timeInMinutes: {timeInMinutes} timeSlot: {timeSlot}")
    return timeSlot

# Spot-check the slot computation on a single order timestamp.
# NOTE(review): the label says "1st row" but the lookup uses row label 627 -
# confirm which one is intended.
print("Printing time slot for 1st row")
sampleOrderTime = orderData['time'][627]
print(calculateTimeSlot(sampleOrderTime))

In [None]:
# req(i, j) is obtained by counting the orders placed in region i during time slot j.
def getRequest(i,j): # i is region id and j is timeslot
    """Count the orders whose start region is ``i`` and whose time slot is ``j``.

    Walks every row of the module-level ``orderData`` frame, resolving the
    region with ``getRegionID`` and the slot with ``calculateTimeSlot``, and
    shows a tqdm progress bar while doing so.

    Parameters
    ----------
    i : int
        Region id to match.
    j : int
        Time slot to match.

    Returns
    -------
    int
        Number of matching order rows.
    """
    global orderData
    totalRows = len(orderData)
    print(f"Number of lines of data: {totalRows}")
    progress = tqdm(total=totalRows, desc="Calculating requests", unit=" lines")
    matchingOrders = 0
    for rowLabel in range(totalRows):
        # Resolve both keys for the row first, then compare against the target.
        regionMatches = getRegionID(orderData['start_region_hash'][rowLabel]) == i
        slotMatches = calculateTimeSlot(orderData['time'][rowLabel],False) == j
        if regionMatches and slotMatches:
            matchingOrders += 1
        progress.update(1)
    progress.close()
    return matchingOrders

# Example: count the orders for region 1 in time slot 1 (scans every order row).
print("Printing request for 1st region and 1st timeslot")
firstSlotRequests = getRequest(1,1)
print(firstSlotRequests)

In [None]:
def getAllRequestAndSupply():
    """Build the request and supply count matrices in one pass over the orders.

    Reads the module-level ``orderData`` and ``regionData`` frames and the
    ``NUM_TIME_SLOTS`` constant. Every order adds one request to the cell of
    its start region and time slot; it also adds one supply when the order
    has a driver. Rows whose region or time slot cannot be placed in the
    matrices are reported with ``print`` and skipped.

    Returns
    -------
    tuple of (list of list of int, list of list of int)
        ``(requests, supply)``. Each matrix has ``len(regionData)`` rows and
        ``NUM_TIME_SLOTS`` columns and is indexed as
        ``[region_id - 1][time_slot - 1]``.
    """
    numberOfRegions = len(regionData)
    numberOfIterations = len(orderData)
    print(f"Number of lines of data: {numberOfIterations}")

    # Resolve hash -> id once instead of scanning regionData for every order.
    # Later duplicates overwrite earlier ones, which matches getRegionID
    # returning the last matching row. Missing hashes are left out so they
    # can never match a missing start_region_hash.
    regionIDByHash = {
        regionHash: regionID
        for regionHash, regionID in zip(regionData['region_hash'], regionData['region_id'])
        if pd.notna(regionHash)
    }

    requests = [[0] * NUM_TIME_SLOTS for _ in range(numberOfRegions)]
    supply = [[0] * NUM_TIME_SLOTS for _ in range(numberOfRegions)]

    orderColumns = zip(orderData['start_region_hash'], orderData['time'], orderData['driver_id'])
    # The context manager closes the bar even if a row raises.
    with tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines") as progressBar:
        for row, (regionHash, orderTime, driverID) in enumerate(orderColumns):
            # Count the row as processed up front so skipped rows still advance the bar.
            progressBar.update(1)
            currentRegionID = regionIDByHash.get(regionHash, -1)
            currentTimeSlot = calculateTimeSlot(orderTime,False)
            if currentRegionID < 0:
                print(f"Region not found for {regionHash}")
                continue
            if currentRegionID == 0:
                # Index -1 would silently wrap around to the last region.
                print(f"Region id is 0 for {regionHash}")
                continue
            if currentTimeSlot < 0:
                print(f"Time slot not found for {orderTime}")
                continue
            if currentRegionID > numberOfRegions:
                print(f"Region id {currentRegionID} is greater than number of regions {numberOfRegions}")
                continue
            if currentTimeSlot > NUM_TIME_SLOTS:
                print(f"Time slot {currentTimeSlot} is greater than number of time slots {NUM_TIME_SLOTS}")
                print(f"Time: {orderTime}")
                print(f"Row: {row}")
                continue
            if currentTimeSlot == 0:
                print(f"Time slot is 0 for {orderTime}")
                continue
            requests[currentRegionID-1][currentTimeSlot-1] += 1
            # pandas' default NA parsing loads a literal "NULL" as NaN, so an
            # order without a driver can show up either as NaN or as 'NULL'.
            if pd.notna(driverID) and driverID != 'NULL':
                supply[currentRegionID-1][currentTimeSlot-1] += 1
    return (requests,supply)

# Build both matrices for every region d(i) and time slot t(j), then dump them.
print("Printing request and supply regions d(i) and timeslots t(j)")
request, supply = getAllRequestAndSupply()
for countMatrix in (request, supply):
    print(countMatrix)


In [None]:
# # now to get req(i,j) we can do that by counting the number of orders for region i and timeslot j
# # concurrentI =0
# # concurrentJ =0
# def getRequestMulti(data, i, j,lowerIndex,upperIndex):
#     # (orderData, i, j,lowerIndex,upperIndex) = arguments
#     numberOfIterations = upperIndex - lowerIndex
#     currentPID = mp.current_process()._identity[0]-1
#     # logging.info(f"process {currentPID}")
#     print(f"Number of lines of data: {numberOfIterations} for process {currentPID}")
#     # progressBarInit = tqdm(total=numberOfIterations, desc=f"Calculating requests {currentPID}", unit=" lines")
#     lowerIndex = lowerIndex[currentPID]
#     upperIndex = upperIndex[currentPID]
#     requests = 0
#     for row in range(lowerIndex,upperIndex):
#         currentRegionID = getRegionID(data['start_region_hash'][row])
#         currentTimeSlot = calculateTimeSlot(data['time'][row],False)
#         if currentRegionID == i and currentTimeSlot == j:
#             requests += 1
#         # progressBarInit.update(1)
#     # progressBarInit.close()
#     return requests

# def getRequestHelper(i,j): # i is region id and j is timeslot
#     global orderData
#     # global concurrentI
#     # global concurrentJ
#     # concurrentJ = j
#     # concurrentI = i
#     numberOfIterations = len(orderData)
#     # logging.basicConfig(level=logging.INFO,filename='worker.log', filemode='w')
#     # console_handler = logging.StreamHandler()
#     # logging.getLogger().addHandler(console_handler)
#     # print(f"Number of lines of data: {numberOfIterations}")
#     # progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
    
#     numberOfProcessesToRun = mp.cpu_count()
#     print(f"CPUs: {numberOfProcessesToRun}")
#     multiProcessingPool = mp.Pool(numberOfProcessesToRun)
#     upperIndex = []
#     lowerIndex = []
#     for i in range(numberOfProcessesToRun):
#         lowerval = i*numberOfIterations//numberOfProcessesToRun
#         upperVal = (i+1)*numberOfIterations//numberOfProcessesToRun
#         lowerIndex.append(lowerval)
#         upperIndex.append(upperVal)
#     print("Here")
#     # argumentsToPass = (orderData, i, j,lowerIndex, upperIndex)
#     requests = multiProcessingPool.starmap(getRequestMulti, [(orderData, i, j,lowerIndex, upperIndex)])
#     requests = sum(requests)
#     multiProcessingPool.close()
#     multiProcessingPool.join()
#     return requests



# print("Printing request for 1st region and 1st timeslot")
# print(getRequestHelper(1,1))