In [None]:
# !pip install numpy
# !pip install pandas
# !pip install tqdm
# !pip install sklearn
# !pip install matplotlib

In [1]:
#This is dev branch
import datetime
import math
import copy
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
import logging
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

NUM_TIME_SLOTS = 144
NUM_DAYS_IN_DATA = 0

In [None]:
# 24 hours is divided into 144 slots where each slot is 10 mins long
def calculateTimeSlot(time,printValue=True):
    global NUM_TIME_SLOTS
    dateTime = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    timePart = dateTime.time()
    timeInMinutes = (timePart.hour * 60) + timePart.minute + (timePart.second/60) + 1
    timeSlot = timeInMinutes/10
    roundedTimeSlot = math.ceil(timeSlot)
    if roundedTimeSlot > NUM_TIME_SLOTS:
        roundedTimeSlot -= 1
    if printValue==True:
        print(f"time: {time} timeInMinutes: {timeInMinutes} timeSlot: {roundedTimeSlot}")
    return int(roundedTimeSlot)

def extractDayOfWeek(time):
    d = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    return d.weekday()

def extractDate(time):
    d = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    return d.date()

print(type(extractDate('2019-01-01 00:00:00')))

In [None]:
def extractNumberOfPOI(poiArray):
    poiCount = 0
    for poiEntry in poiArray:
        poiEntry = poiEntry + ':'
        divisions = poiEntry.split(":")
        poiCount += int(divisions[1])
    return poiCount
        

In [None]:
def readMultipleData(path,fileNamePrefix,headerNames,dataTypes):
    global NUM_DAYS_IN_DATA
    filesToExplore = []
    for file in os.listdir(path):
        if file.startswith(fileNamePrefix):
            filesToExplore.append(file)
            # print(f"{file} read")
        else:
            continue

    print(f"{len(filesToExplore)} files read")
    if fileNamePrefix == 'order':
        NUM_DAYS_IN_DATA = len(filesToExplore)
    
    readData = []
    for files in filesToExplore:
        fileRead = pd.read_csv(path + files, sep='\t', names=headerNames,dtype=dataTypes)
        readData.append(fileRead)

    readData = pd.concat(readData, ignore_index=True)
    return readData

In [None]:
# now region Data
regionData = pd.read_csv('./training_data/cluster_map/cluster_map', sep='\t', names=['region_hash', 'region_id'],dtype={'region_hash': 'str', 'region_id': 'int'})
# print(regionData.head())
regionData.to_csv('regionData.csv',index=False)

In [None]:
# read order data
dataTypes = {'order_id':'str', 'driver_id':'str', 'passenger_id':'str', 'start_region_hash':'str', 'dest_region_hash':'str', 'price':'double', 'time':'str'}
orderDataPath = './training_data/order_data/'
orderData = readMultipleData(orderDataPath,'order', ['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'price', 'time'], dataTypes)
print("printing order data")
# print(orderData.head())


In [None]:
# read weather data
dataTypes={'time':'str', 'weather':'int', 'temperature':'double', 'PM2.5':'double'}
weatherDataPath = './training_data/weather_data/'
weatherData = readMultipleData(weatherDataPath,'weather', ['time', 'weather', 'temperature', 'PM2.5'], dataTypes)
print("printing weather data")
# print(weatherData.head())

In [None]:
weatherData['time_slot'] = weatherData['time'].apply(calculateTimeSlot,printValue=False)
weatherData['date'] = weatherData['time'].apply(extractDate)
# weatherData['day_of_week'] = weatherData['time'].apply(extractDayOfWeek)
weatherData = weatherData.drop(['temperature','PM2.5','time'], axis=1)
print(weatherData.head())

In [None]:
orderData = pd.merge(regionData,orderData, how='left', right_on='start_region_hash', left_on='region_hash')
orderData['time_slot'] = orderData['time'].apply(calculateTimeSlot,printValue=False)
orderData['day_of_week'] = orderData['time'].apply(extractDayOfWeek)
orderData['date'] = orderData['time'].apply(extractDate)
orderData = orderData.drop(['passenger_id', 'dest_region_hash','start_region_hash','price'], axis=1)
print(orderData.head())
# regionData=None

In [None]:
mergedOrderData = pd.merge(weatherData,orderData, how="inner", on=['date','time_slot'])
print(mergedOrderData.head())
# orderData.to_csv('orderData.csv',index=False)

In [None]:
orderData=None
orderData = mergedOrderData
# orderData.to_csv('mergedOrderData.csv',index=False)
# type(orderData['driver_id'][4])==float

In [None]:
orderData = pd.read_csv('mergedOrderData.csv')

In [None]:
# read POI Data
poiDataStr = {
    'region_hash':[],
    'poi_class':[]
}
with open('./training_data/poi_data/poi_data','r') as fileToRead:
    for line in fileToRead:
        line = line.strip()
        columns = line.split('\t')
        poiDataStr['region_hash'].append(columns[0])
        remData = columns[1:]
        poiDataStr['poi_class'].append(remData)
        
poiData = pd.DataFrame(poiDataStr,columns=['region_hash','poi_class'])
poiData['poi_count'] = poiData['poi_class'].apply(extractNumberOfPOI)
poiData = pd.merge(regionData,poiData, how='inner', on='region_hash')
poiData = poiData.drop(['region_hash'], axis=1)
poiData = poiData.drop(['poi_class'], axis=1)
print(poiData.head())

In [None]:
orderData = pd.merge(orderData,poiData, how="inner", on='region_id')
print(orderData.head())

In [None]:
# print(mergedDataCSV)
orderData['requests'] = 1
# print(orderData.head())
groupedMergedData = orderData.groupby(['region_id','time_slot','day_of_week','weather','poi_count'])['requests'].agg('sum').reset_index()
# groupedMergedData = groupedMergedData.drop(['date','region_hash','order_id','driver_id','time'], axis=1)
orderData = pd.merge(orderData,groupedMergedData, how='left' , on=['region_id','time_slot','day_of_week','weather','poi_count'])
print(orderData.head())

In [None]:
orderData = orderData.drop(["requests_x"], axis=1)
orderData = orderData.rename(columns={"requests_y": "requests"})
print(orderData.head())

In [None]:
orderData['temp'] = 1
orderData['answers'] = orderData['temp'].where(orderData['driver_id'].notnull(), 0)
orderData = orderData.drop(['temp'], axis=1)
groupedMergedData = orderData.groupby(['region_id','time_slot','day_of_week','weather','poi_count'])['answers'].agg('sum').reset_index()
orderData = pd.merge(orderData,groupedMergedData, how='left' , on=['region_id','time_slot','day_of_week','weather','poi_count'])
print(orderData)

In [None]:
orderData = orderData.drop(["answers_x"], axis=1)
orderData = orderData.rename(columns={"answers_y": "answers"})
orderData.to_csv('mergedOrderData.csv',index=False)

In [2]:
orderData = pd.read_csv('mergedOrderData.csv')

In [3]:
orderData['supply_demand'] = orderData['requests'] - orderData['answers']
print(orderData)

          weather  time_slot        date                       region_hash   
0               1          1  2016-01-01  90c5a34f06ac86aee0fd70e2adce7d8a  \
1               1          1  2016-01-01  90c5a34f06ac86aee0fd70e2adce7d8a   
2               1          1  2016-01-01  90c5a34f06ac86aee0fd70e2adce7d8a   
3               1          1  2016-01-01  90c5a34f06ac86aee0fd70e2adce7d8a   
4               1          1  2016-01-01  90c5a34f06ac86aee0fd70e2adce7d8a   
...           ...        ...         ...                               ...   
12222999        2        141  2016-01-21  a735449c5c09df639c35a7d61fad3ee5   
12223000        2        142  2016-01-21  a735449c5c09df639c35a7d61fad3ee5   
12223001        2        142  2016-01-21  a735449c5c09df639c35a7d61fad3ee5   
12223002        2        142  2016-01-21  a735449c5c09df639c35a7d61fad3ee5   
12223003        2        142  2016-01-21  a735449c5c09df639c35a7d61fad3ee5   

          region_id                          order_id   
0     

In [14]:
# Apply model here
AImodel = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
X = orderData[['region_id','time_slot','day_of_week','weather','poi_count']]
Y = orderData['supply_demand']
AImodel.fit(X,Y)

In [24]:
tempData = pd.DataFrame([[1,5,3,7,20000]],columns=['region_id','time_slot','day_of_week','weather','poi_count'])
# tempData = pd.DataFrame([[1,5,3,7,20000]],columns=['region_id','time_slot'])
print(AImodel.predict(tempData))
mse = mean_squared_error([28], AImodel.predict(tempData))
print(mse)

[28.65673643]
0.4313027409787012


In [29]:
# make rolling window function
def rollTheWindow(data,windowSize):
    datasetLength = len(data)
    listOfPredictions = []
    numberOfWindows = datasetLength//windowSize
    for i in range(numberOfWindows):
        rollingWindow = []
        lowerLimit = i
        upperLimit = i+windowSize
        rollingWindow.append(data[lowerLimit:upperLimit])
        rollingWindow = pd.concat(rollingWindow)
        X_WINDOW_INPUT = rollingWindow[['region_id','time_slot','day_of_week','weather','poi_count']]
        Y_WINDOW_OUTPUT = rollingWindow['supply_demand']
        currentPrediction = AImodel.predict(X_WINDOW_INPUT)
        meanSqError = mean_squared_error(Y_WINDOW_OUTPUT, currentPrediction)
        print(f"Prediction for window [{lowerLimit},{upperLimit}]={currentPrediction} MSE={meanSqError}")
        predictionTuple = (lowerLimit,upperLimit,currentPrediction,meanSqError)
        listOfPredictions.append(predictionTuple)
        i = upperLimit + 1
    return listOfPredictions

In [31]:
rollTheWindow(orderData,1)

Prediction for window [0,1]=[124.21251258] MSE=11709.947878351923
Prediction for window [1,2]=[124.21251258] MSE=11709.947878351923
Prediction for window [2,3]=[124.21251258] MSE=11709.947878351923
Prediction for window [3,4]=[124.21251258] MSE=11709.947878351923
Prediction for window [4,5]=[124.21251258] MSE=11709.947878351923
Prediction for window [5,6]=[124.21251258] MSE=11709.947878351923
Prediction for window [6,7]=[124.21251258] MSE=11709.947878351923
Prediction for window [7,8]=[124.21251258] MSE=11709.947878351923
Prediction for window [8,9]=[124.21251258] MSE=11709.947878351923
Prediction for window [9,10]=[124.21251258] MSE=11709.947878351923
Prediction for window [10,11]=[124.21251258] MSE=11709.947878351923
Prediction for window [11,12]=[124.21251258] MSE=11709.947878351923
Prediction for window [12,13]=[124.21251258] MSE=11709.947878351923
Prediction for window [13,14]=[124.21251258] MSE=11709.947878351923
Prediction for window [14,15]=[124.21251258] MSE=11709.947878351923

KeyboardInterrupt: 

In [None]:
# read test data

In [None]:
# # now to calculate gap(i,j) = req(i,j) - supply(i,j)
# # req(i,j) is for region i and timeslot j 
# # ith region will be from from start_region_hash and jth timeslot will be calculated from time
# def getRegionID(regionHash):
#     regionID = -1
#     for i in range(len(regionData)):
#         if regionHash == regionData['region_hash'][i]:
#             regionID = regionData['region_id'][i]
#     return regionID

In [None]:
# mergedData = None # here

In [None]:
# drop order_id, driver_id, passenger_id, dest_region_hash
# mergedData = orderData.drop(['order_id', 'passenger_id', 'dest_region_hash'], axis=1)
# print("dropped order_id, passenger_id, dest_region_hash")
#  merge order data with region data on start_region_hash with region_hash
# mergedData = pd.merge(regionData,mergedData, how='left', right_on='start_region_hash', left_on='region_hash')
# print("merged order data and region data based on region")

# mergedData = mergedData.drop(['region_hash','start_region_hash'], axis=1)
# print("dropped region_hash, start_region_hash")
# # reduce time to time slot and update time column
# mergedData['time'] = mergedData['time'].apply(calculateTimeSlot,printValue=False)
# print("reduced time to time slot 1 to 144")
# rename time to time_slot
# mergedData.rename(columns={'time':'time_slot'}, inplace=True)
# # append column for day of week into mergedData
# mergedData['day_of_week'] = orderData['time'].apply(extractDayOfWeek)
# print("appended day_of_week column to data")
# now we have mergedData with region_id, price, time, day_of_week
# print("printing merged data")
# print(mergedData)

# writing to mergedData.csv for quick access
# orderData.to_csv('mergedData.csv',index=False)

In [None]:
# read mergedData.csv
# mergedDataCSV = pd.read_csv('mergedData.csv')


In [None]:
# print(mergedDataCSV)

In [None]:
# # now to get req(i,j) we can do that by counting the number of orders for region i and timeslot j
# def getRequest(i,j): # i is region id and j is timeslot
#     global orderData
#     numberOfIterations = len(orderData)
#     print(f"Number of lines of data: {numberOfIterations}")
#     progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
#     requests = 0
#     for row in range(len(orderData)):
#         currentRegionID = getRegionID(orderData['start_region_hash'][row])
#         currentTimeSlot = calculateTimeSlot(orderData['time'][row],False)
#         if currentRegionID == i and currentTimeSlot == j:
#             requests += 1
#         progressBarInit.update(1)
#     progressBarInit.close()
#     return requests

# print("Printing request for 1st region and 1st timeslot")
# print(getRequest(1,1))

In [None]:
# # def dateToIndex(date):
# #     index = 0
    
# #     return index

# def getAllRequestAndSupply(): # need to filter by date 
#     global orderData
#     global regionData
#     global NUM_TIME_SLOTS
#     global NUM_DAYS_IN_DATA
#     numberOfRegions = len(regionData)
#     numberOfIterations = len(orderData)
#     print(f"Number of lines of data: {numberOfIterations}")
#     progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
#     # 3D requests array requests[i][j][k] is number of requests --> date i ,region j, timeslot k
#     # requests = [[[0 for k in range(NUM_TIME_SLOTS)] for j in range(numberOfRegions)] for i in range(NUM_DAYS_IN_DATA)]
#     # supply = [[[0 for k in range(NUM_TIME_SLOTS)] for j in range(numberOfRegions)] for i in range(NUM_DAYS_IN_DATA)]
#     requests = [[0 for j in range(NUM_TIME_SLOTS)] for i in range(numberOfRegions)]
#     supply = [[0 for j in range(NUM_TIME_SLOTS)] for i in range(numberOfRegions)]
#     for row in range(len(orderData)):
#         currentRegionID = getRegionID(orderData['start_region_hash'][row])
#         currentTimeSlot = calculateTimeSlot(orderData['time'][row],False)
#         # date = orderData['time'][row].split(' ')[0]
        
#         if currentRegionID < 0:
#             print(f"Region not found for {orderData['start_region_hash'][row]}")
#             continue
#             # return (None,None)
#         if currentTimeSlot < 0:
#             print(f"Time slot not found for {orderData['time'][row]}")
#             continue
#             # return (None,None)
#         if currentRegionID > numberOfRegions:
#             print(f"Region id {currentRegionID} is greater than number of regions {numberOfRegions}")
#             continue
#             # return (None,None)
#         if currentTimeSlot > NUM_TIME_SLOTS:
#             print(f"Time slot {currentTimeSlot} is greater than number of time slots {NUM_TIME_SLOTS}")
#             print(f"Time: {orderData['time'][row]}")
#             print(f"Row: {row}")
#             continue
#             # return (None,None)
#         if currentTimeSlot == 0:
#             print(f"Time slot is 0 for {orderData['time'][row]}")
#             print(f"Row: {row}")
#             continue
#             # return (None,None)
#         # requests[currentDate][currentRegionID-1][currentTimeSlot-1] += 1
#         requests[currentRegionID-1][currentTimeSlot-1] += 1
#         if type(orderData['driver_id'][row]) == str:
#             supply[currentRegionID-1][currentTimeSlot-1] += 1
#             # supply[currentDate][currentRegionID-1][currentTimeSlot-1] += 1
#         progressBarInit.update(1)
#     progressBarInit.close()
#     return (requests,supply)

# print("Printing request and supply regions d(i) and timeslots t(j)")
# (request,supply) = getAllRequestAndSupply()
# print(request)
# print(supply) 


In [None]:
# npRequest = np.array(request)
# npSupply = np.array(supply)
# np.savetxt('request.csv', npRequest, delimiter=',')
# np.savetxt('supply.csv', npSupply, delimiter=',')


In [None]:
# # now to get req(i,j) we can do that by counting the number of orders for region i and timeslot j
# # concurrentI =0
# # concurrentJ =0
# def getRequestMulti(data, i, j,lowerIndex,upperIndex):
#     # (orderData, i, j,lowerIndex,upperIndex) = arguments
#     numberOfIterations = upperIndex - lowerIndex
#     currentPID = mp.current_process()._identity[0]-1
#     # logging.info(f"process {currentPID}")
#     print(f"Number of lines of data: {numberOfIterations} for process {currentPID}")
#     # progressBarInit = tqdm(total=numberOfIterations, desc=f"Calculating requests {currentPID}", unit=" lines")
#     lowerIndex = lowerIndex[currentPID]
#     upperIndex = upperIndex[currentPID]
#     requests = 0
#     for row in range(lowerIndex,upperIndex):
#         currentRegionID = getRegionID(data['start_region_hash'][row])
#         currentTimeSlot = calculateTimeSlot(data['time'][row],False)
#         if currentRegionID == i and currentTimeSlot == j:
#             requests += 1
#         # progressBarInit.update(1)
#     # progressBarInit.close()
#     return requests

# def getRequestHelper(i,j): # i is region id and j is timeslot
#     global orderData
#     # global concurrentI
#     # global concurrentJ
#     # concurrentJ = j
#     # concurrentI = i
#     numberOfIterations = len(orderData)
#     # logging.basicConfig(level=logging.INFO,filename='worker.log', filemode='w')
#     # console_handler = logging.StreamHandler()
#     # logging.getLogger().addHandler(console_handler)
#     # print(f"Number of lines of data: {numberOfIterations}")
#     # progressBarInit = tqdm(total=numberOfIterations, desc="Calculating requests", unit=" lines")
    
#     numberOfProcessesToRun = mp.cpu_count()
#     print(f"CPUs: {numberOfProcessesToRun}")
#     multiProcessingPool = mp.Pool(numberOfProcessesToRun)
#     upperIndex = []
#     lowerIndex = []
#     for i in range(numberOfProcessesToRun):
#         lowerval = i*numberOfIterations//numberOfProcessesToRun
#         upperVal = (i+1)*numberOfIterations//numberOfProcessesToRun
#         lowerIndex.append(lowerval)
#         upperIndex.append(upperVal)
#     print("Here")
#     # argumentsToPass = (orderData, i, j,lowerIndex, upperIndex)
#     requests = multiProcessingPool.starmap(getRequestMulti, [(orderData, i, j,lowerIndex, upperIndex)])
#     requests = sum(requests)
#     multiProcessingPool.close()
#     multiProcessingPool.join()
#     return requests



# print("Printing request for 1st region and 1st timeslot")
# print(getRequestHelper(1,1))