In [57]:
# Map visitor data onto scraped weather data
from datetime import datetime, timedelta
from random import randint
import csv

In [58]:
# Path of the CSV file that main() writes.
filename = "PrepData.csv"

# Window length in days (2.5 years). In the visible cells it is only referenced
# by the commented-out startDate computation below.
N = 365 * 2.5

# [min, max] bounds handed to randint() when generating synthetic visitor counts.
visitorRange = [1500, 28000]		# only referenced by a commented-out line in main()
visitorRangeMonday = [1, 250]		# used for Mondays (weekday() == 0)
visitorBiasRange = [500, 2000]		# size of the random +/- offset around the monthly average

# main() iterates while currentDate < lastDate, so lastDate itself is not written.
lastDate = datetime.strptime("2018-12-28", "%Y-%m-%d")
# startDate = lastDate - timedelta(days = N)	# from last real data
startDate = datetime.strptime("2015-01-01", "%Y-%m-%d")	# set start date according to real data availability

# Dates ('%Y-%m-%d') that isDateSpecialHoliday() reports as holidays.
# The original line ended with a backtick and no closing bracket, which is a syntax error.
additionalHoliday = ['2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01']

# {year: {first CSV column: second CSV column}}, filled by readRealData().
# main() parses the keys as '%Y-%m-%d' dates and the values as visitor counts ("" = no data).
existingVisitorData = dict()
# One slot per calendar month (index 0 = January); computed by main().
realMonthlyAverage = [0] * 12
# Days per month for a non-leap year; used as the divisor of the monthly average.
dayOfMonth = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

# {year: {first CSV column: [columns 2..6]}}, filled by readWeatherData().
existingWeatherData = dict()

In [59]:
def readRealData(filelocation, year):
	"""Load one visitor CSV file into ``existingVisitorData[year]``.

	Every non-blank row is stored as ``row[0] -> row[1]`` (both kept as the
	strings read from the file). Any previous content for ``year`` is replaced.

	Args:
		filelocation: path of the comma-separated file to read.
		year: key under which the rows are stored in ``existingVisitorData``.

	Raises:
		OSError: if the file cannot be opened.
		IndexError: if a non-blank row has fewer than two columns.
	"""
	print(filelocation)
	# newline='' lets the csv module do its own line-ending handling, as its
	# documentation recommends for file objects passed to csv.reader.
	with open(filelocation, newline='') as csv_file:
		yearData = existingVisitorData[year] = dict()

		for row in csv.reader(csv_file, delimiter=','):
			if not row:
				# csv.reader yields [] for blank lines (e.g. a trailing empty
				# line); indexing row[1] on it would raise IndexError.
				continue
			yearData[row[0]] = row[1]

def readWeatherData(filelocation, year):
	"""Load one weather CSV file into ``existingWeatherData[year]``.

	Every non-blank row is stored as ``row[0] -> [row[1], ..., row[5]]`` with
	all values kept as strings. main() reads the five stored values as:
	two condition strings, max temperature, min temperature, wind.
	Any previous content for ``year`` is replaced.

	Args:
		filelocation: path of the comma-separated file to read.
		year: key under which the rows are stored in ``existingWeatherData``.

	Raises:
		OSError: if the file cannot be opened.
		IndexError: if a non-blank row has fewer than six columns.
	"""
	print(filelocation)
	# newline='' lets the csv module do its own line-ending handling, as its
	# documentation recommends for file objects passed to csv.reader.
	with open(filelocation, newline='') as csv_file:
		yearData = existingWeatherData[year] = dict()

		for row in csv.reader(csv_file, delimiter=','):
			if not row:
				# csv.reader yields [] for blank lines; skip them instead of
				# failing on row[0].
				continue
			# Explicit indexing (not a slice) so a short row still fails here
			# rather than later inside main().
			yearData[row[0]] = [row[1], row[2], row[3], row[4], row[5]]

def randomlySplitTotal(total, splitTo):
	"""Randomly break ``total`` into parts whose sum is ``total``.

	The first ``splitTo - 1`` parts are each drawn with randint() between
	one tenth and one half (both truncated to int) of what is still left;
	the final part is whatever remains.

	Args:
		total: amount to distribute.
		splitTo: number of parts wanted; values below 2 yield ``[total]``.

	Returns:
		list of parts, in the order they were drawn, remainder last.
	"""
	parts = []
	remaining = total

	for _ in range(splitTo - 1):
		lowest = int(remaining / 10)
		highest = int(remaining / 2)
		piece = randint(lowest, highest)

		parts.append(piece)
		remaining -= piece

	parts.append(remaining)
	return parts

def isDateWeekend(dateObject):
    """Return True when ``dateObject`` falls on a Saturday or Sunday."""
    # weekday() counts from Monday = 0, so 5 and 6 are Saturday and Sunday.
    return dateObject.weekday() in (5, 6)
    
def isDateSpecialHoliday(dateObject):
    """Return True when ``dateObject`` is listed in ``additionalHoliday``.

    The date is compared in its '%Y-%m-%d' string form.
    """
    dateKey = dateObject.strftime('%Y-%m-%d')
    return dateKey in additionalHoliday

In [60]:
def _computeMonthlyAverage(years):
	"""Return 12 per-month visitor averages from the loaded real data of ``years``."""
	monthlyTotals = [0] * 12
	for year in years:
		for key, value in existingVisitorData[year].items():
			if value != "":
				monthIndex = datetime.strptime(key, "%Y-%m-%d").month - 1
				monthlyTotals[monthIndex] += int(value)

	# NOTE(review): the divisor is the nominal (non-leap) day count times the
	# number of years, although empty cells are left out of the sum. Kept as
	# in the original computation.
	return [int(monthlyTotals[m] / (dayOfMonth[m] * len(years))) for m in range(12)]


def _pickVisitorTotal(currentDate):
	"""Return the visitor total for ``currentDate`` (synthetic up to 2015, real afterwards)."""
	if currentDate.year <= 2015:
		# No real data is loaded for 2015 and earlier: generate a value.
		if currentDate.weekday() == 0:
			# Mondays get their own, much smaller, range.
			return randint(visitorRangeMonday[0], visitorRangeMonday[1])

		# Real monthly average plus or minus a random bias.
		baseline = realMonthlyAverage[currentDate.month - 1]
		gainLoss = -1 if randint(1, 2) == 2 else 1
		bias = randint(visitorBiasRange[0], visitorBiasRange[1])
		# Clamp at 0: a negative total is meaningless and makes
		# randomlySplitTotal() call randint() with an empty range.
		return max(0, baseline + gainLoss * bias)

	# A date that is missing from the file is treated like an empty cell
	# instead of raising KeyError.
	rawValue = existingVisitorData.get(currentDate.year, {}).get(currentDate.strftime('%Y-%m-%d'), "")
	if rawValue == "":
		return 9999	# empty data
	return int(rawValue)


def _labelWeather(weatherData):
	"""Convert one stored weather row into (minTemp, maxTemp, clear, cloudy, rain, snow, storm, wind)."""
	stormCondition = ('rainstorm', 'heavy rain', 'thunder storm')
	conditions = (weatherData[0], weatherData[1])

	maxTemp = int(weatherData[2])
	minTemp = int(weatherData[3])
	wind = int(weatherData[4])

	# NOTE(review): no rule ever sets `clear`, so that column is always 0.
	clear = 0
	cloudy = 0
	rain = 0
	snow = 0
	storm = 0

	# weather labelling (substring match on either condition string)
	if any('cloudy' in cond for cond in conditions):
		cloudy = 1

	if any('rain' in cond or 'shower' in cond for cond in conditions):
		rain = 1

	if any('sleet' in cond for cond in conditions):
		rain = 1
		snow = 1

	# Storm labels are matched against the whole condition string, not as substrings.
	if any(cond in stormCondition for cond in conditions):
		rain = 1
		storm = 1

	return (minTemp, maxTemp, clear, cloudy, rain, snow, storm, wind)


def main():
	"""Build the prepared data set and write it to ``filename``.

	Steps:
	  1. Load the 2016-2018 visitor files and the 2011-2018 weather files.
	  2. Recompute ``realMonthlyAverage`` from the visitor data. The list is
	     overwritten in place, so calling main() again does not accumulate.
	  3. For every day from ``startDate`` up to (not including) ``lastDate``
	     write one line with weather labels, weekend/holiday flags, the
	     visitor total, its random gender/age splits and a flag telling
	     whether the total reaches the monthly average.

	Days without weather data get 0 for every weather column, wind included.
	"""
	realYears = (2016, 2017, 2018)

	# read 3 years data
	for year in realYears:
		readRealData('RealData/%d-Table 1.csv' % year, year)

	# read weather data
	for year in range(2011, 2019):
		readWeatherData('ScrapedWeather/%d.csv' % year, year)

	# calculate monthly average from 3 years data; slice assignment keeps the
	# module-level list object while discarding values from earlier runs
	realMonthlyAverage[:] = _computeMonthlyAverage(realYears)

	# `with` closes the output file even if a row fails halfway through
	with open(filename, "w") as f:
		f.write("date, minTemp, maxTemp, clear, cloudy, rain, snow, storm, wind, weekend, holiday, visitorTotal, visitorMale, visitorFemale, visitorKids, VisitorTeenage, VisitorAdult, VisitorElderly, OverMonthlyAverage \n")			# header

		currentDate = startDate

		while currentDate < lastDate:
			dateKey = currentDate.strftime('%Y-%m-%d')

			visitorTotal = _pickVisitorTotal(currentDate)

			splitGender = randomlySplitTotal(visitorTotal, 2)		# split total visitor by gender
			splitAge = randomlySplitTotal(visitorTotal, 4)			# split total visitor by age (4 groups)

			# if weather data does not exist -> use default values (tolerate error);
			# wind is part of the defaults so it can neither be undefined nor
			# carried over from the previous day
			weatherData = existingWeatherData.get(currentDate.year, {}).get(dateKey)
			if weatherData is None:
				minTemp, maxTemp, clear, cloudy, rain, snow, storm, wind = (0, 0, 0, 0, 0, 0, 0, 0)
			else:
				minTemp, maxTemp, clear, cloudy, rain, snow, storm, wind = _labelWeather(weatherData)

			isWeekend = isDateWeekend(currentDate)
			isHoliday = isDateSpecialHoliday(currentDate)

			# classification done by comparing total visitor with monthly average - unused
			classCond = 0
			if visitorTotal >= realMonthlyAverage[currentDate.month - 1]:
				classCond = 1

			f.write("%s,"% (dateKey))
			f.write("%d, %d, %d, %d, %d, %d, %d, %d, %d, %d," % (minTemp, maxTemp, clear, cloudy, rain, snow, storm, wind, isWeekend, isHoliday)) # line
			f.write("%d, %d, %d, %d, %d, %d, %d," % (visitorTotal, splitGender[0], splitGender[1], splitAge[0], splitAge[1], splitAge[2], splitAge[3]))		# line cont
			f.write("%d\n" % (classCond))		# line cont

			currentDate = currentDate + timedelta(days=1)


In [61]:
# Run the pipeline: main() loads the CSV inputs (printing each path) and writes the file named by `filename`.
main()

RealData/2016-Table 1.csv
RealData/2017-Table 1.csv
RealData/2018-Table 1.csv
ScrapedWeather/2011.csv
ScrapedWeather/2012.csv
ScrapedWeather/2013.csv
ScrapedWeather/2014.csv
ScrapedWeather/2015.csv
ScrapedWeather/2016.csv
ScrapedWeather/2017.csv
ScrapedWeather/2018.csv
