## Prepping data for learning
### Method: based on the sunrise hour, is the sunrise beautiful?
            based on the sunset hour, is the sunset beautiful?

inputs: (all for the same location over same time frame)
- weather csv 
- sunrise dates csv
- sunset dates csv

output: 
- data3 / prepped_sunrise / location.csv
- data3 / prepped_sunset / location.csv

 what we need: 
 - iterate through the data, only keep rows whose time is within sunset time
 - same with sunrise time 
    
code: 

    for day in data: 
        find sunrise time for the day (should be the sunrise column) 
        turn that time into minutes, store as sunrise time
            
            for hour in day: 
                find the weather vector time 
                turn that time into minutes
                
                if sunrise time - 60 < vector time < sunrise time + 60 :
                    put row in sunrise data frame


In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

### Sunrise/Sunset Date Dictionary
Read in the sunrise and sunset dates csvs to make a dictionary of all dates with sunrise or sunset photos

dictionary structure: 
- key is date
- value is a tuple of two flags in the order (sunrise, sunset), e.g. (1, 0)
- in each position, 0 means no photo exists and 1 means a photo exists

In [12]:
# All three input csvs live in the "data2011-2017" folder under the current working directory.
data_dir = os.path.join(os.getcwd(), "data2011-2017")

sunrise_path = os.path.join(data_dir, "denver_sunrise.csv")  # sunrise photo datetimes
sunset_path = os.path.join(data_dir, "denver_sunset.csv")  # sunset photo datetimes
weather_path = os.path.join(data_dir, "weather_data.csv")  # weather rows with Sunrise/Sunset columns

# CHANGE OUTPUT PATH LATER
output_path = 'prepped_data_2.csv'

In [None]:
## CLEAN TIMES

In [None]:
# Build lookup tables of the real sunrise and sunset times, keyed by date.
# They are used to verify whether a photo datetime is accurate or not:
# any date whose photo time is not within an hour of sunrise or sunset
# is taken out of the dataset.
def _minutes_by_date(stamps):
    """Map each timestamp's date to its time of day in minutes.

    The date is the first 10 characters of the timestamp, the hour is
    characters 11-12 and the minute is characters 14-15.  When the same
    date occurs more than once, the last occurrence wins.
    """
    minutes = {}
    for stamp in stamps:
        minutes[stamp[:10]] = int(stamp[11:13]) * 60 + int(stamp[14:16])
    return minutes


times = pd.read_csv(weather_path, usecols=['Sunrise', 'Sunset'])
sunrise_times = _minutes_by_date(times['Sunrise'])
sunset_times = _minutes_by_date(times['Sunset'])


In [None]:
## GET DATE DICTIONARY

In [14]:
# Create a dictionary of all dates with validated photos.
#   key:   date (first 10 characters of the photo datetime)
#   value: (sunrise flag, sunset flag); 1 means a photo exists, 0 means none
dates = {}

# Each entry is (position in the flag tuple, photo csv, real event times).
# Sunrise is processed first, then sunset, so a sunset photo on a date that
# already has a sunrise photo keeps the sunrise flag and only sets the sunset one.
for slot, photo_path, real_times in (
    (0, sunrise_path, sunrise_times),
    (1, sunset_path, sunset_times),
):
    df = pd.read_csv(photo_path)

    # every column of the photo csv is scanned for datetimes
    for column in df:
        for item in df[column].values:
            date = item[:10]
            photo_time = int(item[11:13]) * 60 + int(item[14:16])

            real_time = real_times.get(date)
            if real_time is None:
                # The weather data has no sunrise/sunset time for this date,
                # so the photo time cannot be validated; skip it instead of
                # failing on `None - 60`.
                continue

            # Keep the photo only if its alleged time is strictly within
            # 60 minutes of the real sunrise/sunset time for that date.
            if real_time - 60 < photo_time < real_time + 60:
                flags = list(dates.get(date, (0, 0)))
                flags[slot] = 1
                dates[date] = tuple(flags)
    


### Clean the weather data
- read in the csv with full weather data as a pandas dataframe


In [None]:
# output = pd.read_csv(weather_path)
# # find out what conditions exist
# conditions = {}
# for row in output['Conditions']:
#     for item in row.split(', '):
#         if item not in conditions: 
#             conditions[item] = 0
#         else: 
#             conditions[item] += 1
            
# # see dataprep1.ipynb for weather type processing

# # make a new column in this order for every condition
# for condition in list(conditions):
#     output.at[:,"Condition: " + condition] = 0.0

In [18]:
# Load the full weather table; derived columns and labels are added to it below.
output = pd.read_csv(weather_path)

# Rename the raw sunrise/sunset datetime columns so the names
# 'Sunrise' and 'Sunset' are free for the label columns added below.
output = output.rename(columns={'Sunrise': 'Sunrise Time', 'Sunset': 'Sunset Time'})

# Turn the comma-separated conditions text into one indicator column per
# condition, named "Condition: <condition>", set to 1 where it applies.
for row_idx, conditions_text in enumerate(output['Conditions']):
    for condition_name in conditions_text.split(', '):
        output.at[row_idx, f"Condition: {condition_name}"] = 1

# Add the month/hour features and the sunrise/sunset labels.
for row_idx, stamp in enumerate(output['Date time']):
    # month is characters 5-6 of the timestamp; hour is read from the end
    output.at[row_idx, "Month"] = int(stamp[5:7])
    output.at[row_idx, "Hour"] = int(stamp[-8:-6])

    # dates maps a date to (sunrise flag, sunset flag); dates without
    # any photo get no label here
    has_sunrise, has_sunset = dates.get(stamp[:10], (0, 0))
    if has_sunrise:
        output.at[row_idx, 'Sunrise'] = 1.0
    if has_sunset:
        output.at[row_idx, 'Sunset'] = 1.0
    

# START WORK HERE!!!! 
TODO: make a new dataframe for the sunrise hours 
    - take only the rows whose time aligns with the sunrise time 
    - possibly move the sunrise label column so that it is the LAST column 
and make a second new dataframe for the sunset hours


In [None]:

# FOR THIS VERSION, ONLY KEEP SUNRISE TIME vectors (store in sunrise output)
# string slicing on a column: https://note.nkmk.me/en/python-pandas-str-slice/
# NOTE(review): `hour` is not read again in this cell
hour = output['Sunrise Time'].str[11:13]

# Collect the rows whose "Hour" differs from the hour of that row's sunrise
# time (characters 11-12 of "Sunrise Time").
# NOTE! RIGHT NOW, these bad hours are just dropped, but in the future the
# matching rows will go into a new sunrise dataframe instead.
hour_column = output['Hour']
to_drop = [
    row_idx
    for row_idx, sunrise_stamp in enumerate(output['Sunrise Time'])
    if int(sunrise_stamp[11:13]) != hour_column.get(row_idx)
]



In [None]:
# Remove every row collected in to_drop (the non-sunrise hours).
output = output.drop(index=to_drop)

# Replace NaNs with 0 so that unset indicator/label cells read as "absent".
output = output.fillna(0)

# Columns that hold strings or information the prepared data does not need.
unused_columns = [
    'Moon Phase', 'Heat Index', 'Minimum Temperature', 'Precipitation Cover',
    'Maximum Temperature', 'Latitude', 'Longitude', 'Date time', 'Info',
    'Address', 'Conditions', 'Weather Type', 'Sunrise Time', 'Sunset Time',
    'Resolved Address', 'Name',
]
output = output.drop(columns=unused_columns)

# index=False leaves the dataframe's row index out of the csv
output.to_csv(output_path, index=False)