## Prepping data for learning
### Method: based on the sunrise hour, is the sunrise beautiful?


inputs: 
- CSV files of photo date-times (one for sunrise photos, one for sunset photos), one entry per line
- csv of weather data to label

output: 
- labeled weather data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

### Sunrise Date Dictionary
Read in the sunrise and sunset date CSVs to build a dictionary of all dates that have a sunrise or sunset photo.

dictionary structure: 
- key is date
- value is a tuple of two flags, `(sunrise, sunset)`
- each flag is 0 if no photo exists for that date and 1 if a photo exists

In [2]:
# All input CSVs live in the "data2011-2017" folder under the current working
# directory; build the three absolute paths in one pass.
sunrise_path, sunset_path, weather_path = (
    os.path.join(os.getcwd(), "data2011-2017", csv_name)
    for csv_name in ("denver_sunrise.csv", "denver_sunset.csv", "weather_data.csv")
)


In [277]:
# Build lookup tables of the real sunrise and sunset times for every date in the
# weather data. They are used to verify whether a photo's timestamp is plausible:
# any photo whose time is not within an hour of the real sunrise or sunset is
# taken out of the dataset.

def _minutes_after_midnight_by_date(timestamps):
    """Map each timestamp's date to its time of day in minutes after midnight.

    Parameters
    ----------
    timestamps : iterable of str
        Timestamp strings. The date is read from characters 0-9, the hour from
        characters 11-12 and the minute from characters 14-15 (assumes a
        ``YYYY-MM-DD?HH:MM...`` layout -- TODO confirm against the weather CSV).

    Returns
    -------
    dict
        ``{date string: hour * 60 + minute}``. When a date occurs more than
        once, the last occurrence wins.

    Raises
    ------
    ValueError
        If the hour or minute characters of an entry are not an integer.
    """
    minutes_by_date = {}
    for stamp in timestamps:
        # Missing CSV cells are read by pandas as float NaN, which cannot be sliced.
        if not isinstance(stamp, str):
            continue
        minutes_by_date[stamp[:10]] = int(stamp[11:13]) * 60 + int(stamp[14:16])
    return minutes_by_date


times = pd.read_csv(weather_path, usecols=['Sunrise', 'Sunset'])
sunrise_times = _minutes_after_midnight_by_date(times['Sunrise'])
sunset_times = _minutes_after_midnight_by_date(times['Sunset'])
    

In [283]:
# Create a dictionary of all dates with a trustworthy photo.
#   key   -> date string (first 10 characters of the photo timestamp)
#   value -> (sunrise, sunset) tuple, 1 if a valid photo exists, else 0

def _valid_photo_dates(csv_path, real_times, window_minutes=60):
    """Yield the date of every photo taken close enough to the real event time.

    Parameters
    ----------
    csv_path : str
        CSV of photo timestamps. Every column of the file is scanned.
    real_times : dict
        ``{date string: minutes after midnight}`` of the real sunrise or sunset.
    window_minutes : int, optional
        A photo is valid when its time differs from the real time by strictly
        less than this many minutes (default 60).

    Yields
    ------
    str
        The date of each valid photo, in file order. A date is yielded once per
        valid photo, so it may repeat.
    """
    # NOTE(review): read_csv treats the first line as a header row; if the file
    # has no header, the first photo timestamp is not checked -- confirm.
    photos = pd.read_csv(csv_path)
    for column in photos:
        for stamp in photos[column].values:
            # Empty cells are read as float NaN and carry no timestamp.
            if not isinstance(stamp, str):
                continue
            date = stamp[:10]
            real_time = real_times.get(date)
            # A photo dated outside the weather data cannot be verified; skip it
            # instead of failing on `None - 60`.
            if real_time is None:
                continue
            photo_time = int(stamp[11:13]) * 60 + int(stamp[14:16])
            if abs(photo_time - real_time) < window_minutes:
                yield date


dates = {}

# sunrise: every valid date starts out as "sunrise photo only"
for date in _valid_photo_dates(sunrise_path, sunrise_times):
    dates[date] = (1, 0)

# sunset: keep the sunrise flag if the date is already known (0 otherwise)
# and set the sunset flag to 1
for date in _valid_photo_dates(sunset_path, sunset_times):
    sunrise_flag = dates[date][0] if date in dates else 0
    dates[date] = (sunrise_flag, 1)
        
        

In [284]:
# # create a dictionary of all dates with photos
# dates = {}

# # sunrise
# df = pd.read_csv(sunrise_path)

# for column in df: 
#     columnSeriesObj = df[column]
#     for item in columnSeriesObj.values:
#         date = item[:10]
#         # if it is within sunrise times 
#         dates[date] = (1,0)
    
# # sunset
# df = pd.read_csv(sunset_path)

# for column in df: 
#     columnSeriesObj = df[column]
#     for item in columnSeriesObj.values:
#         date = item[:10]
#         # if it is already in dictionary,
#         # keep sunrise value in the tuple, make the sunset value a 1
#         if date in dates: 
#             dates[date] = (dates[date][0], 1) 
#         # if not in the dictionary,
#         # add to the dictionary with 0 for sunrise value
#         else:
#             dates[date] = (0,1)
# og_dates = dates


denver cleaning: 

- 396 sunrise dates removed
- 570 sunset dates removed
- 816 remained the same


### Clean the weather data
- read in the csv with full weather data as a pandas dataframe


In [285]:
# Load the complete weather table (all columns) as a DataFrame.
# NOTE(review): `input_file` is not referenced by any later cell visible in this notebook.
input_file = pd.read_csv(filepath_or_buffer=weather_path)


find out what conditions exist

In [286]:
# Survey which condition and weather-type labels occur in the weather data and
# add one zero-filled placeholder column per label.
# NOTE(review): the following cell re-reads the CSV into `output`, so the
# placeholder columns created here do not carry over to it.
output = pd.read_csv(weather_path)


def _count_labels(cells):
    """Count how often each label occurs in a column of ', '-separated labels.

    Parameters
    ----------
    cells : iterable
        Column values; each string is split on ', '. Non-string values
        (e.g. NaN for an empty cell) are ignored.

    Returns
    -------
    dict
        ``{label: number of occurrences}``, keyed in first-seen order.
    """
    counts = {}
    for cell in cells:
        if not isinstance(cell, str):
            continue
        for label in cell.split(', '):
            # Start at 1 on first sight so the value is a true occurrence count.
            counts[label] = counts.get(label, 0) + 1
    return counts


conditions = _count_labels(output['Conditions'])
types = _count_labels(output['Weather Type'])

# make a new column, in first-seen order, for every condition
for condition in conditions:
    output["Condition: " + condition] = 0.0

# make a new column, in first-seen order, for every weather type
# (the loop variable is not called `type`, which would shadow the builtin
# for the rest of the kernel session)
for weather_type in types:
    output["Type: " + weather_type] = 0.0

In [287]:
# Build the table that is written out for learning: one-hot condition flags,
# month/hour features and sunrise/sunset photo labels, with every text column
# removed at the end.
output = pd.read_csv(weather_path)

# Free up the names 'Sunrise' and 'Sunset' for the label columns created below;
# the columns read from the CSV hold the sunrise/sunset date-times.
output = output.rename(columns={'Sunrise': 'Sunrise Time', 'Sunset': 'Sunset Time'})

# TODO (idea from the original notes): encode the sunrise time as a float,
# i.e. current time in minutes / total minutes in a day, using
# output['Sunrise Time'].str[11:13] and .str[14:16]
# (https://note.nkmk.me/en/python-pandas-str-slice/).

# change conditions into one conditions vector with 0s and 1s --> Condition: <condition>
for i, conditions_text in output['Conditions'].items():
    # An empty cell is read as float NaN and has nothing to encode.
    if not isinstance(conditions_text, str):
        continue
    for condition in conditions_text.split(', '):
        output.at[i, "Condition: " + condition] = 1

# 'Weather Type' is not one-hot encoded; the column is dropped below.

# date features, sliced straight out of the 'Date time' string;
# astype(float) keeps the float values (12.0, 1.0, ...) shown in the saved output
date_time = output['Date time']
output['Month'] = date_time.str[5:7].astype(float)
output['Hour'] = date_time.str[-8:-6].astype(float)

# add sunrise and sunset labels based on the dates dictionary:
# first tuple value -> sunrise label, second tuple value -> sunset label.
# Both columns are always created, in this order, even if no date matches.
# NOTE(review): the lookup uses the date only, so every row of a labelled date
# gets the label regardless of its hour -- confirm this is intended.
day = date_time.str[:10]
no_photo = (0, 0)
output['Sunrise'] = day.map(lambda d: 1.0 if dates.get(d, no_photo)[0] else 0.0)
output['Sunset'] = day.map(lambda d: 1.0 if dates.get(d, no_photo)[1] else 0.0)

# populate NaNs with 0.0s
output = output.fillna(0)

# delete any column with strings or unnecessary info
output = output.drop(
    columns=[
        'Moon Phase', 'Minimum Temperature', 'Precipitation Cover',
        'Maximum Temperature', 'Latitude', 'Longitude', 'Date time', 'Info',
        'Address', 'Conditions', 'Weather Type', 'Sunrise Time', 'Sunset Time',
        'Resolved Address', 'Name',
    ]
)


In [288]:
# Show the names of the columns that remain after cleaning.
output.columns.tolist()

['Temperature',
 'Dew Point',
 'Relative Humidity',
 'Heat Index',
 'Wind Speed',
 'Wind Gust',
 'Wind Direction',
 'Wind Chill',
 'Precipitation',
 'Snow Depth',
 'Visibility',
 'Cloud Cover',
 'Sea Level Pressure',
 'Condition: Overcast',
 'Condition: Partially cloudy',
 'Condition: Clear',
 'Condition: Rain',
 'Month',
 'Hour',
 'Sunrise',
 'Sunset']

In [289]:
# Display the prepared frame; the saved output below shows only a truncated
# preview of its first and last rows.
output

Unnamed: 0,Temperature,Dew Point,Relative Humidity,Heat Index,Wind Speed,Wind Gust,Wind Direction,Wind Chill,Precipitation,Snow Depth,...,Cloud Cover,Sea Level Pressure,Condition: Overcast,Condition: Partially cloudy,Condition: Clear,Condition: Rain,Month,Hour,Sunrise,Sunset
0,22.1,15.2,74.09,0.0,7.7,0.0,210.0,13.1,0.0,0.00,...,97.8,1025.9,1.0,0.0,0.0,0.0,12.0,0.0,1.0,0.0
1,22.2,15.0,73.70,0.0,5.6,0.0,223.0,14.9,0.0,0.00,...,89.0,1024.8,1.0,0.0,0.0,0.0,12.0,1.0,1.0,0.0
2,22.8,15.9,74.28,0.0,4.1,0.0,215.0,17.4,0.0,0.00,...,97.8,1024.0,1.0,0.0,0.0,0.0,12.0,2.0,1.0,0.0
3,22.8,16.1,74.84,0.0,6.1,0.0,204.0,15.2,0.0,0.00,...,68.5,1023.2,0.0,1.0,0.0,0.0,12.0,3.0,1.0,0.0
4,22.8,15.7,73.66,0.0,6.3,0.0,201.0,15.1,0.0,0.00,...,63.3,1022.9,0.0,1.0,0.0,0.0,12.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63920,6.9,-6.0,54.86,0.0,8.9,0.0,213.0,-6.6,0.0,2.47,...,21.9,1023.4,0.0,0.0,1.0,0.0,1.0,20.0,1.0,1.0
63921,7.6,-6.5,51.89,0.0,9.3,0.0,215.0,-6.0,0.0,2.44,...,5.5,1022.3,0.0,0.0,1.0,0.0,1.0,21.0,1.0,1.0
63922,6.9,-6.5,53.25,0.0,10.2,0.0,208.0,-7.5,0.0,2.41,...,0.0,1022.2,0.0,0.0,1.0,0.0,1.0,22.0,1.0,1.0
63923,6.1,-6.5,55.30,0.0,11.5,0.0,202.0,-9.5,0.0,2.38,...,0.0,1022.2,0.0,0.0,1.0,0.0,1.0,23.0,1.0,1.0


In [290]:
# Write the prepared data to disk; index=False leaves the DataFrame's row index
# out of the file, so the CSV contains only the data columns.
output.to_csv(path_or_buf='prepped_data_1.csv', index=False)


In [291]:
# diff_sunrise = 0
# diff_sunset = 0
# same = 0

# for og, new in zip(og_dates.keys(), dates.keys()):
#     if og_dates.get(og)[0] != dates.get(new)[0]:
#         diff_sunrise +=1
#     if og_dates.get(og)[1] != dates.get(new)[1]:

        
# print(diff_sunrise, diff_sunset, same)