inputs: 
- CSVs of photo dates (one for sunrise, one for sunset), with one date per line
- csv of weather data to label

output: 
- labeled weather data

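Note: the CSVs need a header row when they hold one value per line, because `pd.read_csv` treats the first line as the column names (without a header, the first date would be lost).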

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

### Sunrise Date Dictionary
Read in the sunrise and sunset date CSVs to make a dictionary of all dates that have sunrise and/or sunset photos.

In [10]:
# Both date lists live in the "label_denver" folder under the current working directory.
sunrise_path, sunset_path = (
    os.path.join(os.getcwd(), "label_denver", file_name)
    for file_name in ("denver_sunrise.csv", "denver_sunset.csv")
)

In [24]:
# create a dictionary of all dates with photos
#
# dictionary structure:
#     key is the date: the first 10 characters of a CSV cell (e.g. '2022-08-07')
#     value is a tuple of flags (sunrise, sunset)
#     0 for no photos exist, 1 for photo exists
dates = {}

# sunrise: every date found in this file gets its sunrise flag set
df = pd.read_csv(sunrise_path)
for column in df:
    for item in df[column].values:
        # pandas reads empty cells as NaN (a float), which cannot be sliced
        if not isinstance(item, str):
            continue
        dates[item[:10]] = (1, 0)

# sunset: keep the sunrise flag already recorded for the date
# (0 if the date is new) and set the sunset flag
df = pd.read_csv(sunset_path)
for column in df:
    for item in df[column].values:
        if not isinstance(item, str):
            continue
        date = item[:10]
        dates[date] = (dates.get(date, (0, 0))[0], 1)


In [25]:
# Display the dictionary to inspect the (sunrise, sunset) flags stored for each date.
dates

{'2020-08-15': (1, 1),
 '2021-03-10': (1, 1),
 '2020-12-26': (1, 0),
 '2022-03-26': (1, 0),
 '2021-10-23': (1, 0),
 '2022-04-23': (1, 0),
 '2023-01-03': (1, 0),
 '2022-09-03': (1, 0),
 '2022-09-07': (1, 0),
 '2022-08-09': (1, 0),
 '2022-08-07': (1, 1),
 '2022-08-02': (1, 0),
 '2022-08-01': (1, 1),
 '2022-07-31': (1, 0),
 '2022-07-28': (1, 0),
 '2022-10-01': (1, 1),
 '2022-10-02': (1, 0),
 '2022-09-15': (1, 0),
 '2022-01-16': (1, 0),
 '2022-12-09': (1, 0),
 '2020-07-18': (1, 1),
 '2021-11-15': (1, 0),
 '2017-01-22': (1, 0),
 '2022-09-30': (1, 0),
 '2021-12-07': (1, 1),
 '2020-07-04': (1, 0),
 '2011-01-06': (1, 0),
 '2022-09-17': (1, 1),
 '2022-09-02': (1, 0),
 '2021-06-10': (1, 1),
 '2021-11-07': (1, 0),
 '2022-04-24': (1, 0),
 '2022-06-23': (1, 0),
 '2022-06-18': (1, 0),
 '2020-11-18': (1, 0),
 '2020-10-24': (1, 0),
 '2022-01-11': (1, 1),
 '2022-03-28': (1, 0),
 '2022-02-05': (1, 0),
 '2021-11-20': (1, 0),
 '2021-12-06': (1, 0),
 '2021-12-12': (1, 0),
 '2021-12-24': (1, 0),
 '2022-01-1

### Weather days
Read in the CSV with the full weather data.


In [26]:
# weather_data = pd.read_csv("2022_data.csv", usecols = [5])

# for column in weather_data: 
#     for row in weather_data[column].values: 
#         date = row[:10]
#         if date in dates:
#             # print(date)
#             pass


In [27]:
# output = pd.read_csv("2022_data.csv")


In [33]:
output = pd.read_csv("2022_data.csv")

# change conditions into one conditions vector with 0s and 1s:
# every distinct condition name becomes its own column, set to 1 on the rows
# whose 'conditions' text lists it
# (rows are addressed by index label, because .at looks rows up by label,
# not by position)
for label, conditions_text in output['conditions'].items():
    # a missing conditions cell is read as NaN (a float) and has nothing to split
    if not isinstance(conditions_text, str):
        continue
    for condition in conditions_text.split(', '):
        output.at[label, condition] = 1

# other features to change


# add sunrise and sunset labels based on the dates dictionary
# create the label columns up front so they exist (as 0) even when no date in
# this file has a photo; a column that already exists is left alone
for label_column in ('sunrise', 'sunset'):
    if label_column not in output.columns:
        output[label_column] = 0.0

for label, timestamp in output['datetimeStr'].items():
    # the dates dictionary is keyed by the first 10 characters (the date part);
    # its value is a (sunrise, sunset) tuple, (0, 0) stands in for unknown dates
    has_sunrise, has_sunset = dates.get(timestamp[:10], (0, 0))
    if has_sunrise:
        output.at[label, 'sunrise'] = 1.0
    if has_sunset:
        output.at[label, 'sunset'] = 1.0

# also figure out how to populate the NaNs with 0s or whatever makes sense there
# (for now every remaining NaN, in any column, becomes 0)
output = output.fillna(0)

# TODO: delete any column with strings (not done yet)
output

Unnamed: 0,wdir,temp,maxt,visibility,wspd,datetimeStr,solarenergy,heatindex,cloudcover,mint,...,wgust,conditions,windchill,info,Rain,Overcast,Clear,Partially cloudy,sunrise,sunset
0,240.08,12.3,20.9,4.5,15.5,2022-01-01T00:00:00-07:00,0.9,0.0,84.4,0.6,...,0.0,"Rain, Overcast",-12.3,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,169.17,22.5,42.0,9.9,11.0,2022-01-02T00:00:00-07:00,1.3,0.0,7.6,1.0,...,0.0,Clear,-12.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,118.46,37.7,55.0,9.9,9.6,2022-01-03T00:00:00-07:00,2.2,0.0,37.4,20.9,...,28.9,Partially cloudy,10.6,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,270.50,45.1,53.6,9.9,28.6,2022-01-04T00:00:00-07:00,1.7,0.0,35.9,37.1,...,58.8,Partially cloudy,29.2,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,172.63,28.1,51.6,9.4,24.3,2022-01-05T00:00:00-07:00,2.6,0.0,67.4,12.7,...,46.1,Partially cloudy,-1.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,181.00,48.8,61.3,9.9,14.4,2022-12-27T00:00:00-07:00,3.1,0.0,41.0,36.4,...,35.8,Partially cloudy,31.6,0.0,0.0,0.0,0.0,1.0,0.0,0.0
361,125.08,44.0,50.0,9.2,14.2,2022-12-28T00:00:00-07:00,2.3,0.0,79.9,34.1,...,32.5,"Rain, Overcast",25.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
362,197.71,28.2,32.5,8.2,19.0,2022-12-29T00:00:00-07:00,0.6,0.0,69.5,23.5,...,26.4,"Rain, Partially cloudy",16.5,0.0,1.0,0.0,0.0,1.0,1.0,1.0
363,140.79,27.1,35.6,9.9,7.9,2022-12-30T00:00:00-07:00,0.9,0.0,54.3,16.6,...,0.0,Partially cloudy,6.6,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# weather_data = pd.read_csv("2022_data.csv", usecols = [5])
# output = pd.read_csv("2022_data.csv").assign(sunrise = 0.0)
# output

# # next, add sunrise labels.

# for column in weather_data: # weather data only includes dates
#     for i, row in enumerate(weather_data[column].values):
#         date = row[:10]
#         if date in dates:
#             output.at[i,'sunrise']= 1.0
#         # else:
#             # print(date)

# # after that, change conditions into one conditions vector with 0s and 1s 
# for i, row in enumerate(output['conditions']):
#     for condition in row.split(', '):
#         output.at[i,condition] = 1

# # also figure out how to populate the NaNs with 0s or whatever makes sense there

# output = output.fillna(0)

# # delete any column with strings


In [15]:
# Display the current contents of the output table.
output

Unnamed: 0,wdir,temp,maxt,visibility,wspd,datetimeStr,solarenergy,heatindex,cloudcover,mint,...,precipcover,wgust,conditions,windchill,info,sunrise,Rain,Overcast,Clear,Partially cloudy
0,240.08,12.3,20.9,4.5,15.5,2022-01-01T00:00:00-07:00,0.9,0.0,84.4,0.6,...,50.00,0.0,"Rain, Overcast",-12.3,0.0,0.0,1.0,1.0,0.0,0.0
1,169.17,22.5,42.0,9.9,11.0,2022-01-02T00:00:00-07:00,1.3,0.0,7.6,1.0,...,0.00,0.0,Clear,-12.6,0.0,0.0,0.0,0.0,1.0,0.0
2,118.46,37.7,55.0,9.9,9.6,2022-01-03T00:00:00-07:00,2.2,0.0,37.4,20.9,...,0.00,28.9,Partially cloudy,10.6,0.0,1.0,0.0,0.0,0.0,1.0
3,270.50,45.1,53.6,9.9,28.6,2022-01-04T00:00:00-07:00,1.7,0.0,35.9,37.1,...,0.00,58.8,Partially cloudy,29.2,0.0,0.0,0.0,0.0,0.0,1.0
4,172.63,28.1,51.6,9.4,24.3,2022-01-05T00:00:00-07:00,2.6,0.0,67.4,12.7,...,0.00,46.1,Partially cloudy,-1.5,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,181.00,48.8,61.3,9.9,14.4,2022-12-27T00:00:00-07:00,3.1,0.0,41.0,36.4,...,0.00,35.8,Partially cloudy,31.6,0.0,0.0,0.0,0.0,0.0,1.0
361,125.08,44.0,50.0,9.2,14.2,2022-12-28T00:00:00-07:00,2.3,0.0,79.9,34.1,...,16.67,32.5,"Rain, Overcast",25.0,0.0,0.0,1.0,1.0,0.0,0.0
362,197.71,28.2,32.5,8.2,19.0,2022-12-29T00:00:00-07:00,0.6,0.0,69.5,23.5,...,16.67,26.4,"Rain, Partially cloudy",16.5,0.0,1.0,1.0,0.0,0.0,1.0
363,140.79,27.1,35.6,9.9,7.9,2022-12-30T00:00:00-07:00,0.9,0.0,54.3,16.6,...,0.00,0.0,Partially cloudy,6.6,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
# Write the labeled data to disk. index=False keeps pandas from writing the row
# index as an extra unnamed first column. The 'Unnamed: 0' column that came in
# with the input file is a regular column and is still written.
output.to_csv('output_csv.csv', index=False)


# NOTES

### what conditions are in our weather data? 

In [86]:
# Count how many times each weather condition appears in the data.
# key is the condition name, value is the number of occurrences
conditions = {}
for row in output['conditions']:
    # non-text cells (e.g. the 0 that fillna(0) writes into empty cells)
    # list no conditions
    if not isinstance(row, str):
        continue
    for item in row.split(', '):
        # start from 0 for an unseen condition so the first occurrence counts as 1
        conditions[item] = conditions.get(item, 0) + 1

list(conditions.keys())

In [87]:
# Show the distinct condition names (iterating a dict yields its keys).
list(conditions)

['Rain', 'Overcast', 'Clear', 'Partially cloudy']

### make into a vector and replace conditions column

In [90]:
# create a new column for every key in conditions dictionary, name is key (ex: "Partially cloudy") 
# for key in list(conditions.keys()):
#     output = output.assign(key = None)
    
# output


# Q: how to not hardcode this? 
# Q: how to deal with the space? 

# output = output.assign(Rain = 0.0, Overcast = 0.0, Clear = 0.0, Partially_cloudy = 0.0)

# iterate over conditions column and put a 1 in the column named after each
# condition the row lists (the column is created the first time a name appears,
# so nothing is hardcoded and names with spaces work as-is)
# rows are addressed by index label, because .at looks rows up by label,
# not by position
for label, conditions_text in output['conditions'].items():
    # non-text cells (missing values) list no conditions
    if not isinstance(conditions_text, str):
        continue
    for condition in conditions_text.split(', '):
        output.at[label, condition] = 1


In [91]:
# Display the table to check the newly added condition columns.
output 

Unnamed: 0,wdir,temp,maxt,visibility,wspd,datetimeStr,solarenergy,heatindex,cloudcover,mint,...,precipcover,wgust,conditions,windchill,info,sunrise,Rain,Overcast,Clear,Partially cloudy
0,240.08,12.3,20.9,4.5,15.5,2022-01-01T00:00:00-07:00,0.9,0.0,84.4,0.6,...,50.00,0.0,"Rain, Overcast",-12.3,0.0,0.0,1.0,1.0,,
1,169.17,22.5,42.0,9.9,11.0,2022-01-02T00:00:00-07:00,1.3,0.0,7.6,1.0,...,0.00,0.0,Clear,-12.6,0.0,0.0,,,1.0,
2,118.46,37.7,55.0,9.9,9.6,2022-01-03T00:00:00-07:00,2.2,0.0,37.4,20.9,...,0.00,28.9,Partially cloudy,10.6,0.0,1.0,,,,1.0
3,270.50,45.1,53.6,9.9,28.6,2022-01-04T00:00:00-07:00,1.7,0.0,35.9,37.1,...,0.00,58.8,Partially cloudy,29.2,0.0,0.0,,,,1.0
4,172.63,28.1,51.6,9.4,24.3,2022-01-05T00:00:00-07:00,2.6,0.0,67.4,12.7,...,0.00,46.1,Partially cloudy,-1.5,0.0,0.0,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,181.00,48.8,61.3,9.9,14.4,2022-12-27T00:00:00-07:00,3.1,0.0,41.0,36.4,...,0.00,35.8,Partially cloudy,31.6,0.0,0.0,,,,1.0
361,125.08,44.0,50.0,9.2,14.2,2022-12-28T00:00:00-07:00,2.3,0.0,79.9,34.1,...,16.67,32.5,"Rain, Overcast",25.0,0.0,0.0,1.0,1.0,,
362,197.71,28.2,32.5,8.2,19.0,2022-12-29T00:00:00-07:00,0.6,0.0,69.5,23.5,...,16.67,26.4,"Rain, Partially cloudy",16.5,0.0,1.0,1.0,,,1.0
363,140.79,27.1,35.6,9.9,7.9,2022-12-30T00:00:00-07:00,0.9,0.0,54.3,16.6,...,0.00,0.0,Partially cloudy,6.6,0.0,0.0,,,,1.0
