In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from IPython.display import clear_output

# Download the zipped dataset from Google Drive by file id.
!gdown --id 1z2-yb17G56b5Etqpkrj2sX1VC3CH8BZL
# Extract it, then move the nested content/dataset folder up to /content/dataset.
!unzip /content/processed_dataset.zip
!mv /content/content/dataset /content/dataset
# Remove the archive and the leftover extraction folder.
!rm -rf /content/processed_dataset.zip
!rm -rf /content/content/
# Delete the processed_*.csv files; this notebook writes its own processed_*.pkl files further down.
!rm -rf /content/dataset/processed_test.csv
!rm -rf /content/dataset/processed_train.csv
# Move test.csv out of test/: process_test treats every entry of test/ as a scenario folder.
!mv /content/dataset/test/test.csv /content/dataset/test.csv

# pyprind provides the progress bars used by the preprocessing and dataset code below.
!pip install pyprind

# Hide the shell output produced by the commands above.
clear_output()

In [None]:
# from IPython.display import clear_output

# !gdown --id 1BwrMSrNb1Kz27AU0JBkS7V-Va7iagoo1
# !mkdir /content/dataset
# !unzip /content/processed_dataset_new.zip -d /content/dataset
# !rm -rf /content/processed_dataset_new.zip

# !pip install pyprind

# clear_output()

---

In [None]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms

import os
import glob
from tqdm import trange
import pyprind

import cv2
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt

In [None]:
def get_date(date):
    """Convert an 'M/D' string into 'MMDD'.

    The input is split on '/' into exactly two parts; a part that is a single
    character long gets a leading '0', any other part is used unchanged.
    """
    def pad(part):
        # Only one-character parts are padded.
        return part if len(part) != 1 else "0" + part

    month, day = date.split('/')
    return pad(month) + pad(day)

def get_time(time):
    """Join the two ':'-separated fields of ``time`` and append '00'.

    Example: '07:30' -> '073000'. The input must contain exactly one ':'.
    """
    first, second = time.split(':')
    return "{}{}00".format(first, second)

___

In [None]:
def make_dict_train(data):
    """Label the 17 column sequences of one training window.

    ``data`` is indexed by column position: ``data[0]`` is the date column (only
    its first element is kept, under "Date") and ``data[1]`` .. ``data[16]`` are
    stored whole under the keys listed below, in that order.
    """
    # Keys for positions 1..16, in positional order.
    series_keys = (
        "Time",
        "Global CMP22",
        "Direct sNIP",
        "Azimuth Angle",
        "Tower Dry Bulb Temperature",
        "Tower Wet Bulb Temperature",
        "Tower Dew Point Temperature",
        "Tower RH",
        "Total Cloud Cover",
        "Peak Wind Speed",
        "Avgerage Wind Direction",
        "Station Pressure",
        "Precipitation",
        "Snow Depth",
        "Moisture",
        "Albedo",
    )

    entry = {"Date": data[0][0]}
    for position, key in enumerate(series_keys, start=1):
        entry[key] = data[position]
    return entry

def make_dict_test(data):
    """Label the 16 column sequences of one test window.

    ``data[0]`` is the time column; every element is converted with ``int`` and
    the resulting list is stored under "Time". ``data[1]`` .. ``data[15]`` are
    stored whole under the keys listed below, in that order.
    """
    entry = {"Time": [int(value) for value in data[0]]}

    # Keys for positions 1..15, in positional order.
    weather_keys = (
        "Global CMP22",
        "Direct sNIP",
        "Azimuth Angle",
        "Tower Dry Bulb Temperature",
        "Tower Wet Bulb Temperature",
        "Tower Dew Point Temperature",
        "Tower RH",
        "Total Cloud Cover",
        "Peak Wind Speed",
        "Avgerage Wind Direction",
        "Station Pressure",
        "Precipitation",
        "Snow Depth",
        "Moisture",
        "Albedo",
    )
    entry.update((key, data[position]) for position, key in enumerate(weather_keys, start=1))

    return entry

In [None]:
def normalize_columns(data):
    """Return a copy of ``data`` with the known weather columns rescaled by fixed bounds.

    Each recognised column becomes ``(value - low + eps) / abs(high - low + eps)``
    with ``eps = 1e-8``. 'DATE (MM/DD)', 'MST', 'Time [Mins]' and any column that
    is not listed in the table below are left as they are. The input frame itself
    is not modified.
    """
    eps = 1e-8

    # column name -> (low, high) bounds used for the rescaling
    bounds = {
        'Global CMP22 (vent/cor) [W/m^2]': (-5, 4000),
        'Direct sNIP [W/m^2]': (-15, 1200),
        'Azimuth Angle [degrees]': (0, 360),
        'Tower Dry Bulb Temp [deg C]': (-20, 32),
        'Tower Wet Bulb Temp [deg C]': (-20, 32),
        'Tower Dew Point Temp [deg C]': (-25, 32),
        'Tower RH [%]': (0, 100),
        'Total Cloud Cover [%]': (0, 100),
        'Peak Wind Speed @ 6ft [m/s]': (0, 30),
        'Avg Wind Direction @ 6ft [deg from N]': (0, 360),
        'Station Pressure [mBar]': (760, 850),
        'Precipitation (Accumulated) [mm]': (0, 30),
        'Snow Depth [cm]': (0, 30),
        'Moisture': (0, 1),
        'Albedo (CMP11)': (0, 2),
    }

    scaled = data.copy()
    for name in scaled.columns:
        if name not in bounds:
            continue
        low, high = bounds[name]
        scaled[name] = (scaled[name] - low + eps) / abs(high - low + eps)
    return scaled

In [None]:
def process_train(base):
    """Build the training frame from ``<base>/train/train.csv`` and the per-day image folders.

    The CSV is normalised with ``normalize_columns`` and then handled one date at a
    time. Within a date, every run of 10 consecutive rows starting at row 1 is a
    candidate window. A window is kept only when its normalised
    'Total Cloud Cover [%]' is strictly between 0 and 1 in all ten rows and the
    file ``<base>/train/<MMDD>/<MMDD><time>.jpg`` exists, where ``<time>`` is the
    converted 'MST' value of the window's last row. The image path is stored
    under 'image'.

    Returns a DataFrame with one row per kept window.
    """
    frame = normalize_columns(pd.read_csv(os.path.join(base, "train", "train.csv")))
    unique_dates = frame['DATE (MM/DD)'].unique()
    records = []

    bar = pyprind.ProgBar(len(unique_dates), bar_char='█')
    for raw_date in unique_dates:
        # Work on an independent copy of the day's rows so the conversions below
        # do not touch the full frame.
        day_rows = frame.loc[frame['DATE (MM/DD)'] == raw_date].copy()
        day_rows['DATE (MM/DD)'] = day_rows['DATE (MM/DD)'].map(get_date)
        day_rows['MST'] = day_rows['MST'].map(get_time)

        day = get_date(raw_date)

        for start in range(1, len(day_rows) - 9):
            window = day_rows.iloc[start:start + 10]
            cloud = window['Total Cloud Cover [%]']
            if not ((cloud > 0) & (cloud < 1)).all():
                continue

            # make_dict_train expects one sequence per column, hence the transpose.
            entry = make_dict_train(window.to_numpy().transpose())
            image_file = os.path.join(base, "train", day, "{0}{1}.jpg".format(day, entry['Time'][-1]))
            if not os.path.isfile(image_file):
                continue

            entry['image'] = image_file
            records.append(entry)

        bar.update()

    return pd.DataFrame(records)


In [None]:
def process_test(base):
    """Build the test frame from the scenario folders under ``<base>/test``.

    Every entry of ``<base>/test`` (in sorted order) is treated as a scenario
    folder containing ``weather_data.csv``. The CSV is normalised with
    ``normalize_columns``; every run of 10 consecutive rows starting at row 1 is
    a candidate window and is kept only when ``<scenario>/<time>.jpg`` exists,
    where ``<time>`` is the integer time of the window's last row. Kept entries
    carry the scenario name under 'Scenario' and the image path under 'image'.

    Returns a DataFrame with one row per kept window.
    """
    test_root = os.path.join(base, "test")
    scenarios = sorted(os.listdir(test_root))
    records = []

    bar = pyprind.ProgBar(len(scenarios), bar_char='█')
    for scenario in scenarios:
        scenario_dir = os.path.join(test_root, scenario)
        weather = normalize_columns(pd.read_csv(os.path.join(scenario_dir, "weather_data.csv")))

        for start in range(1, len(weather) - 9):
            window = weather.iloc[start:start + 10]
            # make_dict_test expects one sequence per column, hence the transpose.
            entry = make_dict_test(window.to_numpy().transpose())
            entry['Scenario'] = scenario
            image_file = os.path.join(scenario_dir, "{0}.jpg".format(entry["Time"][-1]))
            if os.path.isfile(image_file):
                entry['image'] = image_file
                records.append(entry)
        bar.update()

    return pd.DataFrame(records)

In [None]:
# Build the training frame: 10-row weather windows from train.csv, each paired with its .jpg image.
train = process_train(base="/content/dataset/")

0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:04:39


In [None]:
# Build the test frame from the scenario folders under /content/dataset/test.
test = process_test(base="/content/dataset/")
# Each 'Time' cell holds the 10 times of a window; keep only the last one so the column is a scalar.
test['Time'] = test['Time'].apply(lambda x: x[9])
# Sort by time within each scenario; CreateDataset uses the trailing rows of every scenario.
test = test.sort_values(by = ['Scenario', 'Time'])

0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:41


In [None]:
# Persist both frames; CreateDataset loads them back as processed_<dataset>.pkl from its base folder.
train.to_pickle("/content/dataset/processed_train.pkl")
test.to_pickle("/content/dataset/processed_test.pkl")

In [None]:
# !cd "/content/dataset" && zip -r "/content/drive/MyDrive/Events/Hackathons/shell.ai - noVowels/dataset/processed_dataset_new.zip" *


# clear_output()

In [None]:
# Reload the processed frames from disk (replaces the in-memory train/test built above).
train = pd.read_pickle("/content/dataset/processed_train.pkl")
test = pd.read_pickle("/content/dataset/processed_test.pkl")

----

In [None]:
# train = pd.read_csv("/content/dataset/train/train.csv")

---

In [None]:
class CreateDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over the pickled frame ``<base>/processed_<dataset>.pkl``.

    Each row of the pickled frame describes one 10-step weather window plus the
    path of one image (last column).

    * ``dataset='train'``: rows are grouped by 'Date'. A sample consists of
      ``past`` consecutive rows as input and the following ``future`` rows as
      target. ``__getitem__`` returns ``(input_image, input_weather,
      output_actual)`` with shapes ``(past, 3, image, image)``,
      ``(past, 10, 15)`` and ``(future, 1)``.
    * ``dataset='test'``: rows are grouped by 'Scenario'; the last ``past`` rows
      of every scenario form one sample. ``__getitem__`` returns
      ``(input_image, input_weather)``.

    Rows are used in the order they are stored; no check is made that
    neighbouring rows are contiguous in time.

    Args:
        base: folder containing ``processed_<dataset>.pkl``.
        past: number of rows used as model input.
        future: number of rows used as target (train only).
        image: side length, in pixels, that images are resized to.
        dataset: ``'train'`` or ``'test'``.
    """

    # Column holding the prediction target. It is looked up by label because train
    # rows carry a leading 'Date' field, so the target's position differs between
    # train rows and test rows.
    TARGET_COLUMN = "Total Cloud Cover"

    def __init__(self, base, past=8, future=12, image=256, dataset='train'):
        self.base = base
        self.past = past
        self.future = future
        self.image = image
        self.dataset = dataset
        # NOTE(review): cv2.imread yields BGR channel order, while these look like the
        # usual RGB ImageNet statistics — confirm the intended channel order.
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        self.data = pd.read_pickle(os.path.join(base, f"processed_{dataset}.pkl"))
        self.files = []

        self.init_dataset()

    def image_transform(self, image):
        """Resize an HxWx3 image array, scale it to [0, 1] and return a normalised (1, 3, S, S) tensor.

        The same normalisation is applied to every split so that training and
        inference inputs share one distribution.
        """
        resized = cv2.resize(image, (self.image, self.image), interpolation=cv2.INTER_AREA)
        scaled = resized / 255.
        # Channels-last -> channels-first.
        tensor = torch.from_numpy(scaled.transpose((2, 0, 1)))
        tensor = self.normalize(tensor)
        return tensor.unsqueeze(0)

    def data_transform(self, data, mode):
        """Convert one row of the processed frame into a tensor.

        Args:
            data: a pandas Series holding one row of ``self.data``.
            mode: ``'input'`` returns the 15 weather series as a ``(1, 10, 15)``
                float64 tensor; ``'output'`` returns the last value of the
                target column as a ``(1, 1)`` float64 tensor.

        Raises:
            ValueError: for an unknown ``mode``, or an unknown ``self.dataset``
                when ``mode='input'``.
        """
        if mode == 'input':
            # Explicit positional slices: skip the leading bookkeeping fields and the
            # trailing non-weather fields of the row.
            if self.dataset == 'train':
                features = data.iloc[2:-1]    # drop Date, Time ... image
            elif self.dataset == 'test':
                features = data.iloc[1:-2]    # drop Time ... Scenario, image
            else:
                raise ValueError(f"Unknown dataset: {self.dataset!r}")
            flat = np.concatenate(features.tolist()).astype('float64')
            # (15 features, 10 steps) -> (10 steps, 15 features)
            weather = torch.from_numpy(flat.reshape(15, 10).transpose())
            return weather.unsqueeze(0)
        if mode == 'output':
            target = np.array([data[self.TARGET_COLUMN][-1]]).astype('float64')
            return torch.from_numpy(target).unsqueeze(0)
        raise ValueError(f"Unknown mode: {mode!r}")

    def _image_paths(self, window):
        """Return the image paths (last column) of the rows in ``window``."""
        return window.iloc[:, -1].tolist()

    def _stack_rows(self, window, mode, empty_shape):
        """Transform every row of ``window`` with ``data_transform`` and concatenate along dim 0."""
        # Seeding with an empty tensor keeps the result well-defined for empty windows.
        pieces = [torch.empty(empty_shape)]
        pieces.extend(self.data_transform(window.iloc[i], mode=mode) for i in range(len(window)))
        return torch.cat(pieces, dim=0)

    def _load_images(self, paths):
        """Read and transform every image in ``paths``; returns a (len(paths), 3, S, S) tensor.

        Raises:
            FileNotFoundError: if an image cannot be read.
        """
        frames = [torch.empty((0, 3, self.image, self.image))]
        for path in paths:
            image = cv2.imread(path, cv2.IMREAD_COLOR)
            if image is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise FileNotFoundError(f"Could not read image: {path}")
            frames.append(self.image_transform(image))
        return torch.cat(frames, dim=0)

    def init_dataset(self):
        """Populate ``self.files`` with one dict per sample (image paths plus pre-built tensors)."""
        if self.dataset == 'train':
            dates = self.data['Date'].unique()
            bar = pyprind.ProgBar(len(dates), bar_char='█')
            for date in dates:
                day = self.data.loc[self.data['Date'] == date]
                # NOTE(review): the first window covers rows 1..past, so row 0 of a day is
                # never part of an input window — confirm this is intended.
                for item in range(self.past, len(day) - self.future, 1):
                    past = day.iloc[item - self.past + 1:item + 1]
                    future = day.iloc[item + 1:item + self.future + 1]
                    self.files.append({
                        'input_images': self._image_paths(past),
                        'input_weather': self._stack_rows(past, 'input', (0, 10, 15)),
                        'output_actual': self._stack_rows(future, 'output', (0, 1)),
                    })
                bar.update()
        elif self.dataset == 'test':
            scenarios = self.data['Scenario'].unique()
            bar = pyprind.ProgBar(len(scenarios), bar_char='█')
            for scenario in scenarios:
                rows = self.data.loc[self.data['Scenario'] == scenario]
                past = rows.iloc[-self.past:]
                self.files.append({
                    'input_images': self._image_paths(past),
                    'input_weather': self._stack_rows(past, 'input', (0, 10, 15)),
                })
                bar.update()

    def __getitem__(self, index):
        """Load the images of sample ``index`` and return them with its pre-built tensors."""
        data = self.files[index]

        if self.dataset == 'train':
            input_image = self._load_images(data['input_images'])
            return input_image, data['input_weather'], data['output_actual']
        elif self.dataset == 'test':
            input_image = self._load_images(data['input_images'])
            return input_image, data['input_weather']

    def __len__(self):
        """Number of samples built by ``init_dataset``."""
        return len(self.files)

In [None]:
# Folder that holds processed_train.pkl / processed_test.pkl.
base = "/content/dataset"

In [None]:
# Build both splits: `past`/`future` are window lengths in rows, `image` is the side length images are resized to.
trainset = CreateDataset(base, past=8, future=12, image=256, dataset='train')
testset = CreateDataset(base, past=8, future=12, image=256, dataset='test')

In [None]:
# Number of samples in each split.
print(f"Train: {len(trainset)}")
print(f"Test: {len(testset)}")