# Data Processing
Here we do additional processing for the data to create the graph structure for Florida.  

In [None]:
# magic command to import a bunch of stuff
# NOTE(review): %pylab star-imports numpy and matplotlib names into the global
# namespace (the warning printed under this cell reports clobbered variables).
# The cells visible here only use the explicit np / pd / torch / plt names.
%pylab

# make plots show up in notebook
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from google.colab import drive
# Mount Google Drive; force_remount=True remounts even if it is already mounted.
drive.mount('/content/gdrive', force_remount=True)

# Project folder on the mounted drive. This is a relative path, so it resolves
# against the current working directory -- presumably /content on Colab; confirm.
PARENT_DIR = 'gdrive/My Drive/Stat 175 Final Project/'

Using matplotlib backend: agg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


Mounted at /content/gdrive


In [None]:
# Install the PyTorch Geometric stack from wheels that match the running torch
# version, instead of building from source. Background:
# https://stackoverflow.com/questions/67285115/building-wheels-for-torch-sparse-in-colab-takes-forever
# torch is imported here because the pip lines interpolate {torch.__version__}.
import torch
# Remove any preinstalled copies first.
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# NOTE(review): torch_geometric is installed from the repository head and none
# of these installs is version-pinned, so results may change between runs.
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
!pip install tensorboardX
# NOTE(review): ngrok is downloaded and unpacked but not used by any cell visible here.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

In [None]:
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

import time
from datetime import datetime

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader, Data


import torch_geometric.transforms as T

from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# load entire dataframe
# One row per (lat, lon, date) with daily tmax / tmin / prec / wind, the
# location's elev and the claim amount (see the preview below).
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index
# that was written into the CSV; no cell visible here reads it.
df = pd.read_csv(f'{PARENT_DIR}climate_plus_claims.csv')
df.head()

Unnamed: 0,lat,lon,date,tmax,tmin,prec,wind,elev,amount
0,25.2,278.9,1978-01-01,26.77,14.13,0.025,1.0845,0.0,0.0
1,25.2,278.9,1978-01-02,25.37,17.89,2.725,1.575,0.0,0.0
2,25.2,278.9,1978-01-03,23.63,11.25,10.4,6.0765,0.0,0.0
3,25.2,278.9,1978-01-04,18.73,7.26,2.475,6.5721,0.0,0.0
4,25.2,278.9,1978-01-05,25.11,14.4,0.0,5.661,0.0,0.0


In [None]:
# Keep only rows with positive elevation (presumably this drops open-water
# grid cells, which show elev == 0 in the preview -- confirm).
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
df = df[df['elev'] > 0].copy()

In [None]:
# Parse the date strings into timestamps. With errors='coerce', any value that
# cannot be parsed becomes NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors = 'coerce')

# Store the calendar components of each timestamp as separate columns.
for part in ('year', 'month', 'day'):
  df[part] = getattr(df['date'].dt, part)

In [None]:
# set to index so we can slice by date
# Guarded and rebound (rather than inplace=True) so that re-running this cell
# does not raise KeyError once 'date' has already been moved into the index.
if 'date' in df.columns:
  df = df.set_index('date')

# get range of times
start_date = '1978-01-01'
end_date = '2011-12-31'
freq = '30D'

# Window boundaries, one every 30 days starting at start_date.
# NOTE(review): these are fixed 30-day steps, not calendar months. Two windows
# can start in the same calendar month (e.g. 1978-05-01 and 1978-05-31 in the
# printed output), and the last boundary is 2011-12-04, so days after that date
# fall outside every [date_range[i], date_range[i+1]] window.
date_range = pd.date_range(start=start_date, end=end_date, freq=freq)

print(date_range)

DatetimeIndex(['1978-01-01', '1978-01-31', '1978-03-02', '1978-04-01',
               '1978-05-01', '1978-05-31', '1978-06-30', '1978-07-30',
               '1978-08-29', '1978-09-28',
               ...
               '2011-03-09', '2011-04-08', '2011-05-08', '2011-06-07',
               '2011-07-07', '2011-08-06', '2011-09-05', '2011-10-05',
               '2011-11-04', '2011-12-04'],
              dtype='datetime64[ns]', length=414, freq='30D')


In [None]:
def get_matrix(df, start30, end30):
  """Build one feature row per location for the window start30..end30.

  Parameters
  ----------
  df : pandas.DataFrame
      Daily records indexed by date, with columns 'lat', 'lon', 'tmax',
      'tmin', 'prec', 'wind', 'elev' and 'amount'.
  start30, end30 : datetime-like
      Window bounds. Both are formatted with ``strftime`` and used as label
      slice bounds, so rows dated exactly ``end30`` are included. The 'year'
      and 'month' columns of the result are taken from ``start30``.

  Returns
  -------
  pandas.DataFrame
      One row per distinct (lat, lon) in the window, in order of first
      appearance. Columns are 'lat', 'lon', 'year', 'month', 'amount_sum',
      'elev', then 'x_1' .. 'x_124': the window's (tmax, tmin, prec, wind)
      values flattened day by day and zero-padded on the right.

  Raises
  ------
  ValueError
      If a location has more than 31 rows in the window, so its flattened
      features do not fit in the 124 feature columns.

  NOTE(review): because the slice includes both end points, calling this on
  consecutive boundaries (b[i], b[i+1]) and (b[i+1], b[i+2]) counts the rows
  dated b[i+1] in both windows -- confirm that this overlap is intended.
  """
  N = 31*4  # at most 31 days x 4 weather variables
  feature_cols = ['tmax', 'tmin', 'prec', 'wind']
  meta_cols = ['lat', 'lon', 'year', 'month', 'amount_sum', 'elev']

  # select time window (string bounds, end point included)
  temp = df.loc[start30.strftime('%Y-%m-%d'):end30.strftime('%Y-%m-%d'), :]

  rows = []
  # One pass over the window; sort=False keeps locations in order of first
  # appearance, matching drop_duplicates() on (lat, lon).
  for (lat, lon), temp2 in temp.groupby(['lat', 'lon'], sort=False):
    amount_sum = temp2['amount'].sum()
    # First row of the group: always present, unlike position 1, which does
    # not exist when a location has a single record in the window.
    elev = temp2['elev'].iloc[0]

    # flatten the daily values: day 1's four variables, then day 2's, ...
    X = temp2[feature_cols].to_numpy().reshape(-1)
    if len(X) > N:
      raise ValueError(
          f'location ({lat}, {lon}) has {len(temp2)} rows in the window; '
          f'at most {N // len(feature_cols)} are supported')
    X_long = np.pad(X, (0, N - len(X)), 'constant', constant_values=0)

    rows.append(np.concatenate(
        ([lat, lon, start30.year, start30.month, amount_sum, elev], X_long)))

  # Stack once at the end instead of growing the matrix inside the loop.
  matrix = np.vstack(rows) if rows else np.zeros((0, N + len(meta_cols)))

  # reformat to dataframe
  columns = meta_cols + [f'x_{i+1}' for i in range(N)]
  df_new = pd.DataFrame(matrix, columns=columns)
  return df_new


In [None]:
# load matrix
# NOTE(review): presumably a square adjacency matrix with one row per location;
# it is read without index_col, so confirm the file has no stored index column.
amat = pd.read_csv(f'{PARENT_DIR}A_mat.csv')

def get_torch(monthly_df):
  """Wrap one window's feature table in a torch_geometric ``Data`` object.

  monthly_df is a table shaped like the output of get_matrix. The result has
    x          -- float tensor of every column left after dropping 'lat',
                  'lon', 'year', 'month' and 'amount_sum'
    y          -- float tensor of the 'amount_sum' column
    edge_index -- long tensor of shape (2, E) listing every pair (i, j), i < j,
                  over the rows of the module-level ``amat``

  NOTE(review): reset_index() is called without drop=True, so the old index
  becomes an extra 'index' column and ends up in x -- confirm that the row
  number is meant to be a node feature.
  NOTE(review): only the size of ``amat`` is used; its entries are never read,
  so every pair of nodes is connected -- confirm that a complete graph is
  intended rather than the edges encoded in the matrix.
  """
  non_feature_cols = ['lat', 'lon', 'year', 'month', 'amount_sum']

  # node features and targets
  feature_frame = monthly_df.reset_index().drop(non_feature_cols, axis = 1)
  targets = torch.tensor(monthly_df.amount_sum.values, dtype = torch.float)
  features = torch.tensor(feature_frame.values, dtype = torch.float)

  # strictly-upper-triangular index pairs (k=1 excludes the diagonal)
  n_nodes = amat.shape[0]
  src, dst = np.triu_indices(n = n_nodes, k=1)
  edges = torch.tensor(np.vstack((src, dst)), dtype = torch.long)

  return Data(x = features, y = targets, edge_index = edges)

In [None]:
# Smoke test: build the feature table and the graph object for the first
# window only, then display the resulting object's summary.
t = get_matrix(df, date_range[0], date_range[1])
t2 = get_torch(t)
t2

Data(x=[1029, 126], edge_index=[2, 528906], y=[1029])

In [None]:
# Distinct (lat, lon) pairs over the whole table, in order of first appearance.
# The final reset_index() has no drop=True, so each pair's original row
# position is kept as an extra 'index' column.
t3 = df.reset_index()[['lat', 'lon']].drop_duplicates().reset_index()

In [None]:
# Save the location table. to_csv also writes the frame's index by default, so
# the file gets an unnamed leading column in addition to 'index', 'lat', 'lon'.
t3.to_csv(f'{PARENT_DIR}locations.csv')

# Reformat and save in dataloader object

In [None]:
# Build and save one graph object per pair of consecutive boundaries in
# date_range: window i runs from date_range[i] to date_range[i+1].
from tqdm import tqdm
objs_30day = []
for i in tqdm(range(len(date_range)-1)): 
  # per-location feature table for this window
  df_30day = get_matrix(df, date_range[i], date_range[i+1])
  # wrap it in a torch_geometric Data object
  df_30day_torch = get_torch(df_30day)
  # Keeping the objects in memory is disabled, so objs_30day stays empty and
  # the files written below are the only result of this loop.
  #objs_30day.append(df_30day_torch)

  # save directly to disk, one file per window
  torch.save(df_30day_torch, f'{PARENT_DIR}data/month_{i}.pth')

100%|██████████| 413/413 [13:55<00:00,  2.02s/it]


In [None]:
# NOTE(review): objs_30day is initialised to [] and the only append to it is
# commented out, so as the notebook stands this loader wraps an empty list.
# The per-window graphs exist only as data/month_{i}.pth files on disk.
loader = DataLoader(objs_30day, batch_size = 256, shuffle=True)



In [None]:
# Save the first window's graph as a standalone sample file.
# As the notebook stands, objs_30day is never filled (the loop that builds the
# graphs writes them straight to disk), so objs_30day[0] would raise
# IndexError. Rebuild the first window's graph in that case.
first_obj = objs_30day[0] if objs_30day else get_torch(get_matrix(df, date_range[0], date_range[1]))
torch.save(first_obj, f'{PARENT_DIR}data/obj_0.pth')

In [None]:
# Read the saved object back and display it to check the round trip.
# NOTE(review): this unpickles a full Python object; only load files this
# notebook wrote itself.
torch.load(f'{PARENT_DIR}data/obj_0.pth')

Data(x=[1029, 126], edge_index=[2, 528906], y=[1029])

In [None]:
# Calendar-month variant of the feature table: one row per (location, year,
# month). Row layout: lat, lon, year, month, amount_sum, elev, followed by the
# month's (tmax, tmin, prec, wind) values flattened day by day and zero-padded
# on the right to N entries.
N = 31*4  # at most 31 days x 4 weather variables

# every (year, month) pair present in the table, in order of first appearance
year_months = df[['year', 'month']].drop_duplicates().values

rows = []
# Group the table once per location and once per month within it, instead of
# re-filtering the whole table for every (location, year, month) combination.
# sort=False keeps locations in order of first appearance.
for (lat, lon), loc_df in df.groupby(['lat', 'lon'], sort=False):
    by_month = {key: grp for key, grp in loc_df.groupby(['year', 'month'], sort=False)}
    for year, month in year_months:
        temp = by_month.get((year, month))
        if temp is None:
            # no records for this location in this month; nothing to emit
            continue

        # sum the 'amount' column for this lat, lon, year, and month
        amount_sum = temp['amount'].sum()
        # First row of the group: always present, unlike position 1, which
        # does not exist when the group has a single record.
        elev = temp['elev'].iloc[0]
        # flatten the daily values of the four variables into one row
        X = temp[['tmax', 'tmin', 'prec', 'wind']].values
        X = X.reshape(1, -1)
        if len(X[0]) < N:
            padding_size = N - len(X[0])
            X_long = np.pad(X, ((0, 0), (0, padding_size)), 'constant', constant_values=0)
        else:
            X_long = X
        new_row = np.concatenate(([lat, lon, year, month, amount_sum, elev], X_long[0]))
        rows.append(new_row)

# Stack once at the end instead of growing the matrix inside the loop.
matrix = np.vstack(rows) if rows else np.zeros((0, N + 6))

Unnamed: 0,lat,lon,date,tmax,tmin,prec,wind,elev,amount,year,month,day
37254,25.3,279.2,1978-01-01,27.50,13.51,0.225,0.9854,2.0,0.0,1978,1,1
37255,25.3,279.2,1978-01-02,26.06,18.12,3.475,1.3895,2.0,0.0,1978,1,2
37256,25.3,279.2,1978-01-03,22.92,11.99,10.350,5.7326,2.0,0.0,1978,1,3
37257,25.3,279.2,1978-01-04,20.91,9.33,1.525,6.3827,2.0,0.0,1978,1,4
37258,25.3,279.2,1978-01-05,25.54,14.65,0.000,5.6024,2.0,0.0,1978,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
12865043,30.9,278.4,2011-12-27,19.86,10.72,1.000,3.7435,6.0,0.0,2011,12,27
12865044,30.9,278.4,2011-12-28,17.68,4.68,2.450,6.2775,6.0,0.0,2011,12,28
12865045,30.9,278.4,2011-12-29,16.36,3.44,0.000,1.6753,6.0,0.0,2011,12,29
12865046,30.9,278.4,2011-12-30,19.13,3.71,0.000,2.0369,6.0,0.0,2011,12,30
