# Import data and save to csv

## Jason Kniss
### Feb 14 2024

This notebook saves computation time during later analysis: it loads and organizes the raw data once and writes the pertinent columns to a CSV file, so the data does not have to be reloaded and reorganized on every run.

### Import Libraries

In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta
import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib.dates as mdates
# from IPython.display import HTML
from pandasgui import show

In [2]:
import warnings
# Suppress every FutureWarning raised from here on (the filter is global).
# NOTE(review): this also hides deprecation notices from all libraries — confirm that is intended.
warnings.filterwarnings('ignore', category=FutureWarning)

## Import and organize data

### Set path to data folders

Separate folders are used because each data source is processed separately, which simplifies identifying the files and concatenating them.

In [3]:
# Root folder holding one sub-folder per data source; change it here if the
# data is moved.
DATA_ROOT = 'data/eureka-data'

towermet_path = f'{DATA_ROOT}/tower-met'  # tower meteorological files
towerrad_path = f'{DATA_ROOT}/tower-rad'  # tower radiation files
bsrnrad_path = f'{DATA_ROOT}/bsrn-rad'    # BSRN radiation files

### Create lists of dataframes

Loops will generate a list of dataframes from text files added to specified folders.

In [4]:
# One accumulator list per data source, in this order: tower meteorological,
# tower radiation, BSRN radiation. The loading cells append one DataFrame
# per input file.
towermet_dfs, towerrad_dfs, bsrnrad_dfs = [], [], []

### Tower meteorological data

In [5]:
# Load every tab-delimited .txt file in the tower-met folder into its own
# DataFrame and collect the frames in `towermet_dfs`.
towermet_dfs.clear()  # start empty so re-running this cell cannot load files twice

# sorted(): os.listdir returns names in arbitrary order
for filename in sorted(os.listdir(towermet_path)):
    if not filename.endswith(".txt"):  # Only consider .txt files
        continue

    file_path = os.path.join(towermet_path, filename)
    with open(file_path, "r") as f:
        # Drop blank lines (e.g. a trailing newline); they would otherwise
        # become rows of '' and make the float conversion below fail.
        lines = [line for line in f if line.strip()]

    if len(lines) < 2:
        # At most a header line: there are no rows to convert.
        print(f"Skipping {filename}: no data rows")
        continue

    # Column names are split on any whitespace, data rows on tabs.
    header = lines[0].strip().split()
    data_rows = [line.strip().split("\t") for line in lines[1:]]

    # TODO (original note): rename the dataframe when more data is imported
    df = pd.DataFrame(data_rows, columns=header).astype('float')

    # Fix midnight HourMin: 2400 on day N is re-labelled as 0000 on day N + 1
    is_midnight = df['HourMin'] == 2400
    df.loc[is_midnight, 'JulianDay'] += 1
    df.loc[is_midnight, 'HourMin'] = 0

    # -999 and -9999 (presumably the files' missing-value markers) become
    # NaN in every column.
    df = df.replace([-999, -9999], np.nan)

    towermet_dfs.append(df)

### Tower radiation data

In [6]:
# Load every tab-delimited .txt file in the tower-rad folder into its own
# DataFrame and collect the frames in `towerrad_dfs`.
towerrad_dfs.clear()  # start empty so re-running this cell cannot load files twice

# sorted(): os.listdir returns names in arbitrary order
for filename in sorted(os.listdir(towerrad_path)):
    if not filename.endswith(".txt"):  # Only consider .txt files
        continue

    file_path = os.path.join(towerrad_path, filename)
    with open(file_path, "r") as f:
        # Drop blank lines (e.g. a trailing newline); they would otherwise
        # become rows of '' and make the float conversion below fail.
        lines = [line for line in f if line.strip()]

    if len(lines) < 2:
        # At most a header line: there are no rows to convert.
        print(f"Skipping {filename}: no data rows")
        continue

    # Column names are split on any whitespace, data rows on tabs.
    header = lines[0].strip().split()
    data_rows = [line.strip().split("\t") for line in lines[1:]]

    # TODO (original note): rename the dataframe when more data is imported
    df = pd.DataFrame(data_rows, columns=header).astype('float')

    # Fix midnight HourMin: 2400 on day N is re-labelled as 0000 on day N + 1
    # (the rows are kept, not filtered out).
    is_midnight = df['HourMin'] == 2400
    df.loc[is_midnight, 'JulianDay'] += 1
    df.loc[is_midnight, 'HourMin'] = 0

    # -999 and -9999 (presumably the files' missing-value markers) become
    # NaN in every column.
    df = df.replace([-999, -9999], np.nan)

    towerrad_dfs.append(df)

### BSRN radiation data
This data is redundant with the tower radiation data and serves only as a comparison against the tower measurements.

In [7]:
# Load every tab-delimited .txt file in the bsrn-rad folder into its own
# DataFrame and collect the frames in `bsrnrad_dfs`.
bsrnrad_dfs.clear()  # start empty so re-running this cell cannot load files twice

# sorted(): os.listdir returns names in arbitrary order
for filename in sorted(os.listdir(bsrnrad_path)):
    if not filename.endswith(".txt"):  # Only consider .txt files
        continue

    file_path = os.path.join(bsrnrad_path, filename)
    with open(file_path, "r") as f:
        # Drop blank lines (e.g. a trailing newline); they would otherwise
        # become rows of '' and make the float conversion below fail.
        lines = [line for line in f if line.strip()]

    if len(lines) < 2:
        # At most a header line: there are no rows to convert.
        print(f"Skipping {filename}: no data rows")
        continue

    # Column names are split on any whitespace, data rows on tabs.
    header = lines[0].strip().split()
    data_rows = [line.strip().split("\t") for line in lines[1:]]

    # TODO (original note): rename the dataframe when more data is imported
    # NOTE(review): the blanket float cast also applies to the QualityControl
    # column. float64 holds integers exactly only up to 2**53 (~9.0e15), so
    # larger flag values may be rounded — confirm whether the flags must stay exact.
    df = pd.DataFrame(data_rows, columns=header).astype('float')

    # Fix midnight HourMin: 2400 on day N is re-labelled as 0000 on day N + 1
    # (the rows are kept, not filtered out).
    is_midnight = df['HourMin'] == 2400
    df.loc[is_midnight, 'JulianDay'] += 1
    df.loc[is_midnight, 'HourMin'] = 0

    # -999 and -9999 (presumably the files' missing-value markers) become
    # NaN in every column.
    df = df.replace([-999, -9999], np.nan)

    bsrnrad_dfs.append(df)

## Sort and concatenate lists of dataframes

Done separately to simplify concatenation in the next step
### Create datetime columns
### Tower meteorological dataframes

In [8]:
# Build a 'Datetime' column from Year + JulianDay + HourMin for every tower
# meteorological DataFrame. The frames in the list are modified in place.
for df in towermet_dfs:
    # Fail loudly (ValueError) instead of producing NaT timestamps that would
    # later end up in the index.
    if df[['Year', 'JulianDay', 'HourMin']].isna().to_numpy().any():
        raise ValueError("Year, JulianDay and HourMin must not contain NaN")

    # January 1st of each row's own year, so a file that crosses a year
    # boundary is dated correctly rather than by the year of its first row.
    year_start = pd.to_datetime(df['Year'].astype('int64').astype(str), format='%Y')

    # Julian day 1 is January 1st, hence the "- 1"; only whole days count.
    day_offset = pd.to_timedelta(np.floor(df['JulianDay'] - 1), unit='D')

    # HourMin is HHMM stored as a number, e.g. 1345 -> 13 h 45 min;
    # seconds are always zero.
    hour_offset = pd.to_timedelta(df['HourMin'] // 100, unit='h')
    minute_offset = pd.to_timedelta(np.floor(df['HourMin'] % 100), unit='min')

    df['Datetime'] = year_start + day_offset + hour_offset + minute_offset
    


### Tower radiation dataframes

In [9]:
# Build a 'Datetime' column from Year + JulianDay + HourMin for every tower
# radiation DataFrame. The frames in the list are modified in place.
for df in towerrad_dfs:
    # Fail loudly (ValueError) instead of producing NaT timestamps that would
    # later end up in the index.
    if df[['Year', 'JulianDay', 'HourMin']].isna().to_numpy().any():
        raise ValueError("Year, JulianDay and HourMin must not contain NaN")

    # January 1st of each row's own year, so a file that crosses a year
    # boundary is dated correctly rather than by the year of its first row.
    year_start = pd.to_datetime(df['Year'].astype('int64').astype(str), format='%Y')

    # Julian day 1 is January 1st, hence the "- 1"; only whole days count.
    day_offset = pd.to_timedelta(np.floor(df['JulianDay'] - 1), unit='D')

    # HourMin is HHMM stored as a number, e.g. 1345 -> 13 h 45 min;
    # seconds are always zero.
    hour_offset = pd.to_timedelta(df['HourMin'] // 100, unit='h')
    minute_offset = pd.to_timedelta(np.floor(df['HourMin'] % 100), unit='min')

    df['Datetime'] = year_start + day_offset + hour_offset + minute_offset

### BSRN radiation dataframes

In [10]:
# Build a 'Datetime' column from Year + JulianDay + HourMin for every BSRN
# radiation DataFrame. The frames in the list are modified in place.
for df in bsrnrad_dfs:
    # Fail loudly (ValueError) instead of producing NaT timestamps that would
    # later end up in the index.
    if df[['Year', 'JulianDay', 'HourMin']].isna().to_numpy().any():
        raise ValueError("Year, JulianDay and HourMin must not contain NaN")

    # January 1st of each row's own year, so a file that crosses a year
    # boundary is dated correctly rather than by the year of its first row.
    year_start = pd.to_datetime(df['Year'].astype('int64').astype(str), format='%Y')

    # Julian day 1 is January 1st, hence the "- 1"; only whole days count.
    day_offset = pd.to_timedelta(np.floor(df['JulianDay'] - 1), unit='D')

    # HourMin is HHMM stored as a number, e.g. 1345 -> 13 h 45 min;
    # seconds are always zero.
    hour_offset = pd.to_timedelta(df['HourMin'] // 100, unit='h')
    minute_offset = pd.to_timedelta(np.floor(df['HourMin'] % 100), unit='min')

    df['Datetime'] = year_start + day_offset + hour_offset + minute_offset

### Concatenate lists of dataframes into dataframes and reindex by datetime

In [11]:
# Stack the per-file frames of each source into a single frame per source,
# replacing the per-file row labels with a fresh 0..n-1 index.
towermet_df, towerrad_df, bsrnrad_df = (
    pd.concat(frames, ignore_index=True)
    for frames in (towermet_dfs, towerrad_dfs, bsrnrad_dfs)
)

In [12]:
# Put each source in chronological order
towermet_df = towermet_df.sort_values(by='Datetime')
towerrad_df = towerrad_df.sort_values(by='Datetime')
bsrnrad_df = bsrnrad_df.sort_values(by='Datetime')

In [13]:
# Set 'Datetime' column as the index of each frame.
# The guard keeps the cell safe to re-run: once 'Datetime' is the index it is
# no longer a column, and a second set_index would raise KeyError.
for frame in (towermet_df, towerrad_df, bsrnrad_df):
    if frame.index.name != 'Datetime':
        frame.set_index('Datetime', inplace=True)

### Join dataframes into one dataframe with pertinent columns

In [14]:
# Tower meteorological columns to keep
# ('6MRH[%]' and '6MVTair[degC]' are currently excluded).
towermet_df_select = towermet_df[
    ['Pressure[mbar]', '10MRH[%]', '2MRH[%]', '10MVTair[degC]', '2MVTair[degC]']
]

# Tower radiation columns, prefixed with 'twr' so they stay distinguishable
# from the BSRN columns of the same name after the join.
towerrad_df_select = towerrad_df[
    ['LWTotalDownwelling[W/m^2]', 'QualityControl']
].rename(columns={
    'LWTotalDownwelling[W/m^2]': 'twrLWTotalDownwelling[W/m^2]',
    'QualityControl': 'twrQualityControl',
})

# BSRN radiation columns, prefixed with 'bsrn' for the same reason.
bsrnrad_df_select = bsrnrad_df[
    ['LWTotalDownwelling[W/m^2]', 'QualityControl']
].rename(columns={
    'LWTotalDownwelling[W/m^2]': 'bsrnLWTotalDownwelling[W/m^2]',
    'QualityControl': 'bsrnQualityControl',
})

# Outer-join the three sources on their index so that a timestamp present in
# any one source is kept.
data_df = (
    towermet_df_select
    .join(towerrad_df_select, how='outer')
    .join(bsrnrad_df_select, how='outer')
)

In [15]:
# Spot-check the first rows of the joined frame. NaN marks a timestamp for which
# a source has no row (outer join) or a value that was -999 / -9999 in the raw file.
data_df.head()

Unnamed: 0_level_0,Pressure[mbar],10MRH[%],2MRH[%],10MVTair[degC],2MVTair[degC],twrLWTotalDownwelling[W/m^2],twrQualityControl,bsrnLWTotalDownwelling[W/m^2],bsrnQualityControl
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-01 00:00:00,1009.8,64.315,61.374,-35.07,-35.68,182.15,90000000000.0,,
2009-01-01 00:01:00,1009.8,62.975,60.589,-34.944,-35.755,182.109,90000000000.0,150.1,9091991000000000.0
2009-01-01 00:02:00,1009.8,63.003,59.687,-34.976,-35.828,182.317,90000000000.0,150.48,9091991000000000.0
2009-01-01 00:03:00,1009.7,63.52,59.662,-34.968,-35.864,182.29,90000000000.0,150.36,9091991000000000.0
2009-01-01 00:04:00,1009.6,65.087,60.625,-34.909,-35.86,182.319,90000000000.0,150.57,9091991000000000.0


In [16]:
# Summary statistics per column; `count` is the number of non-NaN values.
data_df.describe()

Unnamed: 0,Pressure[mbar],10MRH[%],2MRH[%],10MVTair[degC],2MVTair[degC],twrLWTotalDownwelling[W/m^2],twrQualityControl,bsrnLWTotalDownwelling[W/m^2],bsrnQualityControl
count,217440.0,217440.0,217440.0,217440.0,217440.0,217440.0,217440.0,217440.0,217440.0
mean,1003.53553,69.325475,67.752705,-27.886775,-28.303953,185.925846,7747740000000.0,172.622351,8.217672e+17
std,8.740435,6.926484,7.178539,11.173209,11.503212,35.147923,46399320000000.0,42.61129,1.89535e+18
min,980.43,44.144,42.781,-46.883,-47.04,121.307,0.0,106.1,9000991000000000.0
25%,997.06,64.29175,62.393,-36.698,-37.401,165.01175,0.0,140.2875,9091991000000000.0
50%,1003.7,67.554,65.879,-32.233,-32.776,177.543,90000000000.0,163.62,9091991000000000.0
75%,1009.7,73.628,72.494,-19.789,-19.924,197.34625,90000000000.0,192.5,9091991000000000.0
max,1031.2,98.91,98.564,1.3766,1.6535,313.023,440000000000000.0,307.85,6.689022e+18


In [17]:
# NOTE(review): `ref_df` is not defined in any cell shown here; this looks like a leftover cell that can be removed.
# ref_df.describe()

In [18]:
# Confirm the joined frame carries the selected, source-prefixed column names.
data_df.columns

Index(['Pressure[mbar]', '10MRH[%]', '2MRH[%]', '10MVTair[degC]',
       '2MVTair[degC]', 'twrLWTotalDownwelling[W/m^2]', 'twrQualityControl',
       'bsrnLWTotalDownwelling[W/m^2]', 'bsrnQualityControl'],
      dtype='object')

In [19]:
# Confirm the index is the 'Datetime' index and check its time span and length.
data_df.index

DatetimeIndex(['2009-01-01 00:00:00', '2009-01-01 00:01:00',
               '2009-01-01 00:02:00', '2009-01-01 00:03:00',
               '2009-01-01 00:04:00', '2009-01-01 00:05:00',
               '2009-01-01 00:06:00', '2009-01-01 00:07:00',
               '2009-01-01 00:08:00', '2009-01-01 00:09:00',
               ...
               '2009-05-31 23:51:00', '2009-05-31 23:52:00',
               '2009-05-31 23:53:00', '2009-05-31 23:54:00',
               '2009-05-31 23:55:00', '2009-05-31 23:56:00',
               '2009-05-31 23:57:00', '2009-05-31 23:58:00',
               '2009-05-31 23:59:00', '2009-06-01 00:00:00'],
              dtype='datetime64[ns]', name='Datetime', length=217441, freq=None)

In [20]:
# Cache the joined frame as a CSV in the current working directory; the
# 'Datetime' index is written as the first column.
# Reload with: pd.read_csv('eureka-data.csv', index_col='Datetime', parse_dates=True)
data_df.to_csv('eureka-data.csv')