In [2]:
# Import modules
import numpy as np
import pandas as pd
import glob
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import netCDF4
from datetime import timedelta, datetime
import pickle

# Define data
files = glob.glob('/Users/jryan4/Dropbox (University of Oregon)/research/clouds/data/merged_data/*')

# Define solar zenith angles
sza_values = [35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

In [3]:
def data_read_machine_learning(files):
    
    # Combine data   
    cloud_type = []
    cloud_phase = []
    sza_cloudsat = []
    region = []
    elev = []
    sw_down_cs = []
    sw_down_as = []
    lw_down_cs = []
    lw_down_as = []
    
    modis_cot = []
    modis_ctp = []
    modis_phase = []
    modis_cer = []
    modis_ctt = []
    modis_cth = []
    modis_cwp = []
    
    modis_albedo = []
    cloudsat_albedo = []
    t2m = []
    d2m = []
    ssrdc = []
    strdc = []
    
    lat = []
    lon = []
    
    for i in files:
        
        # Read data
        df = pd.read_csv(i)
        
        # Get path and filename separately
        infilepath1, infilename1 = os.path.split(i)
        # Get file name without extension
        infileshortname1, extension1 = os.path.splitext(infilename1)
        
        # Append
        lat.append(df['lat'].values)
        lon.append(df['lon'].values)
        sza_cloudsat.append(df['sza_cloudsat'].values)
        cloud_type.append(df['cloud_type'].values)
        region.append(df['region'].values)
        elev.append(df['elev'].values)
        sw_down_as.append(df['sw_down_as'].values)
        sw_down_cs.append(df['sw_down_cs'].values)
        lw_down_as.append(df['lw_down_as'].values)
        lw_down_cs.append(df['lw_down_cs'].values)
        cloud_phase.append(df['cloud_phase_cloudsat'].values)
        modis_cot.append(df['cloud_optical_thickness'].values)
        modis_ctp.append(df['cloud_top_pressure'].values)
        modis_phase.append(df['cloud_phase_modis'].values)
        modis_cer.append(df['cloud_effective_radius'].values)
        modis_ctt.append(df['cloud_top_temperature'].values)
        modis_cth.append(df['cloud_top_height'].values)
        modis_cwp.append(df['cloud_water_path'].values)
        modis_albedo.append(df['albedo_modis'].values)
        cloudsat_albedo.append(df['albedo_cloudsat'].values)
        t2m.append(df['t2m'].values)
        d2m.append(df['d2m'].values)
        ssrdc.append(df['ssrdc'].values)
        strdc.append(df['strdc'].values)
        
    lat_flat = [item for sublist in lat for item in sublist]
    lon_flat = [item for sublist in lon for item in sublist]
    cloud_type_flat = [item for sublist in cloud_type for item in sublist]
    cloud_phase_flat = [item for sublist in cloud_phase for item in sublist]
    sza_flat = [item for sublist in sza_cloudsat for item in sublist]
    region_flat = [item for sublist in region for item in sublist]
    elev_flat = [item for sublist in elev for item in sublist]
    sw_down_cs_flat = [item for sublist in sw_down_cs for item in sublist]
    sw_down_as_flat = [item for sublist in sw_down_as for item in sublist]
    lw_down_cs_flat = [item for sublist in lw_down_cs for item in sublist]
    lw_down_as_flat = [item for sublist in lw_down_as for item in sublist]
    
    modis_cot_flat = [item for sublist in modis_cot for item in sublist]
    modis_ctp_flat = [item for sublist in modis_ctp for item in sublist]
    modis_phase_flat = [item for sublist in modis_phase for item in sublist]
    modis_cer_flat = [item for sublist in modis_cer for item in sublist]
    modis_ctt_flat = [item for sublist in modis_ctt for item in sublist]
    modis_cth_flat = [item for sublist in modis_cth for item in sublist]
    modis_cwp_flat = [item for sublist in modis_cwp for item in sublist]
    
    modis_albedo_flat = [item for sublist in modis_albedo for item in sublist]
    cloudsat_albedo_flat = [item for sublist in cloudsat_albedo for item in sublist]
    t2m_flat = [item for sublist in t2m for item in sublist]
    d2m_flat = [item for sublist in d2m for item in sublist]
    ssrdc_flat = [item for sublist in ssrdc for item in sublist]
    strdc_flat = [item for sublist in strdc for item in sublist]

    # Put into DataFrame
    df = pd.DataFrame(list(zip(lon_flat, lat_flat, sza_flat,cloud_type_flat,
                               cloud_phase_flat,region_flat,
                               elev_flat, sw_down_cs_flat,sw_down_as_flat, 
                               lw_down_cs_flat,lw_down_as_flat,modis_cot_flat,
                               modis_ctp_flat,modis_phase_flat,modis_cer_flat,
                               modis_ctt_flat,modis_cth_flat, modis_cwp_flat,
                               modis_albedo_flat,cloudsat_albedo_flat, t2m_flat,
                               d2m_flat, ssrdc_flat, strdc_flat)))
    
    df.columns = ['lon', 'lat', 'sza', 'type', 'phase', 'region', 'elev','sw_cs', 'sw_as',
                  'lw_cs', 'lw_as','modis_cot', 'modis_ctp', 'modis_phase',
                  'modis_cer', 'modis_ctt', 'modis_cth', 'modis_cwp', 
                  'modis_albedo','cloudsat_albedo', 't2m', 'd2m', 'ssrdc', 'strdc']
    
    # Remove rows with no data
    df = df.dropna()
    
    # Remove rows with spurious longwave data
    df = df[df['lw_as'] < 400]
    df = df[df['lw_as'] > 150]
    df = df[df['lw_cs'] != 0]
    df = df[df['lw_cs'] > 150]
    
    # Remove rows with spurious shortwave data
    df = df[df['sw_as'] != 0]
    df = df[df['sw_cs'] != 0]
    
    # Remove if cloud detected but no effect on radiative fluxes
    clearsky = df[df['type'] == 0]
    valid = df[(df['lw_cs'] != df['lw_as']) | (df['sw_cs'] != df['sw_as'])]
    df = pd.concat((clearsky, valid))
    
    # Add factor column
    df['f_sw'] = np.divide(df['sw_as'], df['sw_cs'])
    df['f_lw'] = np.divide(df['lw_as'], df['lw_cs'])

    return df

In [4]:
# Read data
df = data_read_machine_learning(files)

In [5]:
i = 55

# Remove clear skies
data = df[df['type'] > 0]

# Get small band of solar zenith angles
data = data[(data['sza'] < i + 5) & (data['sza'] >= i)]

# Define feature list
feature_list = ['modis_cot', 'modis_ctp', 'modis_cer', 
                'modis_ctt', 'modis_cth', 'modis_cwp', 't2m']

# Define labels and targets
y = data['f_sw']
X = data[['modis_cot', 'modis_ctp', 'modis_cer', 'modis_ctt', 
          'modis_cth', 'modis_cwp', 't2m']]

# Normalize by MinMax
#scaler = MinMaxScaler(feature_range=(0, 1))
#X_norm = scaler.fit_transform(X)

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Define classifier
classifier = RandomForestRegressor(n_estimators=100)

# Train classifier
classifier.fit(X_train, y_train)

# Predict
predictions = classifier.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 3))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

# Get numerical feature importances
importances = list(classifier.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, 
                       importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Mean Absolute Error: 0.074
Accuracy: 87.76 %.
Variable: t2m                  Importance: 0.32
Variable: modis_ctt            Importance: 0.18
Variable: modis_cot            Importance: 0.14
Variable: modis_cer            Importance: 0.14
Variable: modis_cth            Importance: 0.1
Variable: modis_cwp            Importance: 0.1
Variable: modis_ctp            Importance: 0.03


[None, None, None, None, None, None, None]

In [6]:
# Define feature list
feature_list = ['modis_cot', 'modis_ctp', 'modis_phase', 'modis_cer', 
                'modis_ctt', 'modis_cth', 'modis_cwp', 't2m']

# Define labels and targets
y = data['f_lw']
X = data[['modis_cot', 'modis_ctp', 'modis_phase', 'modis_cer', 'modis_ctt', 
          'modis_cth', 'modis_cwp', 't2m']]

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Define classifier
classifier = RandomForestRegressor(n_estimators=100)

# Train classifier
classifier.fit(X_train, y_train)

# Predict
predictions = classifier.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 3))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

# Get numerical feature importances
importances = list(classifier.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, 
                       importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Mean Absolute Error: 0.031
Accuracy: 97.62 %.
Variable: t2m                  Importance: 0.4
Variable: modis_ctt            Importance: 0.14
Variable: modis_cer            Importance: 0.12
Variable: modis_cot            Importance: 0.11
Variable: modis_cth            Importance: 0.09
Variable: modis_cwp            Importance: 0.07
Variable: modis_phase          Importance: 0.04
Variable: modis_ctp            Importance: 0.03


[None, None, None, None, None, None, None, None]