In [1]:
import pandas as pd
import os

In [2]:
root_dir_rmse = 'D:\\Luis_Mercado\\Google Business\\Shared Folder\\Thesis 1\\Experiment Results\\rmse'
root_dir_plcc = 'D:\\Luis_Mercado\\Google Business\\Shared Folder\\Thesis 1\\Experiment Results\\plcc'

# `models` holds the identifiers used in the result file names; `model_names`
# holds the display label for the model at the same position.
models = ['direct_v4', 'recurrent_singleton_v2', 'nbr_singleton_v2', 'randomforest_singleton_v2',
          'sgb_singleton_v2', 'ensemble_singleton_v2']
model_names = ['Direct SVR', 'Recurrent SVR', 'K-Nearest Neighbors Regression', 'Random Forest Regression',
               'Stochastic Gradient Boosting', 'Stacking Ensemble']

plcc_files = ['plcc_{}.txt'.format(model) for model in models]
rmse_files = ['rmse_{}.txt'.format(model) for model in models]


def _parse_plcc_value(text):
    """Convert the text found after ':' on a PLCC data line to a float.

    Takes the first comma-separated field, strips surrounding whitespace and
    drops its first character before conversion.
    """
    # NOTE(review): the dropped character is presumably an opening parenthesis
    # of a "(coefficient, p-value)" pair -- confirm against the result files.
    return float(text.split(',')[0].strip()[1:])


def _parse_rmse_value(text):
    """Convert the text found after ':' on an RMSE data line to a float.

    Takes the first comma-separated field and strips surrounding whitespace.
    """
    return float(text.split(',')[0].strip())


def _load_metric_files(root_dir, file_names, labels, header_marker, parse_value):
    """Read one result file per model and collect its points per section.

    Args:
        root_dir: directory that contains the files.
        file_names: file names to read, one per model.
        labels: display name of the model for the file at the same position.
        header_marker: substring identifying a section header line
            (for example 'PLCC for'); every other non-blank line is parsed
            as a data line.
        parse_value: callable that turns the text after ':' into a float.

    Returns:
        dict mapping header line -> model label -> list of
        ``(time_step, value)`` tuples in file order.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a data line does not contain exactly one ':' or its
            fields cannot be converted.
        KeyError: if a data line appears before any header line.
    """
    results = {}
    for model_name, file_name in zip(labels, file_names):
        file_path = os.path.join(root_dir, file_name)
        with open(file_path, mode='r', encoding='utf-8') as reader:
            current_apc = ''
            for line_read in reader:
                # Blank lines (e.g. a trailing empty line) carry no data; the
                # two-way unpack below would otherwise fail on them.
                if not line_read.strip():
                    continue
                if header_marker in line_read:
                    # Strip only the newline: slicing off the last character
                    # would cut real text when the final line has no newline.
                    current_apc = line_read.rstrip('\n')
                    results.setdefault(current_apc, {}).setdefault(model_name, [])
                else:
                    time_step, raw_value = line_read.split(':')
                    # The step number sits after the first 8 characters of the
                    # label, with the label's last character dropped.
                    # NOTE(review): fixed offsets assume a constant-width
                    # label prefix -- confirm against the result files.
                    time_step = int(time_step[8:-1])
                    results[current_apc][model_name].append((time_step, parse_value(raw_value)))
    return results


plcc_dict = _load_metric_files(root_dir_plcc, plcc_files, model_names, 'PLCC for', _parse_plcc_value)
rmse_dict = _load_metric_files(root_dir_rmse, rmse_files, model_names, 'RMSE for', _parse_rmse_value)

In [12]:
root_dir_actualversus = (
    'D:\\Luis_Mercado\\Google Business\\Shared Folder\\Thesis 1\\Experiment Results\\actual vs predicted\\recurrent vs ensemble'
)

# Every sub-directory of the root is treated as one pollutant (APC).
apc_list = [
    name
    for name in os.listdir(root_dir_actualversus)
    if os.path.isdir(os.path.join(root_dir_actualversus, name))
]

# Each dict maps pollutant -> time-step label -> list of floats.
actual_values, rsvr_preds, ens_preds = {}, {}, {}

for apc in apc_list:
    apc_dir = os.path.join(root_dir_actualversus, apc)
    print('opening directory for %s' % apc)
    actual_values[apc] = {}
    rsvr_preds[apc] = {}
    ens_preds[apc] = {}
    with os.scandir(apc_dir) as entries:
        for entry in entries:
            # Only regular files hold results; 'desktop.ini' is skipped by name.
            if not entry.is_file() or entry.name == 'desktop.ini':
                continue
            # The time-step label is the text between 'ensemble_' and the next '.'.
            time_step = entry.name.split('ensemble_')[1].split('.')[0].strip()
            actual_col, rsvr_col, ens_col = [], [], []
            actual_values[apc][time_step] = actual_col
            ens_preds[apc][time_step] = ens_col
            rsvr_preds[apc][time_step] = rsvr_col
            with open(os.path.join(apc_dir, entry.name), 'r') as f_reader:
                for read_line in f_reader:
                    # Comma-separated fields: actual, recurrent SVR, ensemble.
                    values = read_line.split(',')
                    actual_col.append(float(values[0]))
                    rsvr_col.append(float(values[1]))
                    ens_col.append(float(values[2]))
    print('done with %s' % apc)

opening directory for CO
done with CO
opening directory for NO2
done with NO2
opening directory for O3
done with O3
opening directory for PM25
done with PM25


In [13]:
def get_time_step_int(string):
    """Return the integer that follows the first '+' in ``string`` (e.g. 't+3' -> 3)."""
    pieces = string.split('+')
    step_text = pieces[1]
    return int(step_text)
# Time-step labels are taken from the 'CO' entry and ordered by their numeric part.
keys = [*actual_values['CO']]
time_steps_sorted = list(keys)
time_steps_sorted.sort(key=get_time_step_int)

# Recurrent vs Ensemble based on actual vs predicted values

In [14]:
import numpy as np
def get_table_actualversus(apc_name, time_step):
    """Return a DataFrame comparing actual and predicted values.

    The values are read from the module-level ``actual_values``,
    ``rsvr_preds`` and ``ens_preds`` dicts for the given pollutant and
    time-step label. Rows are numbered from 0; the columns are 'Actual',
    'RSVR Predicted' and 'Ensemble Predicted', in that order.
    """
    row_index = np.arange(0, len(actual_values[apc_name][time_step]))
    labelled_values = [
        ('Actual', actual_values[apc_name][time_step]),
        ('RSVR Predicted', rsvr_preds[apc_name][time_step]),
        ('Ensemble Predicted', ens_preds[apc_name][time_step]),
    ]
    columns = [
        pd.Series(data=values, index=row_index, name=label)
        for label, values in labelled_values
    ]
    table = pd.DataFrame()
    for column in columns:
        table[column.name] = column
    return table

In [15]:
# Preview the comparison table for the first pollutant in apc_list at step 't+1'.
get_table_actualversus(apc_list[0], 't+1')

Unnamed: 0,Actual,RSVR Predicted,Ensemble Predicted
0,6.0670,5.465108,5.269312
1,5.6390,5.458196,5.134067
2,5.4740,5.574417,5.195652
3,5.5595,5.898442,5.550784
4,6.1495,5.869622,5.642680
...,...,...,...
156,4.6935,5.240607,5.883135
157,4.2705,5.343128,5.835623
158,6.1050,5.029723,5.928800
159,4.5865,5.705523,5.819859


In [18]:
from docx import Document
from docx.shared import Inches

headers = ['DataPoint#', 'Actual', 'RSVR', 'Ensemble']

# Unit label printed in each document title, keyed by pollutant name.
# A pollutant missing from this table gets an empty unit string.
apc_units = {'CO': 'PPM', 'NO2': 'PPB', 'PM25': 'ug/m3', 'O3': 'PPB'}

# try to create directory if not created yet
root_dir = 'results'
if not os.path.exists(root_dir):
    try:
        os.makedirs(root_dir)
    except OSError:
        print ("Creation of the directory %s failed" % root_dir)
    else:
        print ("Successfully created the directory %s" % root_dir)

# One .docx per pollutant, with one table (and one page) per time step.
for apc in apc_list:
    # Build the path from root_dir so the files always land in the directory
    # created above, even if root_dir is changed later.
    file_name = os.path.join(root_dir, 'actual vs predicted %s.docx' % apc)
    doc = Document()
    unit = apc_units.get(apc, '')

    doc.add_heading('Actual vs Predicted %s (%s)' % (apc, unit), 0)

    # time_steps_sorted must contain only labels present for every pollutant,
    # otherwise the lookups below raise KeyError.
    for time_step in time_steps_sorted:
        doc.add_heading('Time step (%s)' % time_step, 1)
        table = doc.add_table(rows=1, cols=len(headers))
        hdr_cells = table.rows[0].cells
        for hdr_cell, header in zip(hdr_cells, headers):
            run = hdr_cell.paragraphs[0].add_run(header)
            run.bold = True
        # The three lists are iterated together; rows are numbered from 1 and
        # values are rounded to 4 decimals for display.
        value_rows = zip(actual_values[apc][time_step],
                         rsvr_preds[apc][time_step],
                         ens_preds[apc][time_step])
        for row_number, (actual, rsvr, ens) in enumerate(value_rows, start=1):
            row_cells = table.add_row().cells
            row_cells[0].text = str(row_number)
            row_cells[1].text = str(round(actual, 4))
            row_cells[2].text = str(round(rsvr, 4))
            row_cells[3].text = str(round(ens, 4))
        doc.add_page_break()

    doc.save(file_name)
    print('Done writing file for %s!' % file_name)

Done writing file for results\actual vs predicted CO.docx!
Done writing file for results\actual vs predicted NO2.docx!
Done writing file for results\actual vs predicted O3.docx!
Done writing file for results\actual vs predicted PM25.docx!


# PLCC

In [48]:
# Table names are the keys of plcc_dict, in insertion order.
apc_names = [*plcc_dict]
# Models left out of the tables shown below.
excludes = [
    'Direct SVR',
    'K-Nearest Neighbors Regression',
    'Random Forest Regression',
    'Stochastic Gradient Boosting',
]
def get_table_plcc(name, exclude=None, round_off=2):
    """Print ``name`` and return its PLCC values as a DataFrame.

    Args:
        name: key of the module-level ``plcc_dict`` to tabulate.
        exclude: model names to leave out; ``None`` keeps every model.
        round_off: number of decimals each value is rounded to.

    Returns:
        DataFrame indexed by time step with one column per kept model.
        Columns are added one at a time, so later columns are aligned to the
        index set by the first one.
    """
    print(name)
    skipped = list() if exclude is None else exclude
    table = pd.DataFrame()
    for label, series_points in plcc_dict[name].items():
        if label not in skipped:
            steps = [pt[0] for pt in series_points]
            scores = [round(pt[1], round_off) for pt in series_points]
            table[label] = pd.Series(data=scores, index=steps, name=label)
    return table

In [49]:
# PLCC table for the first entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[0]
get_table_plcc(apc_name, excludes)

PLCC for PM25 (ug/m3)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,0.4,0.24
2,0.35,0.25
3,0.31,0.29
4,0.23,0.25
5,0.11,0.19
6,0.04,0.17
7,0.0,0.19
8,0.0,0.16
9,0.02,0.09
10,-0.06,0.03


In [50]:
# PLCC table for the second entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[1]
get_table_plcc(apc_name, excludes)

PLCC for CO (PPM)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,0.39,0.12
2,0.11,0.15
3,-0.04,0.17
4,-0.09,0.15
5,-0.08,0.06
6,-0.04,0.06
7,0.02,0.02
8,0.07,0.01
9,0.11,0.01
10,0.11,0.05


In [51]:
# PLCC table for the third entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[2]
get_table_plcc(apc_name, excludes)

PLCC for NO2 (PPM)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,0.67,-0.56
2,0.49,-0.57
3,0.33,-0.57
4,0.22,-0.55
5,0.13,-0.54
6,0.06,-0.54
7,0.08,-0.53
8,0.14,-0.53
9,0.19,-0.58
10,0.23,-0.63


In [52]:
# PLCC table for the fourth entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[3]
get_table_plcc(apc_name, excludes)

PLCC for O3 (PPB)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,0.42,0.12
2,0.21,0.12
3,0.04,0.11
4,-0.04,0.11
5,-0.1,0.08
6,-0.1,0.1
7,-0.12,0.13
8,-0.07,0.16
9,-0.01,0.18
10,-0.05,0.18


# RMSE

In [53]:
# Table names are the keys of rmse_dict, in insertion order.
# NOTE: this rebinds `apc_names` and `excludes`, which the PLCC cells also use;
# re-running a PLCC cell after this one would look up RMSE keys in plcc_dict.
apc_names = [*rmse_dict]
# Models left out of the tables shown below.
excludes = [
    'Direct SVR',
    'K-Nearest Neighbors Regression',
    'Random Forest Regression',
    'Stochastic Gradient Boosting',
]
def get_table_rmse(name, exclude=None, round_off=2):
    """Print ``name`` and return its RMSE values as a DataFrame.

    Args:
        name: key of the module-level ``rmse_dict`` to tabulate.
        exclude: model names to leave out; ``None`` keeps every model.
        round_off: number of decimals each value is rounded to.

    Returns:
        DataFrame indexed by time step with one column per kept model.
        Columns are added one at a time, so later columns are aligned to the
        index set by the first one.
    """
    result = pd.DataFrame()

    print(name)
    omitted = exclude if exclude is not None else list()
    per_model = rmse_dict[name]
    for model_label in per_model:
        if model_label in omitted:
            continue
        measurements = per_model[model_label]
        step_numbers = [entry[0] for entry in measurements]
        rounded_errors = [round(entry[1], round_off) for entry in measurements]
        column = pd.Series(data=rounded_errors, index=step_numbers, name=model_label)
        result[model_label] = column
    return result

In [54]:
# RMSE table for the first entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[0]
get_table_rmse(apc_name,excludes)

RMSE for PM25 (ug/m3)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,12.74,8.7
2,13.05,8.85
3,13.19,8.83
4,13.59,8.9
5,13.76,9.06
6,13.83,9.08
7,14.04,9.07
8,14.14,9.27
9,14.13,9.5
10,14.35,9.67


In [55]:
# RMSE table for the second entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[1]
get_table_rmse(apc_name, excludes)

RMSE for CO (PPM)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,1.13,0.85
2,1.31,0.86
3,1.41,0.86
4,1.49,0.88
5,1.59,0.9
6,1.76,0.91
7,2.02,0.92
8,2.19,0.91
9,2.24,0.89
10,2.27,0.86


In [56]:
# RMSE table for the third entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[2]
get_table_rmse(apc_name, excludes)

RMSE for NO2 (PPM)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,21.56,38.03
2,27.25,38.64
3,32.23,39.57
4,35.98,40.35
5,39.57,41.06
6,44.44,41.15
7,48.44,40.95
8,49.53,40.76
9,48.43,40.41
10,47.12,39.77


In [57]:
# RMSE table for the fourth entry of apc_names, omitting the models in `excludes`.
apc_name = apc_names[3]
get_table_rmse(apc_name, excludes)

RMSE for O3 (PPB)


Unnamed: 0,Recurrent SVR,Stacking Ensemble
1,34.32,27.06
2,36.46,27.06
3,37.6,27.09
4,39.11,27.12
5,41.2,27.27
6,41.92,27.21
7,42.1,27.3
8,41.56,27.3
9,40.53,27.04
10,41.21,26.82
