### Preprocess

In [1]:
import pandas as pd, numpy as np
from tqdm import tqdm

In [2]:
# Load the raw development data with pandas' default "," separator. The cells
# below treat the header and every row as a single ';'-delimited string and
# split them manually.
df_train = pd.read_csv("bbdc_2023_AWI_data_develop_professional.csv")

In [3]:
# Recover the individual column names by splitting the first header string on ';'.
clean_cols = df_train.columns[0].split(';')

In [4]:
# Split the first field of every raw row on ';' to get one list of strings per record.
clean_values = [record[0].split(';') for record in df_train.values]

In [5]:
# Build the working frame from the split rows. The first split row
# (clean_values[0]) is skipped -- presumably a units/sub-header line; TODO confirm.
df_train_clean = pd.DataFrame(clean_values[1:], columns=clean_cols)

In [6]:
# Parse the "Datum" strings into datetimes.
# NOTE(review): yearfirst=True and dayfirst=True are both set; per the pandas
# documentation yearfirst wins when both are True -- confirm against the file's date format.
df_train_clean["Datum"] = pd.to_datetime(df_train_clean["Datum"], yearfirst=True, dayfirst=True)

In [7]:
def fix_format(string):
    """Return ``string`` without any leading or trailing ``?`` characters.

    Question marks inside the string are left untouched.
    """
    without_leading = string.lstrip("?")
    return without_leading.rstrip("?")

In [8]:
# Presence masks: True where the raw string in the column is not the literal "NA".
# `select` (no suffix) is the mask for NOx.
select = df_train_clean["NOx"] != "NA"
select_no3 = df_train_clean["NO3"] != "NA"
select_no2 = df_train_clean["NO2"] != "NA"
select_nh4 = df_train_clean["NH4"] != "NA"
select_sio4 = df_train_clean["SiO4"] != "NA"
select_sal = df_train_clean["Salinität"] != "NA"
select_temp = df_train_clean["Temperatur"] != "NA"
select_sec = df_train_clean["SECCI"] != "NA"

In [9]:
# Observed values as float arrays after stripping "?" markers. The NO3/NO2/NH4
# arrays keep only rows where NOx is present as well (their mask is ANDed with `select`).
# NOTE(review): none of these four arrays is referenced again in the cells shown here.
NOx = df_train_clean["NOx"].apply(fix_format)[select].values.astype(float)
NO3 = df_train_clean["NO3"].apply(fix_format)[select & select_no3].values.astype(float)
NO2 = df_train_clean["NO2"].apply(fix_format)[select & select_no2].values.astype(float)
NH4 = df_train_clean["NH4"].apply(fix_format)[select & select_nh4].values.astype(float)

In [10]:
# Polynomial degree used for every regression fit below. The prediction helpers
# read this global directly, so it must not change between fitting and predicting.
deg = 4

In [11]:
def interpolate(data1, data2, select, degree=deg):
    """Fit a polynomial least-squares regression of ``data2`` on ``data1``.

    Parameters
    ----------
    data1 : indexable by ``select``; the selected ``.values`` are cast to float
        and expanded with ``PolynomialFeatures`` (which requires a 2-D input).
    data2 : indexable by ``select``; the selected ``.values`` are cast to float
        and used as regression target.
    select : boolean mask choosing the rows used for fitting.
    degree : polynomial degree of the feature expansion. The default is the
        value the module-level ``deg`` had when this function was defined.

    Returns
    -------
    The fitted ``sklearn.linear_model.LinearRegression``. It expects inputs that
    were expanded with ``PolynomialFeatures`` of the same degree.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    inputs = data1[select].values.astype(float)
    targets = data2[select].values.astype(float)
    expanded_inputs = PolynomialFeatures(degree=degree).fit_transform(inputs)
    # LinearRegression.fit returns the estimator itself.
    return LinearRegression().fit(expanded_inputs, targets)

Model NO3, NO2 and NH4 as polynomial functions of `Temperatur` and `Salinität`, then use the fitted models to fill in their missing values.

In [12]:
# Regression targets: the nutrient columns with "?" markers stripped. Values are
# still strings and "NA" placeholders remain; they are masked out when fitting.
target_no3 = df_train_clean["NO3"].apply(fix_format)
target_no2 = df_train_clean["NO2"].apply(fix_format)
target_nh4 = df_train_clean["NH4"].apply(fix_format)

In [13]:
# Two-column predictor frame (Temperatur, Salinität) with "?" markers stripped;
# values are still strings at this point.
data_temp_sal = pd.concat([df_train_clean["Temperatur"].apply(fix_format), df_train_clean["Salinität"].apply(fix_format)], axis=1)

In [14]:
# Fit one degree-`deg` polynomial regression per nutrient, using only rows where
# both predictors and the respective target are present.
model_no3 = interpolate(data_temp_sal, target_no3, select_temp & select_sal & select_no3, degree=deg)
model_no2 = interpolate(data_temp_sal, target_no2, select_temp & select_sal & select_no2, degree=deg)
model_nh4 = interpolate(data_temp_sal, target_nh4, select_temp & select_sal & select_nh4, degree=deg)

In [15]:
def predict_with_temp_sal(df, model, select, col_name="NO3"):
    """Fill ``col_name`` for the selected rows from temperature and salinity.

    The "Temperatur" and "Salinität" strings of the selected rows are stripped
    of "?" markers, expanded with ``PolynomialFeatures(degree=deg)`` (``deg`` is
    the module-level degree and must match the one ``model`` was fitted with),
    and passed to ``model.predict``. Each prediction is written back into
    ``col_name`` as a string.

    Rows are addressed by position through the boolean mask, so the result does
    not depend on ``df`` having a default 0..n-1 index, and all rows are
    predicted in a single batch.

    Parameters
    ----------
    df : DataFrame with string columns "Temperatur", "Salinität" and ``col_name``.
        It is modified in place.
    model : fitted regressor exposing ``predict``.
    select : boolean mask, positionally aligned with the rows of ``df``.
    col_name : name of the column that receives the predictions.

    Returns
    -------
    The same ``df`` object.
    """
    from sklearn.preprocessing import PolynomialFeatures

    row_mask = np.asarray(select, dtype=bool)
    if not row_mask.any():
        # Nothing selected: leave the frame untouched.
        return df
    predictors = np.column_stack([
        [value.strip("?") for value in df.loc[row_mask, "Temperatur"]],
        [value.strip("?") for value in df.loc[row_mask, "Salinität"]],
    ]).astype(float)
    poly_features = PolynomialFeatures(degree=deg).fit_transform(predictors)
    predictions = model.predict(poly_features)
    # The frame still holds strings at this stage, so store predictions as text.
    df.loc[row_mask, col_name] = [str(value) for value in predictions]
    return df

In [16]:
# Impute each nutrient where it is "NA" but temperature and salinity are both
# available. The helper writes into df_train_clean in place; the trailing ';'
# suppresses display of the returned frame.
predict_with_temp_sal(df_train_clean, model_no3, select_temp & select_sal & ~select_no3, col_name="NO3");
predict_with_temp_sal(df_train_clean, model_no2, select_temp & select_sal & ~select_no2, col_name="NO2");
predict_with_temp_sal(df_train_clean, model_nh4, select_temp & select_sal & ~select_nh4, col_name="NH4");

In [17]:
# Recompute the presence masks now that some "NA" entries were replaced by predictions.
select_no3 = df_train_clean["NO3"] != "NA"
select_no2 = df_train_clean["NO2"] != "NA"
select_nh4 = df_train_clean["NH4"] != "NA"

In [18]:
# Predictor frame for the second imputation stage: NO3, NO2 and NH4 with "?"
# markers stripped (still strings). Despite its name it is also the input of
# the SECCI model fitted below.
features_nox = pd.concat([df_train_clean["NO3"].apply(fix_format), df_train_clean["NO2"].apply(fix_format),
                          df_train_clean["NH4"].apply(fix_format)], axis=1)

In [19]:
# Second-stage regression targets with "?" markers stripped (strings; "NA" is masked out when fitting).
target_nox = df_train_clean["NOx"].apply(fix_format)
target_secci = df_train_clean["SECCI"].apply(fix_format)

In [20]:
# Fit NOx and SECCI as degree-`deg` polynomials of (NO3, NO2, NH4), using rows
# where all three predictors and the respective target are present.
model_nox = interpolate(features_nox, target_nox, select_no3 & select_no2 & select_nh4 & select, degree=deg)
model_secci = interpolate(features_nox, target_secci, select_no3 & select_no2 & select_nh4 & select_sec, degree=deg)

In [21]:
#model_nox.coef_

In [22]:
# Evaluates the NOx values used for fitting as floats and discards the result
# (output suppressed by ';'); the cast raises if a selected value is not numeric.
df_train_clean["NOx"].apply(fix_format)[select_no3 & select_no2 & select_nh4 & select].values.astype(float);

In [23]:
# Evaluates the SECCI values used for fitting as floats and discards the result
# (output suppressed by ';'); the cast raises if a selected value is not numeric.
df_train_clean["SECCI"].apply(fix_format)[select_no3 & select_no2 & select_nh4 & select_sec].values.astype(float);

In [24]:
# In-sample NOx predictions on exactly the rows model_nox was fitted on.
# The feature expansion has to use the same degree as at fit time.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=deg)
poly_features = poly.fit_transform(features_nox[select_no3 & select_no2 & select_nh4 & select].values.astype(float))
pred_nox = model_nox.predict(poly_features)

In [25]:
# In-sample SECCI predictions on exactly the rows model_secci was fitted on.
# Rebinds the notebook-level `poly` and `poly_features`.
poly = PolynomialFeatures(degree=deg)
poly_features = poly.fit_transform(features_nox[select_no3 & select_no2 & select_nh4 & select_sec].values.astype(float))
pred_secci = model_secci.predict(poly_features)

In [26]:
# Inspection cell; the trailing ';' suppresses the output.
pred_nox;

In [27]:
# Inspection cell; the trailing ';' suppresses the output.
pred_secci;

In [28]:
def predict_missing_nox(df, model, select):
    """Fill missing "NOx" entries from the NO3, NO2 and NH4 columns.

    The three predictor strings of the selected rows are stripped of "?"
    markers, expanded with ``PolynomialFeatures(degree=deg)`` (``deg`` is the
    module-level degree and must match the one ``model`` was fitted with) and
    passed to ``model.predict``. Each prediction is stored in "NOx" as a string.

    Rows are addressed by position through the boolean mask, so the result does
    not depend on ``df`` having a default 0..n-1 index. The "NA" check runs for
    all selected rows before anything is written, so a failing assertion leaves
    ``df`` unchanged.

    Parameters
    ----------
    df : DataFrame with string columns "NO3", "NO2", "NH4" and "NOx"; modified in place.
    model : fitted regressor exposing ``predict``.
    select : boolean mask, positionally aligned with the rows of ``df``.

    Returns
    -------
    The same ``df`` object.

    Raises
    ------
    AssertionError
        If a selected row already has a "NOx" value other than "NA".
    """
    from sklearn.preprocessing import PolynomialFeatures

    row_mask = np.asarray(select, dtype=bool)
    if not row_mask.any():
        return df
    # Only entries that still hold the "NA" placeholder may be overwritten.
    assert (df.loc[row_mask, "NOx"] == "NA").all()
    predictors = np.column_stack([
        [value.strip("?") for value in df.loc[row_mask, column]]
        for column in ("NO3", "NO2", "NH4")
    ]).astype(float)
    poly_features = PolynomialFeatures(degree=deg).fit_transform(predictors)
    predictions = model.predict(poly_features)
    # The frame still holds strings at this stage, so store predictions as text.
    df.loc[row_mask, "NOx"] = [str(value) for value in predictions]
    return df

def predict_missing_secci(df, model, select):
    """Fill missing "SECCI" entries from the NO3, NO2 and NH4 columns.

    The three predictor strings of the selected rows are stripped of "?"
    markers, expanded with ``PolynomialFeatures(degree=deg)`` (``deg`` is the
    module-level degree and must match the one ``model`` was fitted with) and
    passed to ``model.predict``. Each prediction is stored in "SECCI" as a string.

    Rows are addressed by position through the boolean mask, so the result does
    not depend on ``df`` having a default 0..n-1 index. The "NA" check runs for
    all selected rows before anything is written, so a failing assertion leaves
    ``df`` unchanged.

    Parameters
    ----------
    df : DataFrame with string columns "NO3", "NO2", "NH4" and "SECCI"; modified in place.
    model : fitted regressor exposing ``predict``.
    select : boolean mask, positionally aligned with the rows of ``df``.

    Returns
    -------
    The same ``df`` object.

    Raises
    ------
    AssertionError
        If a selected row already has a "SECCI" value other than "NA".
    """
    from sklearn.preprocessing import PolynomialFeatures

    row_mask = np.asarray(select, dtype=bool)
    if not row_mask.any():
        return df
    # Only entries that still hold the "NA" placeholder may be overwritten.
    assert (df.loc[row_mask, "SECCI"] == "NA").all()
    predictors = np.column_stack([
        [value.strip("?") for value in df.loc[row_mask, column]]
        for column in ("NO3", "NO2", "NH4")
    ]).astype(float)
    poly_features = PolynomialFeatures(degree=deg).fit_transform(predictors)
    predictions = model.predict(poly_features)
    # The frame still holds strings at this stage, so store predictions as text.
    df.loc[row_mask, "SECCI"] = [str(value) for value in predictions]
    return df

In [29]:
# Impute NOx / SECCI where they are "NA" but NO3, NO2 and NH4 are all available.
# Both helpers modify df_train_clean in place; ';' suppresses the returned frame.
predict_missing_nox(df_train_clean, model_nox, select_no3 & select_no2 & select_nh4 & ~select);
predict_missing_secci(df_train_clean, model_secci, select_no3 & select_no2 & select_nh4 & ~select_sec);

In [30]:
# Inspection cell: rows whose SECCI was "NA" before imputation (select_sec is the
# mask computed before the fill). The trailing ';' suppresses the output.
df_train_clean[select_no3 & select_no2 & select_nh4 & ~select_sec];

In [31]:
def handle_time(df):
    """Parse the "Uhrzeit" column, replacing unparseable entries by the mean time.

    Every entry is probed with ``pd.to_datetime``. Entries that raise
    ``ValueError`` are set to "NA" in ``df`` itself, the frame passed in, not in
    any notebook-level frame. The mean of the parseable entries then replaces
    every "NA", and the column is converted to datetimes.

    Invalid rows are tracked by position, so the function does not depend on
    ``df`` having a default 0..n-1 index.

    Parameters
    ----------
    df : DataFrame with a "Uhrzeit" column of time strings; modified in place.

    Returns
    -------
    The same ``df`` object with "Uhrzeit" converted via ``pd.to_datetime``.
    """
    valid_times = []
    unparseable = np.zeros(len(df), dtype=bool)
    for position, raw_time in enumerate(tqdm(df['Uhrzeit'], total=len(df))):
        try:
            pd.to_datetime(raw_time)
        except ValueError:
            unparseable[position] = True
        else:
            valid_times.append(raw_time)
    # Mark the bad entries on the frame that was passed in (boolean mask = positional).
    df.loc[unparseable, 'Uhrzeit'] = "NA"
    mean_time = pd.to_datetime(valid_times).mean()
    df["Uhrzeit"] = df["Uhrzeit"].replace("NA", mean_time)
    df["Uhrzeit"] = pd.to_datetime(df["Uhrzeit"], yearfirst=True, dayfirst=True)
    return df

In [32]:
# Normalise the "Uhrzeit" column; handle_time modifies the frame in place and returns it.
df_train_clean = handle_time(df_train_clean)

100%|██████████| 17531/17531 [00:03<00:00, 5154.62it/s]
  mean_time = pd.to_datetime(values).mean()


In [33]:
# Inspection cell; the trailing ';' suppresses the output.
df_train_clean;

In [34]:
def convert_join_date_time(date, time):
    """Combine a date Series and a time Series into comma-separated strings.

    For each row, the first whitespace-separated token of ``str(date)`` (the
    date part) and the second token of ``str(time)`` (the time part) are joined
    as ``"<time>,<date>"``. Every ':' and '-' is then replaced by ','.
    Timestamps ``2020-01-02`` and ``... 10:30:00`` give ``"10,30,00,2020,01,02"``.

    Parameters
    ----------
    date : Series whose elements stringify as "<date> <time>".
    time : Series whose elements stringify as "<date> <time>".

    Returns
    -------
    Series of strings aligned with the inputs.
    """
    separators_to_commas = str.maketrans({':': ',', '-': ','})
    date_part = date.apply(lambda stamp: str(stamp).split()[0])
    time_part = time.apply(lambda stamp: str(stamp).split()[1])
    joined = time_part + ',' + date_part
    return joined.apply(lambda text: text.translate(separators_to_commas))

In [35]:
# Per-row "time,date" string with ':' and '-' turned into ','.
datetime = convert_join_date_time(df_train_clean['Datum'], df_train_clean['Uhrzeit'])

In [36]:
# Attach the combined string as a new (last) column.
df_train_clean['datetime'] = datetime

In [37]:
# Inspection cell; the trailing ';' suppresses the output.
df_train_clean.columns;

In [38]:
# Inspection cell; the trailing ';' suppresses the output.
df_train_clean;

In [39]:
def handle_missing_numeric(df, col):
    values = []
    for c, row in enumerate(df[col]):
        if row != "NA":
            values.append(float(row.strip("?")))
        try:
            float(row)
        except ValueError:
            df.loc[c, col] = str(row).strip("?")
    df[col] = df[col].replace("NA", str(np.mean(values))).astype(float)
    return df

In [40]:
# Convert every measurement column to float; entries that are still "NA" after
# the model-based imputation are filled with the column mean (see handle_missing_numeric).
for col in tqdm(['SECCI', 'Temperatur', 'Salinität', 'NH4', 'NOx', 'NO2', 'NO3', 'PO4', 'SiO4']):
    df_train_clean = handle_missing_numeric(df_train_clean, col)

100%|██████████| 9/9 [00:05<00:00,  1.78it/s]


In [41]:
# The date and time are now carried by the 'datetime' column; drop the originals.
# NOTE: because of inplace=True, re-running this cell raises KeyError (columns already gone).
df_train_clean.drop(['Datum', 'Uhrzeit'], axis=1, inplace=True)

In [42]:
# Column order with the last column (the 'datetime' column added above) moved to the front.
cols = df_train_clean.columns
cols = [cols[-1]]+list(cols[0:-1])

In [43]:
# Apply the new column order.
df_train_clean = df_train_clean[cols]

In [44]:
# Inspection cell; the trailing ';' suppresses the output.
df_train_clean;

In [45]:
def clean_time(string):
    """Cut the third comma-separated field of ``string`` at its first '.'.

    Strings without any '.' are returned unchanged. Otherwise the string is
    split on ',', everything from the first '.' onwards is removed from the
    third field (index 2), and the fields are re-joined with ','.
    For the "hh,mm,ss[.ffffff],yyyy,mm,dd" strings built in this notebook this
    removes fractional seconds.

    Raises
    ------
    IndexError
        If the string contains a '.' but fewer than three comma-separated fields.
    """
    if '.' not in string:
        return string
    fields = string.split(',')
    fields[2] = fields[2].partition('.')[0]
    return ','.join(fields)

In [46]:
# Remove fractional seconds from the time part of every 'datetime' string.
df_train_clean['datetime'] = df_train_clean['datetime'].apply(clean_time)

In [47]:
# Persist the cleaned training data (without the index column).
df_train_clean.to_csv('data_clean.csv', index=False)

### Load trained models to predict

In [48]:
import torch, torch.nn as nn, numpy as np, pandas as pd
from tqdm import tqdm

In [49]:
# Load the evaluation skeleton with the default "," separator; as for the
# training file, the cells below split each ';'-delimited record manually.
df_test = pd.read_csv("bbdc_2023_AWI_data_evaluate_skeleton_professional.csv")

In [50]:
def handle_time(df):
    """Parse the "Uhrzeit" column, replacing unparseable entries by the mean time.

    Every entry is probed with ``pd.to_datetime``. Entries that raise
    ``ValueError`` are set to "NA" in ``df`` itself, the frame passed in (here
    the test frame), never in the training frame. The mean of the parseable
    entries then replaces every "NA", and the column is converted to datetimes.

    Invalid rows are tracked by position, so the function does not depend on
    ``df`` having a default 0..n-1 index.

    Parameters
    ----------
    df : DataFrame with a "Uhrzeit" column of time strings; modified in place.

    Returns
    -------
    The same ``df`` object with "Uhrzeit" converted via ``pd.to_datetime``.
    """
    valid_times = []
    unparseable = np.zeros(len(df), dtype=bool)
    for position, raw_time in enumerate(tqdm(df['Uhrzeit'], total=len(df))):
        try:
            pd.to_datetime(raw_time)
        except ValueError:
            unparseable[position] = True
        else:
            valid_times.append(raw_time)
    # Mark the bad entries on the frame that was passed in (boolean mask = positional).
    df.loc[unparseable, 'Uhrzeit'] = "NA"
    mean_time = pd.to_datetime(valid_times).mean()
    df["Uhrzeit"] = df["Uhrzeit"].replace("NA", mean_time)
    df["Uhrzeit"] = pd.to_datetime(df["Uhrzeit"], yearfirst=True, dayfirst=True)
    return df

In [51]:
# Column names of the test skeleton (rebinds `clean_cols` from the preprocessing section).
clean_cols = df_test.columns[0].split(';')

In [52]:
# Split the first field of every raw test row on ';' to get one list of strings per record.
test_values = [record[0].split(';') for record in df_test.values]

In [53]:
#test_values

In [54]:
# Build the test frame from the split rows. The first split row (test_values[0])
# is skipped -- presumably a units/sub-header line; TODO confirm.
df_test_clean = pd.DataFrame(test_values[1:], columns=clean_cols)

In [55]:
# Parse the "Datum" strings into datetimes.
# NOTE(review): yearfirst=True and dayfirst=True are both set; per the pandas
# documentation yearfirst wins when both are True -- confirm against the file's date format.
df_test_clean["Datum"] = pd.to_datetime(df_test_clean["Datum"], yearfirst=True, dayfirst=True)

In [56]:
# Normalise the "Uhrzeit" column of the test frame.
df_test_clean = handle_time(df_test_clean)

100%|██████████| 1365/1365 [00:00<00:00, 5268.15it/s]
  mean_time = pd.to_datetime(values).mean()
  df["Uhrzeit"] = pd.to_datetime(df["Uhrzeit"], yearfirst=True, dayfirst=True)


In [57]:
def convert_join_date_time(date, time):
    """Combine a date Series and a time Series into comma-separated strings.

    For each row, the first whitespace-separated token of ``str(date)`` (the
    date part) and the second token of ``str(time)`` (the time part) are joined
    as ``"<time>,<date>"``. Every ':' and '-' is then replaced by ','.
    Timestamps ``2020-01-02`` and ``... 10:30:00`` give ``"10,30,00,2020,01,02"``.

    Parameters
    ----------
    date : Series whose elements stringify as "<date> <time>".
    time : Series whose elements stringify as "<date> <time>".

    Returns
    -------
    Series of strings aligned with the inputs.
    """
    separators_to_commas = str.maketrans({':': ',', '-': ','})
    date_part = date.apply(lambda stamp: str(stamp).split()[0])
    time_part = time.apply(lambda stamp: str(stamp).split()[1])
    joined = time_part + ',' + date_part
    return joined.apply(lambda text: text.translate(separators_to_commas))

In [58]:
def clean_time(string):
    """Cut the third comma-separated field of ``string`` at its first '.'.

    Strings without any '.' are returned unchanged. Otherwise the string is
    split on ',', everything from the first '.' onwards is removed from the
    third field (index 2), and the fields are re-joined with ','.
    For the "hh,mm,ss[.ffffff],yyyy,mm,dd" strings built in this notebook this
    removes fractional seconds.

    Raises
    ------
    IndexError
        If the string contains a '.' but fewer than three comma-separated fields.
    """
    if '.' not in string:
        return string
    fields = string.split(',')
    fields[2] = fields[2].partition('.')[0]
    return ','.join(fields)

In [59]:
# Per-row "time,date" string with ':' and '-' turned into ',', stored as a new (last) column.
df_test_clean['datetime'] = convert_join_date_time(df_test_clean['Datum'], df_test_clean['Uhrzeit'])

In [60]:
# Column order with the last column ('datetime') moved to the front.
# Unlike in the preprocessing section, 'Datum' and 'Uhrzeit' have not been dropped
# yet, so `cols` still contains them; comparative_plot later indexes cols[3:12].
cols = df_test_clean.columns
cols = [cols[-1]]+list(cols[0:-1])

In [61]:
# Apply the new column order.
df_test_clean = df_test_clean[cols]

In [62]:
# The date and time are now carried by the 'datetime' column; drop the originals.
# NOTE: because of inplace=True, re-running this cell raises KeyError (columns already gone).
df_test_clean.drop(['Datum', 'Uhrzeit'], axis=1, inplace=True)

In [63]:
# Remove fractional seconds from the time part of every 'datetime' string.
df_test_clean['datetime'] = df_test_clean['datetime'].apply(clean_time)

In [64]:
from utils import TestDataLoader
from torch.utils.data import DataLoader
from models import *

In [65]:
# Extra per-row test inputs handed to TestDataLoader below; the file's contents
# are not visible in this notebook.
data_test_additional = pd.read_csv('data_test_additional.csv')

In [66]:
# Load the date-embedding model on CPU in eval mode and wrap the test data.
# NOTE(review): torch.load unpickles the checkpoint file -- only load files from a trusted source.
embedding_model = torch.load('Date2Vec/models/d2v_cos_14.054091384440834.pth', map_location='cpu').eval()
test_dataset = TestDataLoader(df_test_clean, data_test_additional, embedding_model)
# shuffle=False keeps the batches in row order; the submission cell indexes predictions by row.
test_dataloader = DataLoader(test_dataset, batch_size=256, num_workers=8, shuffle=False)

In [67]:
# Dimensions passed to the model constructors in get_models; presumably they must
# match the values used when the checkpoints were trained -- TODO confirm.
input_dim = 68
proj_dim = 64
# NOTE(review): get_models uses its own loop variable `num_heads`; this global is
# not read anywhere in the cells shown here.
num_heads = 2

In [68]:
def get_models(model_names, all_models=False):
    """Load the trained checkpoints for the requested architectures onto the GPU.

    For every name in ``model_names`` and every fold 0..4:

    * 'SetTransformer': one model per ``num_inds`` in (8, 16, 32) with 2 heads;
      if ``all_models`` is true, additionally one model per ``num_inds`` in
      (4, 8, 16, 32) and ``num_heads`` in (2, 4).
    * 'GRU', 'LSTM', 'MLP': one model each.
    * any other name is ignored.

    Model classes and ``input_dim`` / ``proj_dim`` come from the notebook
    namespace. Weights are read from ``trained_models/`` with
    ``map_location='cuda'``, so a CUDA device is required.

    Returns
    -------
    list of models, each in eval mode and moved to CUDA, in loading order.
    """
    def _restore(network, checkpoint_path):
        # Load the weights, switch to inference mode and move the model to the GPU.
        network.load_state_dict(torch.load(checkpoint_path, map_location='cuda'))
        network.eval()
        return network.cuda()

    ensemble = []
    for name in model_names:
        for fold in range(5):
            if name == 'SetTransformer':
                for num_inds in (4, 8, 16, 32):
                    # Base checkpoints exist for every size except num_inds == 4.
                    if num_inds >= 8:
                        ensemble.append(_restore(
                            SetTransformer(input_dim, proj_dim, num_inds, 2, 1),
                            f'trained_models/trained_SetTransformer{num_inds}_fold{fold}.pt'))
                    if not all_models:
                        continue
                    for num_heads in (2, 4):
                        ensemble.append(_restore(
                            SetTransformer(input_dim, proj_dim, num_inds, num_heads, 1),
                            f'trained_models/trained_SetTransformer_num_inds{num_inds}_num_heads{num_heads}_fold{fold}.pt'))
            elif name == 'GRU':
                ensemble.append(_restore(
                    GRU(input_dim, proj_dim, 2),
                    f'trained_models/trained_GRU_fold{fold}.pt'))
            elif name == 'LSTM':
                ensemble.append(_restore(
                    LSTM(input_dim, proj_dim, 2),
                    f'trained_models/trained_LSTM_fold{fold}.pt'))
            elif name == 'MLP':
                ensemble.append(_restore(
                    MLP(input_dim, proj_dim),
                    f'trained_models/trained_MLP_fold{fold}.pt'))
    return ensemble

def get_prediction(models, x):
    """Return the unweighted average of all models' outputs for input ``x``.

    Every model is called as ``model(x)`` under ``torch.no_grad()``. The outputs
    are summed in order and divided by ``len(models)``.

    Parameters
    ----------
    models : sized iterable of callables sharing the same output shape.
    x : input passed unchanged to every model.

    Returns
    -------
    The averaged output.

    Raises
    ------
    ValueError
        If ``models`` is empty (there is nothing to average).
    """
    if len(models) == 0:
        raise ValueError("get_prediction needs at least one model to average")
    total = None
    with torch.no_grad():
        for member in models:
            output = member(x)
            total = output if total is None else total+output
    return total/len(models)

In [69]:
# Load the full ensemble (all architectures, all SetTransformer variants).
Models1 = get_models(['SetTransformer', 'GRU', 'LSTM', 'MLP'], all_models=True)
Models = Models1# + Models2
# Run the ensemble batch by batch on the GPU and collect the averaged outputs on the CPU.
All_predictions = []
for x in test_dataloader:
    x = x.cuda()
    preds = get_prediction(Models, x)
    All_predictions.append(preds.detach().cpu().numpy())
# Stack the per-batch arrays into a single array along the first axis.
All_predictions = np.concatenate(All_predictions)

### Best Predictions (submission 11)

In [70]:
# Re-read the untouched skeleton (single ';'-joined column) to use as the submission template.
eval_df = pd.read_csv("bbdc_2023_AWI_data_evaluate_skeleton_professional.csv")

In [71]:
# Write the predictions into the skeleton: row 0 is left as is, and for every
# later row i the first two ';'-separated fields are kept and followed by the
# values of All_predictions[i-1].
n = len(eval_df)
unique_col = eval_df.columns[0]
for i in range(1, n):
    row = eval_df[unique_col].iloc[i]
    kept_prefix = ';'.join(row.split(';')[:2])
    predicted_fields = ';'.join([str(value) for value in All_predictions[i-1]])
    eval_df.loc[i, unique_col] = kept_prefix+';'+predicted_fields

In [72]:
# Write the filled-in skeleton as the submission file (without the index column).
eval_df.to_csv('prediction.csv', index=False)

In [73]:
def comparative_plot(preds, prev=9):
    """Plot the current predictions against an earlier submission, one panel per target.

    The earlier submission is read from ``submissions/prediction{prev}.csv``.
    Every record there is a single ';'-joined string; its first two fields and
    the first record are dropped, and the rest is parsed as floats. A 3x3 grid
    then shows, for each of the first nine columns, ``preds`` and the earlier
    values. Legend labels are taken from the notebook-level ``cols[3:12]``.

    Parameters
    ----------
    preds : 2-D array with at least nine columns.
    prev : number of the earlier submission to compare with.
    """
    import pandas as pd, numpy as np
    previous = pd.read_csv(f"submissions/prediction{prev}.csv")
    previous_values = np.array(
        [record[0].split(';')[2:] for record in previous.values][1:], dtype=float)
    import matplotlib.pyplot as plt
    fig, axs = plt.subplots(3, 3, figsize=(15,10), constrained_layout=True)
    for col_num, ax in zip(range(3, 12), axs.flat):
        target_idx = col_num-3
        ax.plot(preds[:, target_idx])
        ax.plot(previous_values[:, target_idx])
        ax.legend([cols[col_num], cols[col_num]+f"_prediction{prev}"])
    plt.show()

In [75]:
#comparative_plot(All_predictions, 11)