In [1]:
import os, math
import numpy as np
import argparse
import pandas as pd
from tensorflow.keras.layers import Dropout, Dense, SimpleRNN, BatchNormalization
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import tensorflow as tf
import sys
import autokeras as ak
from keras.models import Sequential
from keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
%matplotlib inline


plt.rcParams['axes.unicode_minus'] = False    

In [2]:
def get_middle_number(filename):
    return int(filename.split('_')[0][7:])

In [3]:
directory = 'C:/Users/lenovo/Documents/Project/Gears/GuoZhi files/New model/biweekly_date_data/'
sh_w_files = [filename for filename in os.listdir(directory) if filename.endswith('.csv') and filename[-5] == '1']
de_w_files = [filename for filename in os.listdir(directory) if filename.endswith('.csv') and filename[-5] == '2']

sh_w_files.sort(key=get_middle_number)
de_w_files.sort(key=get_middle_number)

In [4]:
#fig = plt.figure(figsize=(10,320),dpi=55)

# Comparsion of all models for each stations

In [5]:
def get_middle_number(filename):
    return int(filename.split('_')[0][7:])

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    df = pd.DataFrame(data.iloc[:, 3:], dtype=float)
    df = df.iloc[:, :]

    X = df.values
    Y = df.iloc[:, -1].values

    sc = MinMaxScaler(feature_range=(0, 1))
    X = sc.fit_transform(X[:, 0:-1])

    x_train = []
    y_train = []

    x_test = []
    y_test = []

    for k in range(14, int(df.shape[0] * 0.75)):
        x_train.append(X[k - 14:k, :])
        y_train.append(Y[k])

    for p in range(14, df.shape[0] - int(df.shape[0] * 0.75)):
        x_test.append(X[p - 14:p, :])
        y_test.append(Y[p])

    x_train, y_train = np.array(x_train), np.array(y_train)
    x_test, y_test = np.array(x_test), np.array(y_test)

    return x_train, y_train, x_test, y_test, data

def rnn_model(x_train, y_train, x_test, y_test):
    x_train = np.reshape(x_train, (x_train.shape[0], 14, int(x_train.size/(x_train.shape[0]*14))))
    x_test = np.reshape(x_test, (x_test.shape[0], 14, int(x_test.size/(x_test.shape[0]*14))))

    model = tf.keras.Sequential([
        SimpleRNN(100, return_sequences=True),
        Dropout(0.1),
        SimpleRNN(100),
        Dense(1)
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='mean_squared_error')

    history = model.fit(x_train, y_train,
                        batch_size=32,
                        epochs=200,
                        validation_data=(x_test, y_test),
                        validation_freq=1,
                        validation_split=0.25)

    predicted = model.predict(x_test)

    return predicted

def gradient_boosting_model(x_train, y_train, x_test, y_test):
    x_train = np.reshape(x_train, (x_train.shape[0], -1))
    x_test = np.reshape(x_test, (x_test.shape[0], -1))

    gb_model = GradientBoostingRegressor(n_estimators=100, random_state=32, validation_fraction=0.25)
    gb_model.fit(x_train, y_train)

    predicted = gb_model.predict(x_test)

    return predicted

def mlp_model(x_train, y_train, x_test, y_test):
    x_train = np.reshape(x_train, (x_train.shape[0], -1))
    x_test = np.reshape(x_test, (x_test.shape[0], -1))

    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    model = Sequential([
        Dense(128, activation='relu', input_dim=x_train_scaled.shape[1]),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    history = model.fit(x_train_scaled, y_train, epochs=300, batch_size=64, validation_split=0.25, callbacks=[early_stopping])

    predicted = model.predict(x_test_scaled)

    return predicted

def autokeras_model(x_train, y_train, x_test, y_test):
    x_train = np.reshape(x_train, (x_train.shape[0], -1))
    x_test = np.reshape(x_test, (x_test.shape[0], -1))

    reg = ak.StructuredDataRegressor(overwrite=True, max_trials=10)
    reg.fit(x_train, y_train, epochs=300, validation_split=0.25)

    predicted = reg.predict(x_test)

    return predicted

def lstm_model(x_train, y_train, x_test, y_test):
    model = Sequential([
        LSTM(100, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])),
        Dropout(0.2),
        LSTM(100),
        Dense(1)
    ])

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Train the model
    model.fit(x_train, y_train, epochs=200, batch_size=32, validation_split=0.25, verbose=0)

    # Predict using the LSTM model
    predicted = model.predict(x_test)

    return predicted

    
def plot_comparison(time_values, y_test, model_predictions, model_names, station_name, show_legend=True):
    plt.figure(figsize=(10, 6))

    plt.plot(time_values, y_test, color='black', label='Observation')

    for predictions, model_name in zip(model_predictions, model_names):
        predictions = predictions[:len(time_values)]
        plt.plot(time_values, predictions, label=model_name)

    plt.xticks(time_values[::30],fontsize=14,fontweight='bold' )
    plt.axhline(2, color="black", linestyle=':')
    plt.title(f'{station_name}',fontsize=16, fontweight='bold' )
    #plt.xlabel(fontsize=15, fontweight='bold')
    plt.ylabel('DO (mg/L)', fontsize=14, fontweight='bold')
    
    if show_legend:
        plt.legend(loc='upper right')
    
    plt.tight_layout()
    plt.show()



In [6]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument(
        "--directory",
        type=str,
        default="C:/Users/lenovo/Documents/Project/Gears/GuoZhi files/New model/biweekly_date_data/",
        help="Directory containing the CSV files"
    )

    args = parser.parse_args()
    main(args.directory)

usage: ipykernel_launcher.py [-h] [--directory DIRECTORY]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\lenovo\AppData\Roaming\jupyter\runtime\kernel-01501e29-8a43-4a33-9e5e-76a38dc8b068.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
