In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re

In [2]:
data_dir = '../metdata'

In [3]:
files = os.listdir(data_dir)
files = [os.path.join(data_dir,x) for x in files]

In [4]:
pattern = r"([-]?\d+[\.]?\d+)N_([-]?\d+[\.]?\d+)E\.csv"
parsed_latlon = []
for file in files:
    match = re.search(pattern, file)
    if match:
        latitude = match.group(1)
        longitude = match.group(2)
    parsed_latlon.append((float(latitude), float(longitude)))
lat, lon = zip(*parsed_latlon)

In [5]:
kelvin_to_cels = -273.15

def readfile(path, index):
    df = pd.read_csv(path)
    headings = ["year", "month", "day", "hour", "temperature(celsius)", "precipitation", "u-wind", "v-wind"]
    df.columns = headings
    df["temperature(celsius)"] = df["temperature(celsius)"] + kelvin_to_cels
    df["wind-speed"] = np.sqrt(df["u-wind"]**2 + df["v-wind"]**2)
    df["longitude"] = lon[index]
    df["latitude"] = lat[index]
    return df

In [6]:
def join_datasets(files):
    dfs = []
    for i, file in enumerate(files):
        df = readfile(file, i)
        dfs.append(df)
    dfs = pd.concat(dfs, axis = 0)
    dfs.reset_index(drop=True, inplace=True)
    return dfs

dfs = join_datasets(files)
dfs

Unnamed: 0,year,month,day,hour,temperature(celsius),precipitation,u-wind,v-wind,wind-speed,longitude,latitude
0,1980,1,1,1,-2.00,0.000000e+00,1.7201,-1.64870,2.382636,286.01,40.75
1,1980,1,1,2,-3.08,0.000000e+00,1.7232,-1.38870,2.213121,286.01,40.75
2,1980,1,1,3,-3.60,0.000000e+00,1.7494,-1.32370,2.193760,286.01,40.75
3,1980,1,1,4,-3.40,0.000000e+00,1.7849,-1.55190,2.365219,286.01,40.75
4,1980,1,1,5,-4.11,0.000000e+00,1.7648,-1.87200,2.572723,286.01,40.75
...,...,...,...,...,...,...,...,...,...,...,...
4102543,2018,12,31,19,26.56,4.681900e-16,-9.2599,-0.11864,9.260660,283.20,18.00
4102544,2018,12,31,20,26.69,1.755900e-16,-9.2270,0.13446,9.227980,283.20,18.00
4102545,2018,12,31,21,26.86,1.154500e-10,-9.1698,0.47548,9.182119,283.20,18.00
4102546,2018,12,31,22,27.12,2.926000e-16,-9.3076,1.18610,9.382870,283.20,18.00


In [7]:
y_var = "temperature(celsius)"
y = dfs[y_var]
y

0          -2.00
1          -3.08
2          -3.60
3          -3.40
4          -4.11
           ...  
4102543    26.56
4102544    26.69
4102545    26.86
4102546    27.12
4102547    27.24
Name: temperature(celsius), Length: 4102548, dtype: float64

In [8]:
X = dfs.drop(y_var, axis=1)
X

Unnamed: 0,year,month,day,hour,precipitation,u-wind,v-wind,wind-speed,longitude,latitude
0,1980,1,1,1,0.000000e+00,1.7201,-1.64870,2.382636,286.01,40.75
1,1980,1,1,2,0.000000e+00,1.7232,-1.38870,2.213121,286.01,40.75
2,1980,1,1,3,0.000000e+00,1.7494,-1.32370,2.193760,286.01,40.75
3,1980,1,1,4,0.000000e+00,1.7849,-1.55190,2.365219,286.01,40.75
4,1980,1,1,5,0.000000e+00,1.7648,-1.87200,2.572723,286.01,40.75
...,...,...,...,...,...,...,...,...,...,...
4102543,2018,12,31,19,4.681900e-16,-9.2599,-0.11864,9.260660,283.20,18.00
4102544,2018,12,31,20,1.755900e-16,-9.2270,0.13446,9.227980,283.20,18.00
4102545,2018,12,31,21,1.154500e-10,-9.1698,0.47548,9.182119,283.20,18.00
4102546,2018,12,31,22,2.926000e-16,-9.3076,1.18610,9.382870,283.20,18.00


In [15]:
from sklearn.model_selection import train_test_split

np.random.seed(42)

X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.1, random_state=42)

In [18]:
# X_tr = np.concatenate(X_tr, axis=0)  # combine features into one matrix -- use this as input features for training a classifier
# y_tr = np.concatenate(y_tr)  # combine target labels into one list -- use this as target labels for training a classifier
# X_test = np.concatenate(X_test, axis=0)  # combine features into one matrix -- use this as input features for testing a classifier
# y_test = np.concatenate(y_test)  # combine target labels into one list -- use this as target labels for evaluating a classifier

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss
import warnings
from sklearn.exceptions import ConvergenceWarning

X_tr_np, y_tr_np, X_test_np, y_test_np, X_val_np, y_val_np = np.array(X_tr), np.array(y_tr), np.array(X_test), np.array(y_test), np.array(X_val), np.array(y_val)


In [21]:
def mlp_train():
    mlp = MLPClassifier(early_stopping = True)
    mlp.fit(X_tr,y_tr)
    return mlp


In [None]:

# Suppress only the ConvergenceWarnings specifically for sklearn
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def validation_mlp(hls, epochs, batch_size, act_func, momentum, solver, decay):
    mlp = MLPClassifier(hidden_layer_sizes=hls, solver = solver, batch_size = batch_size, activation = act_func,  max_iter=1, warm_start=True)
    
    # Set the number of epochs
    
    train_losses = []
    val_losses = []
    train_scores = []
    val_scores = []
    train_stds = []
    val_stds =[]
    train_sizes = np.arange(1, epochs+ 1, 1)
    
    for i in range(epochs):
        mlp.fit(X_tr, y_tr)
        # val_indices = np.random.choice(X_tr.shape[0], size=int(np.round(0.1*(X_tr.shape[0]))), replace=False)
        # X_val, y_val = X_tr[val_indices], y_tr[val_indices]
        # train_pred = mlp.predict_proba(X_tr)
        val_pred = mlp.predict_proba(X_val)
    
        # train_loss = log_loss(y_tr, train_pred)
        val_loss = log_loss(y_val, val_pred)
    
        # train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        # train_mean = np.mean(train_losses)
        val_mean = np.mean(val_losses)
        # train_std = np.std(train_losses)
        val_std = np.std(val_losses)
        
        # train_scores.append(train_mean)
        val_scores.append(val_mean)
        # train_stds.append(train_std)
        val_stds.append(val_std)

    
    # train_stds = np.array(train_stds)
    val_stds = np.array(val_stds)
    # train_scores = np.array(train_scores)
    val_scores = np.array(val_scores)

    train_scores = mlp.loss_curve_
    
    # plt.fill_between(train_sizes, train_scores - train_stds, train_scores + train_stds, alpha = 0.2)
    # plt.fill_between(train_sizes, val_scores - val_stds, val_scores + val_stds, alpha = 0.2)
    
    
   
    # plt.show()
    return mlp, train_scores, val_losses, train_sizes

mlp, train_scores, val_losses, train_sizes= validation_mlp((100,), 250, 'auto', 'relu', 0.9, 'adam', 0.9)
plt.plot(train_sizes, train_scores, label="Training Loss")
plt.plot(train_sizes, val_losses, label="Validation Loss")

# plt.plot(train_sizes, train_losses, label='Training Loss')
# plt.plot(train_sizes, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Validation and Training Learning Curve')
# plt.title(('Learning Curve: HLS:', hls, "epochs: ", epochs, "solver: ", solver, "batch size: ", batch_size, "activation: ", act_func))
plt.legend()
plt.savefig('./learning_curve2.png')
plt.ylim(0.25, 1.5)
plt.show()