In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re

In [2]:
data_dir = '../metdata'

In [3]:
files = os.listdir(data_dir)
files = [os.path.join(data_dir,x) for x in files]

In [4]:
pattern = r"([-]?\d+[\.]?\d+)N_([-]?\d+[\.]?\d+)E\.csv"
parsed_latlon = []
for file in files:
    match = re.search(pattern, file)
    if match:
        latitude = match.group(1)
        longitude = match.group(2)
    parsed_latlon.append((float(latitude), float(longitude)))
lat, lon = zip(*parsed_latlon)

In [5]:
kelvin_to_cels = -273.15

def readfile(path, index):
    df = pd.read_csv(path)
    headings = ["year", "month", "day", "hour", "temperature(celsius)", "precipitation", "u-wind", "v-wind"]
    df.columns = headings
    df["temperature(celsius)"] = df["temperature(celsius)"] + kelvin_to_cels
    df["wind-speed"] = np.sqrt(df["u-wind"]**2 + df["v-wind"]**2)
    df["longitude"] = lon[index]
    df["latitude"] = lat[index]
    return df

In [57]:
def join_datasets(files):
    dfs = []
    for i, file in enumerate(files):
        df = readfile(file, i)
        dfs.append(df)
    dfs = pd.concat(dfs, axis = 0)
    dfs.reset_index(drop=True, inplace=True)
    return dfs

dfs = join_datasets(files)
dfs

Unnamed: 0,year,month,day,hour,temperature(celsius),precipitation,u-wind,v-wind,wind-speed,longitude,latitude
0,1980,1,1,1,-2.00,0.000000e+00,1.7201,-1.64870,2.382636,286.01,40.75
1,1980,1,1,2,-3.08,0.000000e+00,1.7232,-1.38870,2.213121,286.01,40.75
2,1980,1,1,3,-3.60,0.000000e+00,1.7494,-1.32370,2.193760,286.01,40.75
3,1980,1,1,4,-3.40,0.000000e+00,1.7849,-1.55190,2.365219,286.01,40.75
4,1980,1,1,5,-4.11,0.000000e+00,1.7648,-1.87200,2.572723,286.01,40.75
...,...,...,...,...,...,...,...,...,...,...,...
4102543,2018,12,31,19,26.56,4.681900e-16,-9.2599,-0.11864,9.260660,283.20,18.00
4102544,2018,12,31,20,26.69,1.755900e-16,-9.2270,0.13446,9.227980,283.20,18.00
4102545,2018,12,31,21,26.86,1.154500e-10,-9.1698,0.47548,9.182119,283.20,18.00
4102546,2018,12,31,22,27.12,2.926000e-16,-9.3076,1.18610,9.382870,283.20,18.00


In [105]:
dfs.sort_values(by=["year","month","day","hour"], inplace=True)
dfs.reset_index(drop=True, inplace=True)


In [106]:
test_data = dfs[dfs["year"]==2018]
rest_data = dfs[dfs["year"]!=2018]

In [107]:
y_var = "temperature(celsius)"
y_test = test_data[y_var]
y = rest_data[y_var]

In [113]:
X = rest_data.drop(y_var, axis=1)
X_test = test_data[y_var]

0          -2.00
1           5.06
2          -1.54
3           8.30
4          17.69
           ...  
3997423   -26.95
3997424    23.34
3997425   -16.89
3997426    16.44
3997427    26.28
Name: temperature(celsius), Length: 3997428, dtype: float64

In [119]:
years = dfs["year"].unique()
np.random.seed(42)
val_indices = np.random.choice(years.shape[0], 2, replace=False)
val_years = years[val_indices]
train_data = dfs[~dfs["year"].isin(val_years)]
val_data = dfs[dfs["year"].isin(val_years)]


In [120]:
X_train = train_data.drop(y_var, axis = 1)
y_train = train_data[y_var]

X_val = val_data.drop(y_var, axis = 1)
y_val = val_data[y_var]


In [129]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings("ignore", category=ConvergenceWarning)

def mlp_train():
    mlp = MLPRegressor(early_stopping = True)
    mlp.fit(X_tr,y_tr)
    return mlp

In [None]:

# Suppress only the ConvergenceWarnings specifically for sklearn
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def validation_mlp(hls, epochs, batch_size, act_func, momentum, solver, decay):
    mlp = MLPRegressor(hidden_layer_sizes=hls, solver = solver, batch_size = batch_size, activation = act_func,  max_iter=1, warm_start=True)
    # Train the model
    losses = []
    for epoch in range(epochs):
        mlp.partial_fit(X_train, y_train)
        y_pred = mlp.predict(X_val)
        loss = mean_absolute_error(y_val, y_pred)
        print(f"epoch: {epoch}, loss: {loss}")
        losses.append(loss)
    return mlp, losses

mlp, loss_curve = validation_mlp((10,5), 100, 'auto', 'relu', 0.9, 'adam', 0.9)


epoch: 0, loss: 9.433801631078312
epoch: 1, loss: 9.43556453833774
epoch: 2, loss: 9.434318888814367
epoch: 3, loss: 9.422473285170911
epoch: 4, loss: 9.429480592985755
epoch: 5, loss: 9.431886051099555
epoch: 6, loss: 9.436911364210365
epoch: 7, loss: 9.427311048519377
epoch: 8, loss: 9.428131552754351
epoch: 9, loss: 9.43119295613593
epoch: 10, loss: 9.437893448419839
epoch: 11, loss: 9.435369921391901
epoch: 12, loss: 9.429507114648146
epoch: 13, loss: 9.438005150941425
epoch: 14, loss: 9.431803237181864
epoch: 15, loss: 9.436857833921438
epoch: 16, loss: 9.436202544567804
epoch: 17, loss: 9.431664304333895
epoch: 18, loss: 9.43275233577247


In [None]:
plt.plot(loss_curve)
plt.xlabel('Epochs')
plt.ylabel('Mean Squared Error')
plt.title('Training Loss Curve')
plt.grid(True)
plt.show()