In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/Train.csv")
#df.head()

In [None]:
# Print a summary of the freshly loaded frame (columns, dtypes, non-null counts).
df.info()

# basic cleanup and conversion

In [None]:
# convert features from strings to lists of values
def replace_nan(x):
    """Convert a single token of a comma-separated series to a float.

    Args:
        x (str): one token, e.g. "21.5" or the blank placeholder " ".

    Returns:
        float: the parsed number, or np.nan when the token is empty or
        consists only of whitespace.

    Raises:
        ValueError: if the token is non-blank but not a valid float literal.
    """
    # Previously only the exact string " " was treated as missing, so "" or
    # "  " (e.g. from a trailing comma) crashed inside float().
    if isinstance(x, str) and not x.strip():
        return np.nan
    return float(x)
# Weather features stored as one comma-separated string per recording period.
features = ["temp", "precip", "rel_humidity", "wind_dir", "wind_spd", "atmos_press"]
for feature in features:
    # "nan" entries are first rewritten to the blank token that replace_nan maps to NaN,
    # then every token of the string is converted to a float.
    df[feature] = df[feature].apply(
        lambda raw: list(map(replace_nan, raw.replace("nan", " ").split(",")))
    )

# drop rows with too many NaNs

In [None]:
# recording periods contain varying degrees of NaNs: compute percent NaN for each recording period and feature
# function to compute the percentage of NaNs per recording period
def compute_percent_nan(df, col_name):
    """Add a column holding the percentage of NaN entries per row of a list-valued column.

    Args:
        df (pd.DataFrame): frame whose column `col_name` holds one list of numbers per row.
        col_name (str): name of the list-valued column to inspect.

    Returns:
        pd.DataFrame: the same frame (modified in place) with the extra
        column 'percent_nan_<col_name>'.
    """
    def _nan_share(values):
        # number of NaN entries relative to the list length, in percent
        nan_count = np.isnan(np.array(values)).sum()
        return nan_count / len(values) * 100

    target_column = 'percent_nan_' + col_name
    df[target_column] = df[col_name].apply(_nan_share)
    return df

In [None]:
# calculate percentage of missings per recording period and feature and append to dataframe
# compute_percent_nan adds its column to df in place and returns that same frame,
# so `data` ends up as just another name for df.
for col_name in features:
    data=compute_percent_nan(df,col_name)

In [None]:
#df.head()

In [None]:
# filter observations based on percent NaN and check again the data distribution of the target and summary features
# Helper columns holding the percentage of missing values per feature.
nan_columns = ['percent_nan_temp', 'percent_nan_precip','percent_nan_rel_humidity', 'percent_nan_wind_dir','percent_nan_wind_spd', 
        'percent_nan_atmos_press']
# Keep only rows in which every feature has less than 30 % NaNs, discard the helper
# columns and renumber the rows; the old row labels stay available as an "index" column.
df = (
    df.loc[(df[nan_columns] < 30).all(axis=1)]
    .drop(columns=nan_columns)
    .reset_index()
)
df.head()

# convert the wind direction (degrees) into x and y components


In [None]:

# A direction in degrees is circular (0 and 360 are the same direction), so every
# measurement is replaced by its cosine (x) and sine (y) component.
df["wind_dir_x"] = df["wind_dir"].apply(
    lambda degrees: list(np.cos(np.array(degrees) * np.pi / 180))
)
df["wind_dir_y"] = df["wind_dir"].apply(
    lambda degrees: list(np.sin(np.array(degrees) * np.pi / 180))
)
df.drop(columns="wind_dir", inplace=True)

# From here on the two components stand in for the raw direction.
features = ["temp", "precip", "rel_humidity", "wind_dir_x", "wind_dir_y", "wind_spd", "atmos_press"]

df.head(3)

# get last measurement from each time series

In [None]:
# get last value for each time series
# did this before dropping all NaNs


def last(x):
    """Return the most recent usable value of a time series.

    The final entry is preferred. If it is missing, the second-to-last entry
    is used. If that is missing as well, the entry 24 positions before the
    final one (``x[-25]``, i.e. 24 h earlier for hourly data) is returned
    unchanged, even if it is NaN itself.

    Args:
        x (list): measurements of one recording period
            (presumably in chronological order - verify against the data source).

    Returns:
        The selected entry of ``x``, or NaN when ``x`` is too short to
        provide the required fallback entry.
    """
    missing = float("nan")
    n_values = len(x)
    # An empty or very short series used to raise IndexError; report "missing" instead.
    if n_values == 0:
        return missing
    if not pd.isna(x[-1]):
        return x[-1]
    if n_values >= 2 and not pd.isna(x[-2]):
        return x[-2]
    if n_values >= 25:
        return x[-25]
    return missing
    # Possible extension (not active): keep stepping back in 24-entry
    # increments until a non-missing value is found.

    
def get_last(x, col_name):
    """Add a 'last_<col_name>' column holding the result of last() for every row.

    Args:
        x (pd.DataFrame): frame whose column `col_name` holds one list per row.
        col_name (str): name of the list-valued column to reduce.

    Returns:
        pd.DataFrame: the same frame, modified in place.
    """
    target_column = "last_" + col_name
    x[target_column] = x[col_name].apply(last)
    return x

# Add one 'last_<feature>' column per entry of `features`.
for col_name in features:
    df=get_last(df,col_name)

In [None]:
# Show the first three rows of the frame.
df.head(3)

In [None]:
# Print the column summary (dtypes, non-null counts) of the frame.
df.info()

# estimate the time of day from the temperature curve

In [None]:
from scipy.optimize import curve_fit

def fit_time(time_series, return_fit_curve=False):
    """Estimate the time-of-day phase of an hourly time series.

    The curve ``y0 + amp * -cos(phi + x*pi/12)`` (a period of 24 samples) is
    fitted to the series, where x is the sample position 0, 1, 2, ...
    Missing values are dropped before fitting.

    Args:
        time_series (list): the hourly measurements; may contain NaN.
        return_fit_curve (bool, optional): whether to also return the x and y
            values used for fitting and the fitted y values. Defaults to False.

    Returns:
        The phase angle ``phi`` in the range 0 to 2 pi, expressed for a
        non-negative amplitude. If ``return_fit_curve`` is True, the list
        ``[phi, x, y, y_fit]`` is returned instead, where x and y are the
        non-missing samples (pandas Series) and y_fit is the fitted curve
        evaluated at x.

    Raises:
        RuntimeError: raised by scipy's curve_fit if the least-squares fit
            does not converge.
    """
    # sample positions serve as the x axis; a temporary frame makes it easy
    # to drop positions whose measurement is missing
    positions = np.arange(len(time_series))
    xy = pd.DataFrame({"x": positions, "y": time_series}).dropna()
    x = xy.x
    y = xy.y

    # model: offset plus an inverted cosine with a period of 24 samples
    def tod_func(x, y0, amp, phi):
        return y0 + amp * -np.cos(phi + x*np.pi/12)

    params = curve_fit(tod_func, x, y)
    y0_fit, amp_fit, phi_fit = params[0]

    # a negative amplitude is the same curve shifted by pi; move the sign into the phase
    if amp_fit < 0:
        amp_fit = abs(amp_fit)
        phi_fit += np.pi

    # wrap the angle; a modulo by a positive number is never negative, so no
    # further correction is needed
    phi_fit = phi_fit % (2*np.pi)

    if not return_fit_curve:
        return phi_fit
    # the fitted curve is only evaluated when the caller asks for it
    y_fit = tod_func(x, y0_fit, amp_fit, phi_fit)
    return [phi_fit, x, y, y_fit]


In [None]:
# Fit the temperature series stored under row label 0 and overlay the fit on the data.
phi_fit, x, y, y_fit = fit_time(df["temp"][0], return_fit_curve=True)
print("time of day angle:", phi_fit.round(2))
plt.plot(x, y)
plt.plot(x, y_fit)

In [None]:
# Visual sanity check: overlay the fitted curve on N temperature series,
# one panel per series (row labels 1000 ... 1000 + N - 1).
N = 5
fig, axs = plt.subplots(N)
for n in range(N):
    y = df.temp[n+1000]
    _, x, y, y_fit = fit_time(y, return_fit_curve=True)
    axs[n].plot(x, y)
    axs[n].plot(x, y_fit)
    # plt.xlim only acts on the current axes (the last panel created by
    # plt.subplots), so the limits are set on each panel explicitly.
    axs[n].set_xlim(0, 125)


In [None]:
# Estimate the time-of-day phase angle of every recording period from its temperature series.
df["time_of_day_angle"] = df["temp"].apply(fit_time)
df.head(3)

In [None]:
# Target against the fitted time-of-day angle, coloured by location.
sns.scatterplot(data=df, x="time_of_day_angle", y="target", hue="location")

In [None]:
# Encode the cyclic angle by its cosine and sine, then discard the raw angle.
df["time_of_day_cos"] = df["time_of_day_angle"].apply(lambda angle: np.cos(angle))
df["time_of_day_sin"] = df["time_of_day_angle"].apply(lambda angle: np.sin(angle))
df.drop(columns="time_of_day_angle", inplace=True)
df.head(3)

# other aggregation stuff

In [None]:
# aggregation function extracting summary statistics from every recording period and appending it as a new column to a dataframe
def aggregate_features(x, col_name):
    """Append per-row summary statistics of a list-valued column.

    For every row the mean and the standard deviation of the list stored in
    `col_name` are written to the new columns 'mean_<col_name>' and
    'std_<col_name>'.

    Args:
        x (pd.DataFrame): frame whose column `col_name` holds one list of numbers per row.
        col_name (str): name of the list-valued column to summarise.

    Returns:
        pd.DataFrame: the same frame, modified in place.
    """
    # Currently disabled statistics: max, min, var, median, ptp.
    for prefix, reducer in (("mean_", np.mean), ("std_", np.std)):
        x[prefix + col_name] = x[col_name].apply(reducer)
    return x

# function returning only non-Null values (helper for aggregation function)
def remove_nan_values(x):
    """Return a new list with the entries of x that are not missing, in their original order."""
    kept = []
    for element in x:
        if pd.isna(element):
            continue
        kept.append(element)
    return kept


In [None]:
# remove NaNs from dataframe
for col_name in features:
    # keep only the non-missing measurements of every recording period
    df[col_name] = df[col_name].apply(remove_nan_values)

In [None]:
#extract summary statistics for each recording period and feature
# Add the 'mean_<feature>' and 'std_<feature>' columns for every entry of `features`.
for col_name in features:
    df=aggregate_features(df,col_name)

In [None]:
# The raw list-valued columns are no longer needed once they have been summarised.
df.drop(columns=features, inplace=True)

In [None]:
# Show the first three rows of the frame.
df.head(3)

In [None]:
# Remove, in place, every row that still contains a missing value in any column.
df.dropna(inplace=True)

In [None]:
# Print the column summary (dtypes, non-null counts) of the frame.
df.info()

# calculate a quick model
without the last column, to plot the residuals over it

In [None]:
# import libraries and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# List the available column names before selecting the model inputs.
df.columns

In [None]:
# get target and features
# Model inputs plus 'location' (used for the stratified split and for plotting).
# The raw wind direction was replaced by its x/y components earlier in the notebook,
# so 'last_wind_dir' and 'mean_wind_dir' do not exist and selecting them raised a
# KeyError; the component columns are used instead.
X = df[['last_temp', 'last_precip', 'last_rel_humidity', 'last_wind_dir_x', 'last_wind_dir_y',
       'last_wind_spd', 'last_atmos_press', 'location',
       'mean_temp', 'mean_precip', 'mean_rel_humidity', 'mean_wind_dir_x', 'mean_wind_dir_y',
       'mean_wind_spd', 'mean_atmos_press']]
y = df.target

print(X.shape)
print(y.shape)

In [None]:
# Split into 70 % training and 30 % test data. Stratifying by location keeps the
# locations balanced between the two sets; the fixed seed makes the split repeatable.
rseed = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=X['location'],
    random_state=rseed,
)

print(X_train.shape)
print(X_test.shape)

In [None]:
# Standardise the features (z-transformation). 'location' is dropped because it is
# not used for prediction.
scaler = StandardScaler()

# The scaling parameters are learned from the training data only ...
X_train_scaled = scaler.fit_transform(X_train.drop(columns='location'))
# ... and then applied unchanged to the test data.
X_test_scaled = scaler.transform(X_test.drop(columns='location'))

In [None]:
# Fit an ordinary least-squares regression on the scaled training data.
linreg = LinearRegression()

linreg.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the target for the held-out test set.
y_pred = linreg.predict(X=X_test_scaled)

# Display the first 10 predictions.
y_pred[:10]

In [None]:
# evaluate model accuracy

# The square root is taken explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_linreg = np.sqrt(mean_squared_error(y_test, y_pred))
r2_linreg = r2_score(y_test, y_pred)

print(f'RMSE on testset: {round(rmse_linreg,2)}')
print(f'Coefficient of determination on testset: {round(r2_linreg,2)}')

In [None]:
# Residuals: observed target minus model prediction on the test set.
residual = y_test - y_pred

In [None]:
# Residuals against the last measured temperature, coloured by location.
ax = sns.scatterplot(x=X_test['last_temp'], y=residual, hue=X_test.location)
ax.set_xlabel('last temperature')
ax.set_ylabel('residual')
ax.set_title('Residual plot');