In [36]:
import pandas as pd

# Number of `day_<i>` readings expected per row. 214 is the number of days from
# 2017-03-01 up to (presumably not including) 2017-10-01 -- TODO confirm that the
# end date is exclusive in the upstream export.
MEASUREMENT_DAYS = 214
# Despite the name this is a fraction in [0, 1], not a percentage: the cleaning
# cell compares it against `missing / MEASUREMENT_DAYS` and drops rows at or
# above the threshold.
ACCEPTABLE_MISSING_PERCENTAGE = 0.75

# Date range and weather variable; the cleaning cell interpolates these into the
# input and output CSV paths.
start_date = f'{2017}-03-01'
end_date = f'{2017}-10-01'
weather_type = 'precipitation'

In [109]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import math
import numpy as np


def interpolate(row, degrees=range(2, 20)):
    """Fill the NaN entries of ``row`` using a polynomial fitted to its known values.

    The positions ``0 .. len(row) - 1`` are used as the single input feature.
    For every degree in ``degrees`` a ``PolynomialFeatures`` + ``LinearRegression``
    model is fitted to the non-missing entries, and the model with the highest
    R^2 score is used to predict the missing ones (the first degree wins ties).

    NOTE(review): the R^2 score is computed on the very points the model was
    fitted to, so it does not penalise over-fitting and will tend to favour the
    highest degrees. Consider scoring on held-out points instead.

    Parameters
    ----------
    row : array-like of numbers
        One-dimensional sequence of measurements; missing entries are NaN.
    degrees : iterable of int, optional
        Polynomial degrees to try. Defaults to ``range(2, 20)``.

    Returns
    -------
    pandas.Series
        Float series with a default integer index and the same length as
        ``row``. Known values are kept as they were; NaNs are replaced by the
        model predictions.

    Raises
    ------
    ValueError
        If ``row`` contains no observed values, or ``degrees`` is empty.
    """
    values = np.asarray(row, dtype=float)
    missing_mask = np.isnan(values)

    # Nothing to fill: skip the model search entirely.
    if not missing_mask.any():
        return pd.Series(values)
    # Fail early with a clear message instead of deep inside scikit-learn.
    if missing_mask.all():
        raise ValueError("Cannot interpolate a row that contains no observed values")

    # Column vector of positions, as scikit-learn expects (n_samples, n_features).
    positions = np.arange(len(values), dtype=float).reshape(-1, 1)
    known_positions = positions[~missing_mask]
    known_values = values[~missing_mask]

    best_score = None
    best_features = None
    best_model = None
    for degree in degrees:
        features = PolynomialFeatures(degree=degree)
        design_matrix = features.fit_transform(known_positions)
        model = LinearRegression()
        model.fit(design_matrix, known_values)
        score = r2_score(known_values, model.predict(design_matrix))
        # Strict '>' keeps the first (lowest) degree among equal scores.
        if best_model is None or score > best_score:
            best_score, best_features, best_model = score, features, model

    if best_model is None:
        raise ValueError("`degrees` must contain at least one polynomial degree")

    # Re-use the already fitted winner; `transform` (not `fit_transform`) is the
    # right call when only predicting.
    filled = values.copy()
    filled[missing_mask] = best_model.predict(
        best_features.transform(positions[missing_mask])
    )
    return pd.Series(filled)



In [111]:

# Load the processed readings for the configured weather type and date range.
processes_dataset = pd.read_csv(
    f'../../kornmo-data-files/raw-data/weather-data/processed/{weather_type}/{weather_type}_processed_{start_date}_to_{end_date}.csv'
)

print(len(processes_dataset))

# Labels of rows to discard. They are dropped in one go after the loop so the
# frame is never shrunk while `iterrows()` is walking over it.
rows_to_drop = []

for index, row in processes_dataset.iterrows():
    # Keep only the per-day measurement columns.
    row = row.filter(regex="day_.*")
    missing = row.isna().sum()
    missing_fraction = missing / MEASUREMENT_DAYS

    if (row == 0).sum() + missing == MEASUREMENT_DAYS:
        # Every day is either zero or missing, so the row carries no signal.
        rows_to_drop.append(index)

    elif missing_fraction >= ACCEPTABLE_MISSING_PERCENTAGE:
        # `:.2%` renders the fraction as a real percentage (0.8084 -> 80.84%).
        print(f"Dropping row {index} which is missing {missing_fraction:.2%} of the values")
        rows_to_drop.append(index)

    elif missing != 0:
        # `interpolate` works on positions 0..n-1, so drop the column labels first.
        interpolated_row = interpolate(row.reset_index(drop=True))

        for i in range(MEASUREMENT_DAYS):
            processes_dataset.loc[index, f'day_{i}'] = interpolated_row[i]

processes_dataset.drop(rows_to_drop, inplace=True)

# Only write the cleaned file if no NaN is left anywhere in the frame.
remaining_nans = processes_dataset.isna().sum().sum()
if remaining_nans != 0:
    print(len(processes_dataset))
    print(f"There are {remaining_nans} NaN values in the dataset")
    print(processes_dataset.isna().sum())
else:
    processes_dataset.reset_index(drop=True, inplace=True)
    # NOTE(review): `to_csv` is called without `index=False`, so the new
    # 0..n-1 index is written as an unnamed first column -- confirm that
    # downstream readers expect it.
    processes_dataset.to_csv(
        f'../../kornmo-data-files/raw-data/weather-data/cleaned/{weather_type}/{weather_type}_processed_{start_date}_to_{end_date}.csv'
    )
    print("Dataset valid and cleared for interpolation")

608
Dropping row 39 which is missing 0.8084% of the values
Dropping row 129 which is missing 0.9626% of the values
Dropping row 207 which is missing 0.9907% of the values
Dropping row 208 which is missing 0.8598% of the values
Dropping row 237 which is missing 0.8598% of the values
Dropping row 238 which is missing 0.9673% of the values
Dropping row 286 which is missing 0.8551% of the values
Dropping row 374 which is missing 0.8598% of the values
Dropping row 416 which is missing 0.9626% of the values
Dropping row 438 which is missing 0.8598% of the values
Dropping row 461 which is missing 0.8879% of the values
Dropping row 478 which is missing 0.9159% of the values
Dataset valid and cleared for interpolation
