In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os.path as op
import random
from data_prep import prepare_datasets, get_dfs

# Read Data

In [None]:
# Seed both the stdlib and NumPy global generators so reruns are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Reuse the cached CSV splits only when all three are on disk; otherwise
# derive them from the raw weather file through prepare_datasets.
if all(op.exists(op.join("data", name)) for name in ("train.csv", "val.csv", "test.csv")):
    train_df, val_df, test_df = (
        pd.read_csv(op.join("data", name)) for name in ("train.csv", "val.csv", "test.csv")
    )
else:
    train_df, val_df, test_df = prepare_datasets(op.join("data", "weather.csv"))

# Bundle the splits by name and hand them to get_dfs, whose result is indexed
# first by time offset and then by split name in the cells below.
df_dct = dict(train=train_df, val=val_df, test=test_df)
df_timed_dct = get_dfs(df_dct)


In [None]:
# Sanity check: show the (rows, columns) shape of every split per time offset.
for time_offset in df_timed_dct:
    offset_dfs = df_timed_dct[time_offset]
    print(time_offset)
    for split_name in ("train", "val", "test"):
        print(split_name, offset_dfs[split_name].shape)
    print()

## Weather

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_absolute_percentage_error


def get_predict(dfs, model, problem_name, target_col, drop_cols, metric_func):
    """Fit ``model`` on the train split, predict every split and report metrics.

    Parameters
    ----------
    dfs : mapping
        Must provide "train", "val" and "test" DataFrames.
    model : estimator-like
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's instance is modified.
    problem_name : str
        Heading printed before the metrics.
    target_col : str or list of str
        Column(s) holding the target. A single column name, or a one-element
        list of names, yields a 1-D target; longer lists are passed through
        as a DataFrame unchanged.
    drop_cols : list of str
        Columns removed from each split to build the feature matrix.
    metric_func : callable
        Called as ``metric_func(y_train, y_val, y_test, y_pred_train,
        y_pred_val, y_pred_test)``; expected to print its report.

    Returns
    -------
    dict
        ``{"train": (y_true, y_pred), "val": (...), "test": (...)}`` so later
        cells can inspect targets and predictions without refitting.
    """
    split_names = ("train", "val", "test")
    features = {}
    targets = {}
    for split in split_names:
        frame = dfs[split]
        features[split] = frame.drop(drop_cols, axis=1)
        y = frame[target_col]
        # Selecting with a one-element list gives an (n, 1) DataFrame. Collapse
        # it to a Series so the estimator receives a 1-D target instead of a
        # column vector.
        if getattr(y, "ndim", 1) == 2 and y.shape[1] == 1:
            y = y.iloc[:, 0]
        targets[split] = y

    model.fit(features["train"], targets["train"])
    predictions = {split: model.predict(features[split]) for split in split_names}

    print(problem_name)
    metric_func(
        targets["train"], targets["val"], targets["test"],
        predictions["train"], predictions["val"], predictions["test"],
    )
    return {split: (targets[split], predictions[split]) for split in split_names}

def get_classification_metrics(y_train, y_val, y_test, y_pred_train, y_pred_val, y_pred_test):
    """Print accuracy and F1 for the train, val and test splits.

    F1 is requested with ``average=None``, so one score per class is printed
    rather than a single aggregate. A blank line closes the report.
    """
    reports = (
        ("Train Acc: {}", "Train F1: {}", y_train, y_pred_train),
        ("Val Acc: {}", "Val F1: {}", y_val, y_pred_val),
        ("Test Acc: {}", "Test F1: {}", y_test, y_pred_test),
    )
    for acc_template, f1_template, truth, predicted in reports:
        print(acc_template.format(accuracy_score(truth, predicted)))
        print(f1_template.format(f1_score(truth, predicted, average=None)))
    print()

def get_regression_metrics(y_train, y_val, y_test, y_pred_train, y_pred_val, y_pred_test):
    """Print MAE and MAPE (as a percentage) for the train, val and test splits.

    Every split is reported with the same "<Split> MAE" / "<Split> MAPE"
    labels; the test line previously used a misspelled, inconsistent label.
    A blank line closes the report.
    """
    # NOTE(review): MAPE divides by the true value, so targets at or near zero
    # can inflate it enormously - confirm this is acceptable for this target.
    splits = (
        ("Train", y_train, y_pred_train),
        ("Val", y_val, y_pred_val),
        ("Test", y_test, y_pred_test),
    )
    for label, y_true, y_pred in splits:
        print("{} MAE: {}".format(label, mean_absolute_error(y_true, y_pred)))
        # The metric returns a fraction; scale by 100 to print a percentage.
        print("{} MAPE: {}%".format(label, mean_absolute_percentage_error(y_true, y_pred) * 100))
    print()



In [None]:
# One depth-limited random forest, refit from scratch for each time offset to
# classify TARGET_WEATHER. Both target columns are dropped from the features.
model = RandomForestClassifier(random_state=0, max_depth=10)
for time_offset in df_timed_dct:
    get_predict(
        dfs=df_timed_dct[time_offset],
        model=model,
        problem_name=f"Weather {time_offset} Min",
        target_col=["TARGET_WEATHER"],
        drop_cols=["TARGET_WEATHER", "TARGET_RAIN_PERCENTAGE"],
        metric_func=get_classification_metrics,
    )

## RAIN_PERCENTAGE

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Regress TARGET_RAIN_PERCENTAGE for each time offset. Both target columns are
# dropped from the features so neither leaks into the inputs.
model = RandomForestRegressor(max_depth=10, random_state=0)
for time_offset in df_timed_dct:
    # The heading names the rain target; it was previously a copy of the
    # "Weather" heading from the classification cell, mislabeling the output.
    get_predict(df_timed_dct[time_offset], model, f"Rain Percentage {time_offset} Min", ["TARGET_RAIN_PERCENTAGE"],
                ["TARGET_WEATHER", "TARGET_RAIN_PERCENTAGE"], get_regression_metrics)

## Class Distribution

In [None]:
import matplotlib.pyplot as plt

# `y_train` was never defined at notebook scope (it only exists as a local
# inside get_predict), so this cell failed with NameError on a fresh kernel.
# Read the class labels straight from the prepared train splits instead.
# NOTE(review): assumes the intended target is TARGET_WEATHER - confirm.
for time_offset in df_timed_dct:
    fig, ax = plt.subplots()
    ax.hist(df_timed_dct[time_offset]["train"]["TARGET_WEATHER"])
    ax.set(
        title=f"Train class distribution ({time_offset} Min)",
        xlabel="TARGET_WEATHER",
        ylabel="Count",
    )
    plt.show()

In [None]:
# `y_test` was never defined at notebook scope (it only exists as a local
# inside get_predict), so this cell failed with NameError on a fresh kernel.
# Read the class labels straight from the prepared test splits instead.
# NOTE(review): assumes the intended target is TARGET_WEATHER - confirm.
for time_offset in df_timed_dct:
    fig, ax = plt.subplots()
    ax.hist(df_timed_dct[time_offset]["test"]["TARGET_WEATHER"])
    ax.set(
        title=f"Test class distribution ({time_offset} Min)",
        xlabel="TARGET_WEATHER",
        ylabel="Count",
    )
    plt.show()

In [None]:
# `y_test` was never defined at notebook scope, so this cell raised NameError
# on a fresh kernel. Collect the distinct test labels per time offset directly
# from the prepared splits.
# NOTE(review): assumes the intended target is TARGET_WEATHER - confirm.
{time_offset: set(df_timed_dct[time_offset]["test"]["TARGET_WEATHER"]) for time_offset in df_timed_dct}