# Import Libraries

In [20]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from numpy import std
from numpy import mean
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# Train Classifier Model

# Train Regressor Model

In [13]:
# The time_offset column holds the forecast offsets we need (5, 10, 15, 30 and 60 minutes).
# We are forced to drop NaN rows because the time offset is where the NaNs are.
# Compare the two dataframes (the old one with NaNs dropped is the same as the new one).
# Open question: do we scale rain_percentage? It is a target.
# Open question: there are also time offsets of 90 and 120 -- drop those?

In [14]:
def clean_data(unclean_df, dropna=True):
    """
    Filter, select, and min-max scale the raw weather data for model ingestion

    Rows whose 'M_NUM_WEATHER_FORECAST_SAMPLES' or
    'M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE' equals 0 are discarded. The kept columns
    are 'TIMESTAMP', 'M_AIR_TEMPERATURE', 'M_FORECAST_ACCURACY' plus the contiguous run of
    input columns from 'M_WEATHER_FORECAST_SAMPLES_M_WEATHER' through 'M_AI_DIFFICULTY'
    (inclusive); 'M_WEATHER' is expected to be among them. Every kept column except
    'M_WEATHER' and 'TIMESTAMP' is rescaled with a MinMaxScaler fitted on this same data.

    Parameters:
    unclean_df (pandas DataFrame): the dataset that hasn't been prepared for model ingestion
    dropna (boolean): choose whether to drop the rows with missing values

    Returns:
    pandas dataframe: scaled dataset indexed by 'TIMESTAMP' (parsed as seconds since the
        epoch), with 'M_WEATHER' stored as strings

    """
    all_columns = unclean_df.columns.tolist()

    # Row filter: keep only rows where both forecast-sample fields are non-zero.
    has_samples = unclean_df['M_NUM_WEATHER_FORECAST_SAMPLES'] != 0
    has_session_type = unclean_df['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE'] != 0
    unclean_df = unclean_df[has_samples & has_session_type]

    # Column filter: three fixed columns plus a positional slice of the original layout.
    first = all_columns.index('M_WEATHER_FORECAST_SAMPLES_M_WEATHER')
    last = all_columns.index('M_AI_DIFFICULTY')
    kept_columns = ['TIMESTAMP', 'M_AIR_TEMPERATURE', 'M_FORECAST_ACCURACY'] + all_columns[first:last + 1]
    unclean_df = unclean_df[kept_columns]

    # Scale everything except the label and the timestamp.
    features = unclean_df.drop(['M_WEATHER', 'TIMESTAMP'], axis=1)
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(features.values)

    df_norm = pd.DataFrame(scaled_values, columns=features.columns.tolist())

    # The scaled frame has a fresh 0..n-1 index, so the untouched columns are re-indexed
    # the same way before being attached to keep the rows lined up.
    timestamps = pd.to_datetime(unclean_df['TIMESTAMP'], unit='s', dayfirst=True)
    df_norm['TIMESTAMP'] = timestamps.reset_index(drop=True)
    df_norm['M_WEATHER'] = unclean_df['M_WEATHER'].reset_index(drop=True)
    df_norm = df_norm.set_index('TIMESTAMP')

    df_norm['M_WEATHER'] = df_norm['M_WEATHER'].apply(str)

    return df_norm.dropna() if dropna else df_norm

In [15]:
def weather_classifier(
    model_path='/Users/home/Documents/GitHub/FormulaAIHacks/'
               'Random forest models/finalized_randomforest_classifier_model.sav',
):
    """
    Load the trained classifier model

    Parameters:
    model_path (str): path of the pickled classifier. Defaults to the location used during
        development (an absolute path on the author's machine).

    Returns:
    object: whatever object was pickled into the file (presumably a fitted random forest
        classifier, judging by the file name -- confirm against the training code)

    """
    # NOTE: unpickling executes arbitrary code, so only load model files from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(model_path, "rb") as model_file:
        class_model = pickle.load(model_file)
    return class_model


In [16]:
def rain_predictor(
    model_path='/Users/home/Documents/GitHub/FormulaAIHacks/'
               'Random forest models/finalized_randomforest_regressor_model.sav',
):
    """
    Load the trained regressor model

    Parameters:
    model_path (str): path of the pickled regressor. Defaults to the location used during
        development (an absolute path on the author's machine).

    Returns:
    object: whatever object was pickled into the file (presumably a fitted random forest
        regressor, judging by the file name -- confirm against the training code)

    """
    # TODO: change the default to a path relative to the repository.
    # NOTE: unpickling executes arbitrary code, so only load model files from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(model_path, "rb") as model_file:
        reg_model = pickle.load(model_file)
    return reg_model

In [17]:
# def metrics(model):
#     """
#     Display the metrics (MAE or Classification Accuracy) for the model

#     Parameters:
#     model (pickle file): the model for which to display the metrics

#     Returns:
#     float: respective model accuracy

#     """
#     if model == 'classifier':
#         return classifier_accuracy
#     return MAE

In [18]:
def predict(observations, time=0, type_model=None, rain_model=None):
    """
    Predict the weather type and rain percentage at one or all forecast offsets

    Parameters:
    observations (pandas dataframe): single row dataframe of the recorded data on which to predict
    time (int): forecast offset in minutes. 0 (the default) predicts at every supported offset
        (5, 10, 15, 30 and 60); any other value must be one of those offsets
    type_model (callable): optional f(observations, offset) returning the weather type. When
        omitted, the model returned by weather_classifier() is used
    rain_model (callable): optional f(observations, offset) returning the rain percentage. When
        omitted, the model returned by rain_predictor() is used

    Returns:
    dictionary: maps each requested offset to {'type': ..., 'rain_percentage': ...}

    Raises:
    ValueError: if time is neither 0 nor one of the supported offsets

    """
    time_intervals = [5, 10, 15, 30, 60]
    if time == 0:
        requested = time_intervals
    elif time in time_intervals:
        requested = [time]
    else:
        raise ValueError(
            f"time must be 0 or one of {time_intervals}, got {time!r}"
        )

    # The pickled models are only loaded when the caller did not inject a predictor.
    # NOTE(review): the default adapters call `.predict(observations)` (assumes the pickles
    # are scikit-learn style estimators) and do NOT use the offset themselves; how the
    # trained models expect the offset to be encoded in the features is not visible from
    # this function -- confirm before relying on the defaults.
    if type_model is None:
        classifier = weather_classifier()

        def type_model(obs, offset):
            return classifier.predict(obs)[0]

    if rain_model is None:
        regressor = rain_predictor()

        def rain_model(obs, offset):
            return regressor.predict(obs)[0]

    predictions = {
        time_interval: {
            'type': type_model(observations, time_interval),
            'rain_percentage': rain_model(observations, time_interval),
        }
        for time_interval in requested
    }
    return predictions

# Test Blocks

In [19]:
%%time
# Load the raw weather export from the working directory and run it through clean_data
# (dropna defaults to True, so rows with missing values are removed).
df = pd.read_csv('weather.csv')
clean_df = clean_data(df)

KeyboardInterrupt: 

In [None]:
# Re-run the cleaning on the already loaded `df` and display the result as the cell output.
clean_df = clean_data(df)
clean_df

Unnamed: 0_level_0,M_SESSION_UID,M_AIR_TEMPERATURE,M_FORECAST_ACCURACY,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_AI_DIFFICULTY,M_WEATHER
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-01-21 00:16:50,0.162116,0.666667,0.0,,,,,,,0.0,0
2022-01-21 00:16:50,0.162116,0.666667,0.0,,,,,,,0.0,0
2022-01-21 00:16:50,0.162116,0.666667,0.0,,,,,,,0.0,0
2022-01-21 00:16:50,0.162116,0.666667,0.0,,,,,,,0.0,0
2022-01-21 00:16:50,0.162116,0.666667,0.0,,,,,,,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2022-01-15 23:14:51,0.602198,0.666667,0.0,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.602198,0.666667,0.0,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.602198,0.666667,0.0,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.602198,0.666667,0.0,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
