# Import Libraries

In [37]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import mean, std
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# Clean Data

In [38]:
# clean_data = clean_data("____add relative path____")

# Train Classifier Model

# Train Regressor Model

In [39]:
# Notes / open questions:
# - The time_offset column holds the time periods that we need (5, 10, 15, 30, 60).
# - We are forced to drop NaN rows because the NaNs are located in the time offset column.
# - Compare the two dataframes (the old one with NaNs dropped is the same as the new one).
# - Do we scale rain_percentage? It is a target.
# - There are also time offsets of 90 and 120 -- drop those?

In [40]:
def clean_data(unclean_df, dropna=True):
    """
    Clean and preprocess the data for the models

    Steps performed:
    1. Remove rows where M_NUM_WEATHER_FORECAST_SAMPLES or
       M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE equals 0.
    2. Keep TIMESTAMP, M_AIR_TEMPERATURE, M_FORECAST_ACCURACY plus every column
       positioned from M_WEATHER_FORECAST_SAMPLES_M_WEATHER through
       M_AI_DIFFICULTY (inclusive) in the input's column order.
    3. Min-max scale every kept column except M_WEATHER and TIMESTAMP.
    4. Index the result by TIMESTAMP (parsed with unit='s') and convert the
       M_WEATHER labels to strings. Missing labels stay missing so that they
       can still be detected and dropped.

    Parameters:
    unclean_df (pandas DataFrame): the dataset that hasn't been prepared for model ingestion
    dropna (boolean): choose whether to drop the rows with missing values

    Returns:
    pandas dataframe: cleaned dataset, indexed by TIMESTAMP

    Raises:
    KeyError: if one of the columns used for filtering/selection is absent
    ValueError: if M_WEATHER_FORECAST_SAMPLES_M_WEATHER or M_AI_DIFFICULTY is
        not among the input columns

    """
    # Columns that are carried through untouched (never scaled)
    passthrough_columns = ['M_WEATHER', 'TIMESTAMP']
    all_columns = unclean_df.columns.tolist()

    has_forecast = (
        (unclean_df['M_NUM_WEATHER_FORECAST_SAMPLES'] != 0) &
        (unclean_df['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE'] != 0)
    )
    filtered_df = unclean_df[has_forecast]

    # The contiguous block of columns is located by position in the ORIGINAL
    # column order, so it depends on how the input frame is laid out.
    block_start = all_columns.index('M_WEATHER_FORECAST_SAMPLES_M_WEATHER')
    block_end = all_columns.index('M_AI_DIFFICULTY') + 1
    selected_df = filtered_df[
        ['TIMESTAMP', 'M_AIR_TEMPERATURE', 'M_FORECAST_ACCURACY'] +
        all_columns[block_start:block_end]
    ]

    # Derive the feature frame once and reuse it for both values and names
    feature_df = selected_df.drop(passthrough_columns, axis=1)
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_values = min_max_scaler.fit_transform(feature_df.values)

    # The scaled frame gets a fresh 0..n-1 index, so the passthrough columns
    # must be re-indexed the same way before being attached.
    df_norm = pd.DataFrame(scaled_values, columns=feature_df.columns.tolist())
    df_norm['TIMESTAMP'] = pd.to_datetime(
        selected_df['TIMESTAMP'], unit='s', dayfirst=True
    ).reset_index(drop=True)
    df_norm['M_WEATHER'] = selected_df['M_WEATHER'].reset_index(drop=True)
    df_norm = df_norm.set_index('TIMESTAMP')

    # na_action='ignore' keeps NaN labels as NaN; a plain str() would turn them
    # into the non-null string 'nan', which dropna() could never remove.
    df_norm['M_WEATHER'] = df_norm['M_WEATHER'].map(str, na_action='ignore')

    if dropna:
        return df_norm.dropna()
    return df_norm

In [41]:
def weather_classifier(data):
    """
    Train a random-forest classifier for M_WEATHER and print its metrics

    Rows with missing values are dropped, then the data is split 60/20/20 into
    train / cross-validation / test subsets (an 80/20 split followed by a
    75/25 split of the larger part). The function prints:
    - mean and standard deviation of the repeated stratified 10-fold accuracy
      computed on the cross-validation subset
    - accuracy of the fitted model on the test subset
    - the predicted label for one sample row

    Parameters:
    data (pandas DataFrame): cleaned dataset containing an M_WEATHER column;
        every other column is used as a feature

    Returns:
    None: all results are printed

    """
    df = data.dropna()
    labels = df['M_WEATHER']
    features = df.drop(columns=['M_WEATHER'])

    # 80/20 split off the test set, then 75/25 of the remainder -> 60/20/20
    X, X_test, y, y_test = train_test_split(
        features, labels, test_size=0.2, train_size=0.8, random_state=42
    )
    X_train, X_cv, y_train, y_cv = train_test_split(
        X, y, test_size=0.25, train_size=0.75, random_state=42
    )

    # Seed the forest so repeated runs of the notebook give the same numbers
    model = RandomForestClassifier(class_weight='balanced', random_state=0)
    model.fit(X_train, y_train)

    # NOTE: cross_val_score fits fresh clones of the estimator on folds of the
    # CV subset; it does not score the model fitted on X_train above.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, X_cv, y_cv, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(accuracy)

    # Keep the sample as a one-row DataFrame so its column names match the
    # ones the model was fitted with (a bare ndarray triggers a sklearn warning)
    sample_row = X.iloc[[1]]
    yhat = model.predict(sample_row)
    print('predicted weather is', yhat)

In [42]:
def rain_predictor(data):
    """
    Train a random-forest regressor for M_RAIN_PERCENTAGE and print its metrics

    Rows with missing values are dropped, then the data is split 60/20/20 into
    train / cross-validation / test subsets (an 80/20 split followed by a
    75/25 split of the larger part). The function prints:
    - mean and standard deviation of the 10-fold mean absolute error computed
      on the cross-validation subset
    - mean squared error of the fitted model on the test subset
    - the predicted value for one sample row

    Parameters:
    data (pandas DataFrame): cleaned dataset containing an M_RAIN_PERCENTAGE
        column; every other column is used as a feature

    Returns:
    None: all results are printed

    """
    df = data.dropna()
    target = df['M_RAIN_PERCENTAGE']
    features = df.drop(columns=['M_RAIN_PERCENTAGE'])

    # 80/20 split off the test set, then 75/25 of the remainder -> 60/20/20
    X, X_test, y, y_test = train_test_split(
        features, target, test_size=0.2, train_size=0.8, random_state=42
    )
    X_train, X_cv, y_train, y_cv = train_test_split(
        X, y, test_size=0.25, train_size=0.75, random_state=42
    )

    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)

    # NOTE: cross_val_score fits fresh clones of the estimator on folds of the
    # CV subset; it does not score the model fitted on X_train above.
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    n_scores = cross_val_score(model, X_cv, y_cv, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
    # The scorer returns the NEGATED error (higher is better), so flip the sign
    # back before reporting it as MAE.
    mae_scores = -n_scores
    # report performance
    print('MAE: %.3f (%.3f)' % (mean(mae_scores), std(mae_scores)))

    predictions = model.predict(X_test)
    MSE = mean_squared_error(y_test, predictions)
    print(MSE)

    # Keep the sample as a one-row DataFrame so its column names match the
    # ones the model was fitted with (a bare ndarray triggers a sklearn warning)
    sample_row = X.iloc[[1]]
    yhat = model.predict(sample_row)
    print('predicted rain probability is', yhat)

In [43]:
# def metrics(model):
#     """
#     Display the metrics (MAE or Classification Accuracy) for the model

#     Parameters:
#     model (pickle file): the model for which to display the metrics

#     Returns:
#     float: respective model accuracy

#     """
#     if model == 'classifier':
#         return classifier_accuracy
#     return MAE

# Test Blocks

In [44]:
def main(raw_df=None):
    """
    Clean the raw dataset, then train and report on both models

    Parameters:
    raw_df (pandas DataFrame, optional): the raw dataset to process. When
        omitted, the notebook-level variable `df` is used, so `df` must have
        been defined before calling main() without arguments.

    Returns:
    None: the model functions print their own results

    """
    if raw_df is None:
        # Backward-compatible fallback: read the frame from the notebook globals
        raw_df = df

    clean_df = clean_data(raw_df)

    weather_classifier(clean_df)
    rain_predictor(clean_df)

In [45]:
# Load the raw dataset from the working directory and run the full pipeline.
# `df` has to be assigned before main() is called: main() reads it as a global.
df = pd.read_csv('weather.csv')
main()

  exec(code_obj, self.user_global_ns, self.user_ns)


Accuracy: 0.982 (0.001)
0.9818126799140292
predicted weather is ['0']


