<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Powerpredict" data-toc-modified-id="Powerpredict-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Powerpredict</a></span><ul class="toc-item"><li><span><a href="#Loading-the-dataset" data-toc-modified-id="Loading-the-dataset-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Loading the dataset</a></span></li></ul></li></ul></div>

# Powerpredict

Daily power consumption is related to the weather (rain, sunshine, temperature, etc.).
Prediction of the power consumption based on the weather is relevant for energy suppliers.
In this dataset you have to use the provided weather information to predict the power consumption.

## Loading the dataset

In [None]:
import pandas as pd
import os

In [None]:
# additional imports
import sklearn
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


# Selects which stored model `leader_board_predict_fn` loads for predictions.
# Use the integer 0 for Random Forests and the integer 1 for Linear Regression
# (the value is compared against ints, so the strings "0"/"1" will not match).
MACHINE_LEARNING_MODEL = 1

In [None]:
# Use /data/mlproject22 when that directory exists; otherwise look for the
# dataset in the current working directory.
DATASET_PATH = "/data/mlproject22" if os.path.exists("/data/mlproject22") else "."

In [None]:
# Read the zipped CSV and separate the target column from the features.
# `y` is kept as a one-column DataFrame (double brackets), not a Series.
powerpredict = pd.read_csv(os.path.join(DATASET_PATH, "powerpredict.csv.zip"))
y = powerpredict[["power_consumption"]]
X = powerpredict.drop(columns=["power_consumption"])

# Report the feature shape first, then the target shape, one per line.
print(X.shape, y.shape, sep="\n")

Some columns are dropped here for simplicity, but they might provide useful information as well, so you might want to use them.

In [None]:
def drop_object_columns(df):
    """Return ``df`` with every ``object``-dtype column dropped.

    The result comes from ``DataFrame.drop``, so the frame passed in keeps
    all of its columns.
    """
    object_cols = [
        col for col, dtype in zip(df.columns, df.dtypes) if dtype == "object"
    ]
    return df.drop(columns=object_cols)


# Short alias for drop_object_columns.
DOC = drop_object_columns

def predict_show_metrics(name, reg, metric):
    """Print ``name`` followed by a metric value for ``reg``.

    ``reg`` predicts on the module-level ``x_filtered`` frame (after its
    object-dtype columns are dropped) and ``metric`` is called as
    ``metric(y, predictions)`` with the module-level target ``y``.
    """
    features = DOC(x_filtered)
    predictions = reg.predict(features)
    print(f"{name}", metric(y, predictions))

    
def visualize_data():
    """Show two line plots built from the module-level ``powerpredict`` frame.

    The first figure overlays the five ``*_t`` columns listed below, one
    labelled line per column; the second figure plots ``power_consumption``.
    """
    # Figure 1: one line per temperature column.
    temp_series_names = (
        'Bedrock_t',
        'Gotham City_t',
        'New New York_t',
        'Paperopoli_t',
        'Springfield_t',
    )
    plt.figure(figsize=(10, 6))
    for series_name in temp_series_names:
        plt.plot(powerpredict[series_name], label=series_name)
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.legend()
    plt.show()

    # Figure 2: the target column on its own.
    plt.figure(figsize=(10, 6))
    plt.plot(powerpredict['power_consumption'])
    plt.xlabel('Time')
    plt.ylabel('Power Consumption')
    plt.title('Power Consumption')
    plt.show()

In [None]:
# Random forests estimator
#
# Building the model:
#   1. Preprocess the data: keep the selected features, drop incomplete rows
#      and ordinal-encode every remaining column.
#   2. Fit the regressor on the training part of a 90/10 split.
#   3. Store the fitted model on disk with joblib.

# `sklearn.model_selection` is a submodule; import it explicitly instead of
# relying on another sklearn import having loaded it as a side effect.
import sklearn.model_selection

le = preprocessing.OrdinalEncoder()

selected_features = ['Bedrock_t', 'Bedrock_t_low', 'Bedrock_humidity',
                     'Bedrock_wind_deg', 'Bedrock_clouds', 'Gotham City_t',
                     'Gotham City_t_high', 'Gotham City_bars', 'Gotham City_humidity',
                     'Gotham City_wind_speed', 'Gotham City_wind_deg', 'Gotham City_clouds',
                     'New New York_t', 'New New York_t_low', 'New New York_t_high',
                     'New New York_bars', 'New New York_humidity', 'New New York_wind_speed',
                     'New New York_wind_deg', 'Paperopoli_t', 'Paperopoli_t_low',
                     'Paperopoli_t_high', 'Paperopoli_bars', 'Paperopoli_humidity',
                     'Paperopoli_wind_deg', 'Paperopoli_weather_description',
                     'Springfield_t', 'Springfield_t_low', 'Springfield_t_high',
                     'Springfield_bars', 'Springfield_humidity', 'Springfield_wind_deg']


data = powerpredict[selected_features].dropna()
# dropna() may remove rows. Remember which rows survived so the target can be
# restricted to the same rows; train_test_split rejects inputs whose lengths
# differ.
kept_rows = data.index
data = le.fit_transform(data)

random_forest_regressor_model = RandomForestRegressor(
    max_features='log2',
    criterion='squared_error',
    max_depth=17,
    n_jobs=-1,
    n_estimators=50,
    verbose=2,
)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data, y.loc[kept_rows], random_state=42, test_size=0.1
)

# The target is a one-column DataFrame; flatten it to 1-D for fit().
random_forest_regressor_model.fit(X_train, y_train.values.ravel())

# store the model:
from joblib import dump, load
dump(random_forest_regressor_model, 'random_forests_model.joblib')

Here is an example dummy ML method showing the success of such a simple predictor:

In [None]:
# `sklearn.model_selection` is a submodule; import it explicitly instead of
# relying on another sklearn import having loaded it as a side effect.
import sklearn.model_selection

encoder = preprocessing.OrdinalEncoder()

# Ordinal-encode every column of X (the whole feature frame, all rows)
encoded_data = encoder.fit_transform(X)

# Create a DataFrame with the encoded data. Its columns are the positional
# integers 0..n-1, so `selected_columns` below is a list of column positions.
data = pd.DataFrame(encoded_data)

# Fill NaN values with the mean of their column
mean_value = data.mean()
data = data.fillna(mean_value)

allData = pd.concat([data, y], axis=1)

# Compute the correlation matrix
correlation_matrix = allData.corr()

# Rank all columns by absolute correlation with the target
power_cons_corr = (
    correlation_matrix["power_consumption"].abs().sort_values(ascending=False)
)
# Keep the 20 most correlated features. The target itself is removed by label
# rather than by assuming it is the first entry of the ranking.
selected_columns = power_cons_corr.drop("power_consumption").head(20).index.tolist()
x_filtered = allData[selected_columns]

# split the set into train and test set
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x_filtered, y, test_size=0.20, random_state=22
)

# Linear regression on degree-2 polynomial features
lin = make_pipeline(PolynomialFeatures(2), LinearRegression())
lin.fit(X_train, y_train)
y_predicted = lin.predict(X_test)
y_predicted_train = lin.predict(X_train)

# store the model:
from joblib import dump, load
dump(lin, 'linear_regression.joblib')

You have to implement a simple method that performs the predictions with the given signature:

In [None]:
from sklearn import preprocessing

# start by preprocessing the data
# NOTE(review): this encoder is re-fitted on whatever frame is passed to
# `leader_board_predict_fn`, so the ordinal codes are derived from that frame
# and not from the training data -- confirm this is intended.
le = preprocessing.OrdinalEncoder()

selected_features = ['Bedrock_t', 'Bedrock_t_low', 'Bedrock_humidity',
                         'Bedrock_wind_deg', 'Bedrock_clouds', 'Gotham City_t',
                         'Gotham City_t_high', 'Gotham City_bars', 'Gotham City_humidity',
                         'Gotham City_wind_speed', 'Gotham City_wind_deg', 'Gotham City_clouds',
                         'New New York_t', 'New New York_t_low', 'New New York_t_high',
                         'New New York_bars', 'New New York_humidity', 'New New York_wind_speed',
                         'New New York_wind_deg', 'Paperopoli_t', 'Paperopoli_t_low',
                         'Paperopoli_t_high', 'Paperopoli_bars', 'Paperopoli_humidity',
                         'Paperopoli_wind_deg', 'Paperopoli_weather_description',
                         'Springfield_t', 'Springfield_t_low', 'Springfield_t_high',
                         'Springfield_bars', 'Springfield_humidity', 'Springfield_wind_deg']


def _fill_missing_with_column_mean(encoded):
    """Return ``encoded`` as an array with each NaN replaced by its column mean."""
    frame = pd.DataFrame(encoded)
    return frame.fillna(frame.mean()).to_numpy()


def leader_board_predict_fn(values):
    """Predict power consumption for the rows of ``values``.

    The model is chosen by the module-level ``MACHINE_LEARNING_MODEL``:
    ``0`` loads ``random_forests_model.joblib`` and predicts from the
    ``selected_features`` columns; ``1`` loads ``linear_regression.joblib``
    and predicts from the column positions in the module-level
    ``selected_columns``.

    Parameters:
        values: DataFrame of input features (the dataset without its
            ``power_consumption`` column).

    Returns:
        The loaded model's predictions, one per row of ``values``.

    Raises:
        ValueError: if ``MACHINE_LEARNING_MODEL`` is neither 0 nor 1.
        Exception: any error raised while encoding, loading or predicting is
            propagated (the linear-regression branch prints it first).
    """
    from joblib import load

    if MACHINE_LEARNING_MODEL == 0:
        values_encoded = le.fit_transform(values[selected_features])
        # Rows with missing values are imputed rather than dropped so that
        # the number of predictions always equals the number of input rows.
        values_encoded = _fill_missing_with_column_mean(values_encoded)
        loaded_rfr = load('random_forests_model.joblib')
        return loaded_rfr.predict(values_encoded)

    if MACHINE_LEARNING_MODEL == 1:
        try:
            # Same preprocessing as at training time: ordinal-encode all
            # columns, then fill NaNs with the column mean.
            encoded = _fill_missing_with_column_mean(le.fit_transform(values))
            # only keep the features that were most correlated with the target
            values_filtered = encoded[:, selected_columns]

            loaded_lin = load('linear_regression.joblib')
            return loaded_lin.predict(values_filtered)
        except Exception as E:
            # Keep the printed diagnostic, but re-raise: returning None here
            # would only surface later as an unrelated metric error.
            print(E)
            raise

    raise ValueError(
        f"Unsupported MACHINE_LEARNING_MODEL: {MACHINE_LEARNING_MODEL!r} (expected 0 or 1)"
    )

which will then be used to calculate the leaderboard score in a way similar to this:

In [None]:
def get_score():
    """
    Compute scores for the train and hidden test datasets.

    Each score is the mean absolute error between the ``power_consumption``
    column of a dataset and ``leader_board_predict_fn`` applied to the
    remaining columns. Both scores are printed, and a one-row record (scores,
    unix time, user name, timestamp, error comment) is written to
    ``powerpredict.csv``. A score that cannot be computed is stored as NaN.
    """

    import sklearn.metrics
    import pandas as pd
    import pathlib
    import os

    def score_csv(csv_path):
        # Load one dataset, predict on its features and return the MAE.
        frame = pd.read_csv(csv_path)
        X_test = frame.drop(columns=["power_consumption"])
        y_test = frame[["power_consumption"]]
        y_predicted = leader_board_predict_fn(X_test)
        return sklearn.metrics.mean_absolute_error(y_test, y_predicted)

    # Score on the training dataset; any failure yields NaN.
    try:
        if os.path.exists("/data/mlproject22/"):
            TEST_DATASET_PATH = "/data/mlproject22/"
        else:
            TEST_DATASET_PATH = "."
        dataset_score = score_csv(os.path.join(TEST_DATASET_PATH, "powerpredict.csv.zip"))
    except Exception:
        dataset_score = float("nan")
    print(f"Train Dataset Score: {dataset_score}")

    import pwd
    import time
    import datetime

    user_id = pwd.getpwuid(os.getuid()).pw_name
    curtime = time.time()
    dt_now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

    # Score on the hidden dataset; on failure store NaN and keep the error
    # text as the record's comment.
    try:
        HIDDEN_DATASET_PATH = os.path.expanduser("/data/mlproject22-test-data/")
        hiddendataset_score = score_csv(
            os.path.join(HIDDEN_DATASET_PATH, "hidden_powerpredict.csv.zip")
        )
        print(f"Test Dataset Score: {hiddendataset_score}")
        comment = ""
    except Exception as e:
        hiddendataset_score = float("nan")
        comment = str(e)

    score_dict = dict(
        score_hidden=hiddendataset_score,
        score_train=dataset_score,
        unixtime=curtime,
        user=user_id,
        dt=dt_now,
        comment=comment,
    )

    #if list(pathlib.Path(os.getcwd()).parents)[0].name == 'source':
    #    print("we are in the source directory... replacing values.")
    #    print(pd.DataFrame([score_dict]))
    #    score_dict["score_hidden"] = -1
    #    score_dict["score_train"] = -1
    #    print("new values:")
    #    print(pd.DataFrame([score_dict]))

    pd.DataFrame([score_dict]).to_csv("powerpredict.csv", index=False)

get_score()