In [0]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest

In [0]:
# df_raw = pd.read_csv("/Users/mikemurphy/Desktop/Mcgill_Winter_2023/W2_Adv_Info_systems_prod/Group Project/media prediction and its cost.csv")

# File location and type
file_location = "/FileStore/tables/media_prediction_and_its_cost-1.csv"
file_type = "csv"

# CSV options
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

df_raw = df.toPandas()

In [0]:
num_cols, cate_cols, pre_dum_cols = [], [], []
for c in df_raw.columns:
    uni_len = len(set(df_raw[c]))
    dtype = df_raw[c].dtypes

    if dtype == object:
        cate_cols.append(c)
    else: # dtype == float
        if uni_len == 2:
            pre_dum_cols.append(c)
        elif uni_len < 10:
            num_cols.append(c)
            print("Number of unique values: %d \t Datatype: %s \t Column name: %s" %(uni_len, dtype, c))
        else:
            num_cols.append(c)

Number of unique values: 6 	 Datatype: float64 	 Column name: unit_sales(in millions)
Number of unique values: 6 	 Datatype: float64 	 Column name: total_children
Number of unique values: 5 	 Datatype: float64 	 Column name: avg_cars_at home(approx)15
Number of unique values: 6 	 Datatype: float64 	 Column name: num_children_at_home
Number of unique values: 5 	 Datatype: float64 	 Column name: avg_cars_at home(approx)18


In [0]:
df_raw.head()

Unnamed: 0,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,sales_country,marital_status,gender,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",126.62
1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",59.86
2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,USA,S,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",84.16
3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,In-Store Coupon,95.78
4,Breakfast Foods,Frozen Foods,Food,4.08,1.428,3.0,Double Down Sale,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,Radio,50.79


# Model Development

In [0]:
y = df_raw['cost']
X = df_raw.drop(columns = ['cost'])
X = pd.get_dummies(X)

In [0]:
X.shape

Out[19]: (60428, 330)

In [0]:
from sklearn.model_selection import train_test_split
X_train_final, X_test_final, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [0]:
X_test_final, X_val_final, y_test, y_val = train_test_split(X_test_final, y_test, test_size = 0.5, random_state = 0)

Since the model from TPOT yielded the lowest RMSE, we train the model based on parameters derived from TPOT as our final model.

In [0]:
from sklearn.ensemble import ExtraTreesRegressor
extratree_best = ExtraTreesRegressor(random_state = 0, bootstrap=False, max_features=0.25, min_samples_leaf=3, min_samples_split=4, n_estimators=100)
final_model_opt = extratree_best.fit(X_train_final, y_train)
y_test_pred_final = final_model_opt.predict(X_test_final)

In [0]:
from sklearn.metrics import mean_squared_error
final_model_opt_mse = mean_squared_error(y_test, y_test_pred_final, squared=False)
final_model_opt_mse

Out[23]: 0.7806648492919666

Unnamed: 0,store_sales(in millions),store_cost(in millions),unit_sales(in millions),total_children,avg_cars_at home(approx)15,num_children_at_home,avg_cars_at home(approx)18,SRP,gross_weight,net_weight,...,"media_type_Daily Paper, Radio","media_type_Daily Paper, Radio, TV",media_type_In-Store Coupon,media_type_Product Attachment,media_type_Radio,media_type_Street Handout,media_type_Sunday Paper,"media_type_Sunday Paper, Radio","media_type_Sunday Paper, Radio, TV",media_type_TV
15843,2.01,0.6633,3.0,4.0,4.0,4.0,4.0,0.67,21.00,19.00,...,0,0,0,1,0,0,0,0,0,0
32571,3.56,1.5308,2.0,2.0,1.0,2.0,1.0,1.78,17.10,16.10,...,0,0,0,0,0,0,0,0,1,0
58836,6.51,3.1899,3.0,3.0,2.0,2.0,2.0,2.17,6.96,3.96,...,0,0,0,0,0,1,0,0,0,0
4527,11.61,3.5991,3.0,4.0,2.0,0.0,2.0,3.87,9.55,6.55,...,0,0,0,0,0,0,0,0,0,0
25229,5.72,2.8600,4.0,1.0,1.0,1.0,1.0,1.43,19.10,17.10,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33353,5.60,2.7440,4.0,1.0,1.0,0.0,1.0,1.40,7.28,6.28,...,0,0,0,0,0,1,0,0,0,0
31433,13.52,4.8672,4.0,5.0,3.0,0.0,3.0,3.38,10.20,9.19,...,0,0,0,0,1,0,0,0,0,0
54556,2.94,1.2642,2.0,4.0,4.0,0.0,4.0,1.47,20.70,18.70,...,0,0,0,1,0,0,0,0,0,0
39495,5.55,1.7205,3.0,4.0,2.0,0.0,2.0,1.85,10.40,7.39,...,0,0,0,0,1,0,0,0,0,0


In [0]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import numpy as np
import mlflow
from mlflow.models import make_metric
import os
import matplotlib.pyplot as plt

# loading the California housing dataset
cali_housing = fetch_california_housing(as_frame=True)

# split the dataset into train and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    cali_housing.data, cali_housing.target, test_size=0.2, random_state=123
)

# train the model
gbr = GradientBoostingRegressor().fit(X_train, y_train)

# creating the evaluation dataframe
eval_data = X_test_final[[c for c in X_test_final.columns if c != "targer"]].copy()

eval_data["target"] = y_test


def squared_diff_plus_one(eval_df, _builtin_metrics):
    """
    This example custom metric function creates a metric based on the ``prediction`` and
    ``target`` columns in ``eval_df``.
    """
    return np.sum(np.abs(eval_df["prediction"] - eval_df["target"] + 1) ** 2)


def sum_on_target_divided_by_two(_eval_df, builtin_metrics):
    """
    This example custom metric function creates a metric derived from existing metrics in
    ``builtin_metrics``.
    """
    return builtin_metrics["sum_on_target"] / 2


def prediction_target_scatter(eval_df, _builtin_metrics, artifacts_dir):
    """
    This example custom artifact generates and saves a scatter plot to ``artifacts_dir`` that
    visualizes the relationship between the predictions and targets for the given model to a
    file as an image artifact.
    """
    plt.scatter(eval_df["prediction"], eval_df["target"])
    plt.xlabel("Targets")
    plt.ylabel("Predictions")
    plt.title("Targets vs. Predictions")
    plot_path = os.path.join(artifacts_dir, "example_scatter_plot.png")
    plt.savefig(plot_path)
    return {"example_scatter_plot_artifact": plot_path}


experiment_id = "4410962850347342"

with mlflow.start_run(experiment_id = experiment_id) as run:
    mlflow.sklearn.log_model(final_model_opt, "model")

    model_uri = mlflow.get_artifact_uri("model")
    
    result = mlflow.evaluate(
        model = model_uri,
        data = eval_data,
        targets = "target",
        model_type = "regressor",
        evaluators=["default"],
        custom_metrics=[
            make_metric(
                eval_fn=squared_diff_plus_one,
                greater_is_better=False,
            ),
            make_metric(
                eval_fn=sum_on_target_divided_by_two,
                greater_is_better=True,
            ),
        ],
        custom_artifacts=[prediction_target_scatter],
    )

print(f"metrics:\n{result.metrics}")
print(f"artifacts:\n{result.artifacts}")


2023/04/27 00:31:50 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
[0;32m<command-4410962850347372>[0m in [0;36m<cell line: 60>[0;34m()[0m
[1;32m     63[0m     [0mmodel_uri[0m [0;34m=[0m [0mmlflow[0m[0;34m.[0m[0mget_artifact_uri[0m[0;34m([0m[0;34m"model"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     64[0m [0;34m[0m[0m
[0;32m---> 65[0;31m     result = mlflow.evaluate(
[0m[1;32m     66[0m         [0mmodel[0m [0;34m=[0m [0mmodel_uri[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[1;32m     67[0m         [0mdata[0m [0;34m=[0m [0meval_data[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m

[0;32m/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/mlflow/models/evaluation/base.py[0m in [0;36mevaluate[0;34m(model, data, targets, model_type, dataset_path, feature_names, evaluators, evaluator_config, custom_metrics, custo

In [0]:
X_test



In [0]:
y_test

