# Predict expected e-commerce annual customer spend

In [1]:
# --- Standard library imports --- #
from pathlib import Path
import urllib.request as ureq
from os import unlink as os_unlink
from tempfile import NamedTemporaryFile


# --- PySpark imports --- #
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

from pyspark.sql.dataframe import DataFrame # For type assertion.

In [2]:
# --- Notebook configuration --- #
# Name under which the Spark application is registered.
APP_NAME: str = "eCommerceSpend"
# Local directory (relative to the working directory) that holds downloaded data.
DATA_SUBDIR: Path = Path("data")
# Remote CSV file containing the e-commerce customer records.
DATA_URL: str = "https://github.com/krishnaik06/PysparkRegressions/raw/master/Ecommerce_Customers.csv"


def print_methods(obj: object) -> None:
    """Print the public attribute names of ``obj``, one per line.

    Names come from ``dir(obj)``; anything starting with an underscore is
    skipped. Intended as an interactive exploration helper.

    Parameters
    ----------
    obj : object
        Any Python object to inspect.
    """
    public_names = [name for name in dir(obj) if not name.startswith("_")]
    print("\n".join(public_names))

In [3]:
# Obtain the Spark session for this notebook (an existing session is reused
# if one is already running, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)

In [4]:
def create_csv_dataframe(url: str) -> DataFrame:
    """Download a remote CSV file and load it into a PySpark DataFrame.

    The response body is decoded as UTF-8, written to a temporary file inside
    ``DATA_SUBDIR`` and read back with ``spark.read.csv`` (first row used as
    the header, column types inferred).

    The temporary file is created with ``delete=False`` and is deliberately
    NOT removed here, so it remains on disk for as long as the returned
    DataFrame is in use. The caller is responsible for cleaning it up.

    Parameters
    ----------
    url : str
        URL of the remote, internet-hosted CSV file.

    Returns
    -------
    DataFrame
        ``pyspark.sql.dataframe.DataFrame`` built from the downloaded file,
        or ``None`` when the response body is empty.

    Raises
    ------
    AssertionError
        If the object returned by ``spark.read.csv`` is not a DataFrame.
    """
    # Read the whole payload first so the connection is closed before any
    # file or Spark work starts.
    with ureq.urlopen(url) as resp:
        csv_text = resp.read().decode("utf-8")

    if not csv_text:
        # Nothing was downloaded; signal that explicitly to the caller.
        return None

    # NamedTemporaryFile(dir=...) raises if the target directory is missing.
    DATA_SUBDIR.mkdir(parents=True, exist_ok=True)

    # newline="" disables newline translation so the CSV is stored exactly as
    # downloaded; the context manager guarantees the handle is closed (and the
    # data flushed) even if the write fails.
    with NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        newline="",
        dir=DATA_SUBDIR,
        delete=False,
    ) as tempf:
        tempf.write(csv_text)
        tmp_fp = tempf.name

    df_ = spark.read.csv(tmp_fp, inferSchema=True, header=True)

    assert isinstance(df_, DataFrame), "Data object type error."

    return df_

# Download the customer data and load it into Spark.
dataset = create_csv_dataframe(DATA_URL)

# https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame
# create_csv_dataframe yields None when the download was empty, so test for
# that explicitly instead of relying on the truthiness of a DataFrame.
if dataset is not None:
    dataset.printSchema()
    print(dataset.columns)

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)

['Email', 'Address', 'Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership', 'Yearly Amount Spent']


### Create field of vectorized predictors

In [5]:
# Target (label) column and the name given to the assembled feature vector.
dependent_variable: str = "Yearly Amount Spent"
features_col: str = "features"

# Every column except the two identifying string columns and the target
# is used as a predictor.
_excluded_columns = ("Email", "Address", dependent_variable)
independent_variables = [
    column_name
    for column_name in dataset.columns
    if column_name not in _excluded_columns
]

# Combines the predictor columns into a single vector column.
feature_assmblr = VectorAssembler(inputCols=independent_variables, outputCol=features_col)

In [6]:
# Add the assembled feature-vector column to the dataset and preview
# the first ten rows.
va_df = feature_assmblr.transform(dataset)
va_df.show(10)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [7]:
# Summary statistics for the predictors together with the target column.
_summary_columns = [*independent_variables, dependent_variable]
va_df.select(_summary_columns).describe().show()

+-------+------------------+------------------+------------------+--------------------+-------------------+
|summary|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+-------+------------------+------------------+------------------+--------------------+-------------------+
|  count|               500|               500|               500|                 500|                500|
|   mean|    33.05319351824|12.052487936928012|37.060445421080004|  3.5334615559300007|  499.3140382608002|
| stddev|0.9925631111602911|0.9942156084624618|1.0104889068105993|  0.9992775024367542|  79.31478155115914|
|    min|       29.53242897|       8.508152176|       33.91384725|          0.26990109|        256.6705823|
|    max|       36.13966249|       15.12699429|       40.00518164|         6.922689335|        765.5184619|
+-------+------------------+------------------+------------------+--------------------+-------------------+



#### Create 'final' dataframe from dependent variable and vectorized features.

In [8]:
# Keep only what the model consumes: the feature vector and the target.
_model_columns = (features_col, dependent_variable)
final_df = va_df.select(*_model_columns)
final_df.show(10)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
+--------------------+-------------------+
only showing top 10 rows



#### Split data for training and testing

In [9]:
# Fraction of rows assigned to the training split; the rest is held out
# for testing.
train_pct = 0.75

# Fixed seed so the train/test split (and therefore the fitted coefficients
# and the plots further down) can be repeated across kernel restarts.
SPLIT_SEED = 42

df_train, df_test = final_df.randomSplit([train_pct, 1 - train_pct], seed=SPLIT_SEED)

#### Build and fit regression model

In [10]:
# Configure the linear-regression estimator, then fit it on the training split.
regr = LinearRegression(
    featuresCol=features_col,
    labelCol=dependent_variable,
)
regr_mod = regr.fit(df_train)

In [11]:
# Show the fitted coefficients (one per predictor) followed by the intercept.
_fitted_coefficients = regr_mod.coefficients
_fitted_intercept = regr_mod.intercept
print(_fitted_coefficients, "\n", _fitted_intercept)

[26.09043333353722,38.51786580162793,0.6099454744483147,61.39640784603681] 
 -1066.196241241698


In [12]:
# Evaluate the fitted model on the held-out split. The returned object
# exposes the scored rows (features, label, prediction) via `.predictions`.
pred = regr_mod.evaluate(df_test)
pred.predictions.show(40)

+--------------------+-------------------+------------------+
|            features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.73772037,12.6...|        461.7807422|  450.747454024083|
|[30.81620065,11.8...|        266.0863409| 283.4142781417984|
|[30.87948434,13.2...|           490.2066| 493.6418209103606|
|[31.06132516,12.3...|        487.5554581| 493.3265759950077|
|[31.35847719,12.8...|        495.1759504| 491.0044857492155|
|[31.42522688,13.2...|        530.7667187| 534.5473868126089|
|[31.51473786,12.5...|         489.812488|495.69239821751444|
|[31.52575242,11.3...|        443.9656268|449.70731353505266|
|[31.66104982,11.3...|        416.3583536|417.57422463781245|
|[31.72165236,11.7...|        347.7769266|350.06083415960006|
|[31.73663569,10.7...|        496.9334463| 494.5196666134591|
|[31.76561882,12.4...|        496.5540816|501.49715759708215|
|[31.82797906,12.4...|        440.0027475|449.65413167999986|
|[31.864

### Visualize predicted vs. actual spend on the test set

In [13]:
# Bokeh imports for the plotting cells that follow.
# NOTE(review): Legend, LegendItem, output_file and HoverTool are not
# referenced in the cells visible in this notebook — confirm before removing.
from bokeh.models import ColumnDataSource,Legend, LegendItem
from bokeh.plotting import figure, show, output_file
from bokeh.models.tools import HoverTool
from bokeh.io import output_notebook
# Direct Bokeh's `show()` output to the notebook.
output_notebook()

In [14]:
# Bring the label and the prediction to the driver in ONE collect so that
# every actual value is guaranteed to stay paired with the prediction from
# the same row (two independent collects give no such guarantee and run
# the Spark job twice).
_prediction_rows = pred.predictions.select(dependent_variable, "prediction").collect()

actual_values: list = [row[0] for row in _prediction_rows]
predicted_values: list = [row[1] for row in _prediction_rows]

In [15]:
# Hover tooltip rows: the point index plus the actual and predicted values
# (read from the data-source columns of the same name, two decimals).
P1_TOOLTIPS = [
    ("index", "$index"),
    ("Act.", "@actual{0.00}"),
    ("Pred.", "@predicted{0.00}"),
]

p1 = figure(
    title="Pred. vs. Actual Outcome",
    tools = "pan,wheel_zoom,reset,save",
    toolbar_location="above",
    tooltips=P1_TOOLTIPS,
    plot_width = 1000,
    plot_height = 700,
)

# Column data shared by the plots: one entry per test-set row.
ddict = {
    "index": list(range(len(actual_values))),
    "actual": actual_values,
    "predicted": predicted_values,
}

srce = ColumnDataSource(data = ddict)

# Renderer: actual values on the x-axis, predicted values on the y-axis.
p1.scatter(x = "actual", y = "predicted", size=8, source = srce)

# Axis labels must match the columns plotted above
# (x = "actual", y = "predicted").
p1.xaxis.axis_label = "Actual"
p1.xaxis.axis_line_width = 2

p1.yaxis.axis_label = "Pred."
p1.yaxis.axis_line_width = 2

show(p1)

In [16]:
# Hover tooltip rows: the point index plus the actual and predicted values.
P2_TOOLTIPS = [
    ("index", "$index"),
    ("Act.", "@actual{0.00}"),
    ("Pred.", "@predicted{0.00}"),
]

p2 = figure(
    title="Pred. vs. Actual Outcome",
    tools = "pan,wheel_zoom,reset,save",
    toolbar_location="above",
    tooltips = P2_TOOLTIPS,
    sizing_mode="stretch_width",
    plot_height=800
)

# Renderers: one line per series, both drawn from the `srce` data source
# built in the scatter-plot cell. The legend label is the column name.
for _series_name, _series_color in (("actual", "blue"), ("predicted", "red")):
    p2.line(
        x = "index",
        y = _series_name,
        legend_label = _series_name,
        color = _series_color,
        source = srce,
    )

# Axis labels so the figure can be read on its own: x is the position of the
# row in the collected test-set predictions, y is the target column.
p2.xaxis.axis_label = "index"
p2.yaxis.axis_label = dependent_variable

# Plot colors
p2.background_fill_color = (230,240,239)
p2.outline_line_color = (84,26,72)

show(p2)

In [22]:
# If we're happy with our data, remove the temp file(s).
#
# Only files that look like the downloads written by NamedTemporaryFile are
# removed: they sit directly inside DATA_SUBDIR and their names start with
# tempfile's default "tmp" prefix. Anything else stored under the data
# directory (including sub-directories) is left untouched.
TEMP_FILE_PREFIX = "tmp"

for _path in DATA_SUBDIR.glob(f"{TEMP_FILE_PREFIX}*"):
    if _path.is_file():
        _path.unlink()


In [23]:
# Final cell: announce completion, then stop the Spark session that was
# created near the top of the notebook.
print("Bye, bye!")
spark.stop()

Bye, bye!
