In [1]:
import re
import numpy as np
import pandas as pd

import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import avg as Favg
from pyspark.sql.functions import count as Fcount
from pyspark.sql.functions import countDistinct as FcountDistinct
from pyspark.sql.functions import min as Fmin
from pyspark.sql.functions import max as Fmax
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Load Dataset

In [2]:
# Location of the Sparkify event log that this notebook analyses
dfpath = '../data/medium-sparkify-event-data.json'

# Obtain the Spark session for this notebook (getOrCreate reuses an active one if present)
spark = (
    SparkSession.builder
    .appName('sparkify_etl')
    .getOrCreate()
)

# Read the JSON event log into a Spark DataFrame
df = spark.read.json(dfpath)


# Clean data

Events from users who are not logged in carry no value for churn analysis, so their entries are discarded. 
The empty `Artist`, `Song` and `Length` values will not be removed, as the rest of their rows may provide valuable data.

In [10]:
# Keep only the columns that are likely to be useful for the churn analysis
valuable_columns = [
    'artist', 'auth', 'gender', 'itemInSession', 'level', 'location', 'page',
    'registration', 'sessionId', 'song', 'status', 'ts', 'userAgent', 'userId',
]
df = df.select(valuable_columns)

# Discard events whose auth state is 'Logged Out' or 'Guest'
df = df.filter(~df.auth.isin(['Logged Out', 'Guest']))

# Feature Engineering

In [11]:
def detect_os(column):
    '''
        summary:
            Maps a userAgent string to the platform used by the user
            * Microsoft
            * Apple
            * Linux
        args:
            column - userAgent string to be transformed (may be None or empty)
        returns:
            Name of the platform used, or 'Not Detected' when the value is missing,
            contains no "(word" token, or the token is not one of the known ones
    '''
    # A missing or empty user agent must not raise: this function is wrapped in a
    # Spark UDF, where an exception on a single row would abort the job.
    if not column:
        return 'Not Detected'

    # The first word directly after the first opening parenthesis is the token we classify
    match = re.search(r'\((\w+)', column)
    if match is None:
        return 'Not Detected'

    detect = match.group(1)
    if detect in ('iPhone', 'iPad', 'Macintosh'):
        return 'Apple'
    if detect in ('Windows', 'compatible'):
        return 'Microsoft'
    if detect == 'X11':
        return 'Linux'
    return 'Not Detected'

# Wrap the Python function as a Spark UDF; StringType is also udf's default return type
detect_os_udf = udf(detect_os, StringType())

# Create platform and age in hours column
# (ts - registration) is divided by 3,600,000, which converts milliseconds to hours --
# assumes both columns are epoch-millisecond timestamps; the raw user agent is dropped afterwards
df = (
    df
    .withColumn('platform', detect_os_udf(col('userAgent')))
    .withColumn('ageHours', (col('ts') - col('registration')) / 3600000)
    .drop('userAgent')
)

# Numeric encoding of the subscription level: 1 for 'paid', 0 for everything else
df = df.withColumn('levelNr', when(col('level') == 'paid', 1).otherwise(0))

## Define and flag churn

Churn is defined as having visited the `Cancellation Confirmation` page.

In [12]:
# 1 for a "Cancellation Confirmation" page event, 0 for every other event
flag_cancellation_event = udf(
    lambda page: 1 if page == "Cancellation Confirmation" else 0,
    IntegerType(),
)
df_churn = df.withColumn('ChurnFlag', flag_cancellation_event('page'))

# Per user, ordered newest event first: the frame runs from the start of that ordering up to
# the current row, so the sum covers the current event and all events with an equal or later ts.
# Every event at or before a cancellation therefore carries the cancellation's flag.
windowed = (
    Window
    .partitionBy('userId')
    .orderBy(desc('ts'))
    .rangeBetween(Window.unboundedPreceding, 0)
)
df_churn = df_churn.withColumn('Churned', Fsum('ChurnFlag').over(windowed))

# Display the rows whose Churned value is 1
df_churn.filter(df_churn.Churned == 1)

+--------------------+---------+------+-------------+-----+--------------------+--------------------+-------------+---------+--------------------+------+-------------+------+--------+------------------+-------+---------+-------+
|              artist|     auth|gender|itemInSession|level|            location|                page| registration|sessionId|                song|status|           ts|userId|platform|          ageHours|levelNr|ChurnFlag|Churned|
+--------------------+---------+------+-------------+-----+--------------------+--------------------+-------------+---------+--------------------+------+-------------+------+--------+------------------+-------+---------+-------+
|                null|Cancelled|     F|           67| free|Bridgeport-Stamfo...|Cancellation Conf...|1538016340000|      166|                null|   200|1539254318000|100010|   Apple| 343.8827777777778|      0|        1|      1|
|                null|Logged In|     F|           66| free|Bridgeport-Stamfo...|    

In [13]:
# Flag the last month of user data per user

# ts differences are divided by MS_PER_HOUR to obtain hours (assumes epoch-millisecond
# timestamps); the look-back window is LAST_MONTH_HOURS = 720 hours, i.e. 30 days.
MS_PER_HOUR = 3600000
LAST_MONTH_HOURS = 720

# Per user: timestamp of the most recent event and, where one exists, of the cancellation event
user_level = df_churn.groupBy('userId')\
                .agg(
                    Fmax('ts').alias('latestSession'),
                    Fmax(when(col('ChurnFlag') == 1, col('ts'))).alias('ChurnTime'))

# Dropping the joined columns first is a no-op on the first run and keeps the cell
# re-runnable: without it a second execution would join duplicate, ambiguous columns.
df_churn = df_churn\
    .drop('latestSession', 'ChurnTime')\
    .join(user_level, on='userId', how='left')

# Rows with Churned == 1 are measured back from the cancellation, all others from the
# user's latest event. The column is referenced by its exact name 'Churned' so the lookup
# does not depend on spark.sql.caseSensitive being disabled.
hours_before_churn = (col('ChurnTime') - col('ts')) / MS_PER_HOUR
hours_before_latest = (col('latestSession') - col('ts')) / MS_PER_HOUR
df_churn = df_churn.withColumn(
    'lastMonth',
    when(col('Churned') == 1, hours_before_churn <= LAST_MONTH_HOURS)
    .otherwise(hours_before_latest <= LAST_MONTH_HOURS))

In [14]:
# Aggregate on user level

# Only events flagged as lastMonth contribute; one output row per (userId, platform, Churned)
df_last_month = (
    df_churn
    .where(col('lastMonth'))
    .groupBy('userId', 'platform', 'Churned')
    .agg(
        Fmax('levelNr').alias('premiumUser'),
        Fmax('ageHours').alias('ageHours'),
        # when() without otherwise() yields null for non-matching rows, so the sum counts matches
        Fsum(when(col('page') == 'NextSong', 1)).alias('songsListened'),
        Fsum(when(col('page') == 'Thumbs Down', 1)).alias('downVotes'),
    )
)

# A sum over no matching rows is null; report those counts as 0 instead
df_last_month = df_last_month.fillna(0, subset=['downVotes', 'songsListened'])

df_last_month.show()

+------+------+---------+-------+------------------+-----------+------------------+-------------+-----------------+-------+---------+------------+------+----+-----------+------------+-----------------------+---------------------+
|userId|gender| platform|Churned| avgItemsInSession|premiumUser|          ageHours|songsListened|playlistAdditions|upVotes|downVotes|friendsAdded|errors|help|adsListened|sessionCount|distinctArtistsListened|distinctSongsListened|
+------+------+---------+-------+------------------+-----------+------------------+-------------+-----------------+-------+---------+------------+------+----+-----------+------------+-----------------------+---------------------+
|200002|     M|    Apple|      1|54.832911392405066|          1|1279.7555555555555|          310|                6|     15|        5|           2|     0|   1|         11|           5|                    287|                  306|
|100049|     M|    Apple|      1| 63.91210613598673|          1|1219.99444444444

## EDA & Feature Engineering results

Taken from the EDA on the smaller data subset.

|         | Not Churned | Churned |
|---------|-------------|---------|
| female  | 44%         | 45%     |
| Apple   | 44%         | 46.5%   |
| Windows | 49.9%       | 48.5%   |
| Linux   | 6.1%        | 5%      |

* It does not look like gender has an influence on churn.
* Apple users are slightly more likely to churn compared to Linux and Windows users.
* Churned users seem to be slightly more engaged, having a higher percentage of premium users, more items per session, songs listened and votes in general. 
* The one thing that stands out is that they give far more Down Votes on average. 
* They seem to be relatively newer to the service than the non churned users.

## Select and transform valuable features

* Platform
* Level
* User Age
* Songs listened
* Down Votes

In [16]:
# Columns kept for modelling: the label followed by the five predictors
features = [
    'label',
    'platformIndexed',
    'premiumUser',
    'ageHours',
    'songsListened',
    'downVotes',
]

# Encode the platform string as a numeric index, rename the churn column to 'label'
# and keep only the modelling columns
indexer = StringIndexer(inputCol='platform', outputCol='platformIndexed')
df_user = (
    indexer
    .fit(df_last_month)
    .transform(df_last_month)
    .withColumnRenamed('Churned', 'label')
    .select(features)
)

df_user.show()

+-----+---------------+-----------+------------------+-------------+---------+
|label|platformIndexed|premiumUser|          ageHours|songsListened|downVotes|
+-----+---------------+-----------+------------------+-------------+---------+
|    1|            1.0|          1|1279.7555555555555|          310|        5|
|    1|            1.0|          1|1219.9944444444445|          465|        5|
|    0|            0.0|          1|1892.6022222222223|          296|        3|
|    0|            1.0|          0|2381.4780555555553|           12|        1|
|    0|            1.0|          1|3035.9777777777776|          935|       13|
|    1|            1.0|          0|1274.4994444444444|           39|        0|
|    0|            1.0|          1|1924.1805555555557|          419|        7|
|    0|            1.0|          1|1787.1402777777778|         1831|       24|
|    1|            0.0|          0| 811.9580555555556|          503|        8|
|    0|            1.0|          1|1385.751666666666

# Modeling

In [17]:
# 80/20 train/test split with a fixed seed
train, test = df_user.randomSplit([0.8, 0.2], seed=23)

# Predictor columns that are assembled into a single vector and then min-max scaled
feature_names = [
    'platformIndexed',
    'premiumUser',
    'ageHours',
    'songsListened',
    'downVotes',
]
assembler = VectorAssembler(inputCols=feature_names, outputCol='features_vectorized')
scaler = MinMaxScaler(inputCol='features_vectorized', outputCol='features')

In [18]:
def run_pipeline(model, param_grid, data):
    '''
        summary:
            Builds an assemble -> scale -> model pipeline and fits it with 3-fold
            cross validation over the given parameter grid, scored by f1.
        args:
            model - model to be trained
            param_grid - parameter grid that is to be evaluated
            data - training data set to use
        returns:
            trained_model - the fitted cross validator model
    '''
    # assembler and scaler are the notebook-level feature stages defined outside this function
    stages = [assembler, scaler, model]
    f1_evaluator = MulticlassClassificationEvaluator(metricName='f1')

    cross_validator = CrossValidator(
        estimator=Pipeline(stages=stages),
        estimatorParamMaps=param_grid,
        evaluator=f1_evaluator,
        numFolds=3,
    )
    return cross_validator.fit(data)

def evaluate_model(model, data):
    '''
        summary:
            Scores a fitted model on a data set with the f1 metric and tabulates
            its predictions against the true labels.
        args:
            model - model to be evaluated
            data - test data set to use
        returns:
            score - f1-score of the model
            confusion_matrix - pandas DataFrame with one row per label and
                               one column per predicted value
    '''
    predictions = model.transform(data)

    f1_evaluator = MulticlassClassificationEvaluator(metricName='f1')
    score = f1_evaluator.evaluate(predictions)

    # Count rows per label and spread the counts across one column per prediction value
    counts_by_label = predictions.groupBy('label').pivot('prediction').count()
    confusion_matrix = counts_by_label.toPandas()

    return score, confusion_matrix

## Multilayer Perceptron

In [22]:
mp = MultilayerPerceptronClassifier()

# Single-point grid: layers [5, 4, 4, 2] (5 inputs, two intermediate layers of 4 nodes,
# 2 outputs) and a block size of 64
param_grid = ParamGridBuilder()\
                .addGrid(mp.layers, [[5, 4, 4, 2]])\
                .addGrid(mp.blockSize, [64])\
                .build()

# Cross-validated fit on the training split
mp_model = run_pipeline(mp, param_grid, train)

# f1-score and confusion matrix on the held-out test split
mp_score, mp_confusion = evaluate_model(mp_model, test)
# Message corrected from "multiplayer" to "multilayer" to name the model actually trained
print(f'The score for the multilayer perceptron is {mp_score:.4f}')
print(mp_confusion)

The score for the multiplayer perceptron is 0.7849
Best layer configuration: [5, 4, 4, 2]
Best block size: 64


# Closing words

The multilayer perceptron seems to be performing the best on the small dataset used here with an f1-score of 0.785. 
The configuration used to achieve these results is:
* 5 nodes in the input layer
* 4 nodes in the first intermediate layer
* 4 nodes in the second intermediate layer
* 2 nodes in the output layer
* A block size of 64

This configuration will be used to train and evaluate the model on the entire dataset as well.