In [36]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import count as Fcount


In [4]:
dfpath = '../data/medium-sparkify-event-data.json'
spark = SparkSession\
        .builder\
        .appName('sparkify_etl')\
        .getOrCreate()

df = spark.read.json(dfpath)

# EDA

In [18]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [21]:
for column in df.columns:
    df.describe(column).show()

+-------+-----------------+
|summary|           artist|
+-------+-----------------+
|  count|           432877|
|   mean|527.5289537712895|
| stddev|966.1072451772758|
|    min|              !!!|
|    max|ÃÂlafur Arnalds|
+-------+-----------------+

+-------+----------+
|summary|      auth|
+-------+----------+
|  count|    543705|
|   mean|      null|
| stddev|      null|
|    min| Cancelled|
|    max|Logged Out|
+-------+----------+

+-------+---------+
|summary|firstName|
+-------+---------+
|  count|   528005|
|   mean|     null|
| stddev|     null|
|    min|  Aaliyah|
|    max|   Zyonna|
+-------+---------+

+-------+------+
|summary|gender|
+-------+------+
|  count|528005|
|   mean|  null|
| stddev|  null|
|    min|     F|
|    max|     M|
+-------+------+

+-------+------------------+
|summary|     itemInSession|
+-------+------------------+
|  count|            543705|
|   mean|107.30629109535502|
| stddev|116.72350849188155|
|    min|                 0|
|    max|          

In [23]:
for column in df.dtypes:
    if column[1] == 'string':
        df.select(column[0]).drop_duplicates().show()

+--------------------+
|              artist|
+--------------------+
|      The Black Keys|
|        Yann Tiersen|
|    Jane's Addiction|
|          Tim Hughes|
|Dashboard Confess...|
|                Silk|
|Yonder Mountain S...|
|            La Shica|
|        Elvis Crespo|
|         Silverstein|
|         Eva Cassidy|
|        Generation X|
|     Robyn Hitchcock|
|           Kate Nash|
|       Jupiter Jones|
|           Los Lobos|
|               Rufio|
|     Drive Like Jehu|
|       Yuichi Tamate|
|      Jarabe De Palo|
+--------------------+
only showing top 20 rows

+----------+
|      auth|
+----------+
|Logged Out|
| Cancelled|
|     Guest|
| Logged In|
+----------+

+---------+
|firstName|
+---------+
|   Maddox|
|    Lucas|
|   Karter|
|    Grace|
|    Irvin|
|  Janiyah|
| Antonina|
|    Allan|
|  Lorelei|
|    Devyn|
|    Bodhi|
|  Adriana|
| Isabella|
|  Everett|
|    James|
| Kamalani|
|    Wyatt|
|     Zola|
|     Nora|
|    Issac|
+---------+
only showing top 20 rows

+--

In [61]:
df_size = df.count()

for column in df.columns:
    missing_values = df_size - df.select(column).filter(df[column] != "").na.drop().count()
    print(f'{column}: {missing_values} missing values')

artist: 110828 missing values
auth: 0 missing values
firstName: 15700 missing values
gender: 15700 missing values
itemInSession: 543705 missing values
lastName: 15700 missing values
length: 543705 missing values
level: 0 missing values
location: 15700 missing values
method: 0 missing values
page: 0 missing values
registration: 543705 missing values
sessionId: 543705 missing values
song: 110828 missing values
status: 543705 missing values
ts: 543705 missing values
userAgent: 15700 missing values
userId: 15700 missing values


In [105]:
df_size = df.filter(df.auth.isin(['Logged Out', 'Guest']) == False).count()

for column in df.columns:
    missing_values = ((df_size - df.filter(df.auth.isin(['Logged Out', 'Guest']) == False)\
                                    .select(column)\
                                    .filter(df[column].cast(StringType()) != "")\
                                    .na.drop()\
                                    .count()) 
                        / df_size) * 100
    print(f'{column}: {missing_values:.2f} % missing values')

artist: 18.02 % missing values
auth: 0.00 % missing values
firstName: 0.00 % missing values
gender: 0.00 % missing values
itemInSession: 0.00 % missing values
lastName: 0.00 % missing values
length: 18.02 % missing values
level: 0.00 % missing values
location: 0.00 % missing values
method: 0.00 % missing values
page: 0.00 % missing values
registration: 0.00 % missing values
sessionId: 0.00 % missing values
song: 18.02 % missing values
status: 0.00 % missing values
ts: 0.00 % missing values
userAgent: 0.00 % missing values
userId: 0.00 % missing values


All missing values for the `userId` column are created by users that are either `Logged Out`, or logged in as a `Guest`.

The empty values for `Artist`, `Song` and `Length` are most likely due not no song being played at the time of the event. 

It is also important to note, that the `userId` is sometimes an empty string, not NaN


# Clean data

As there is no value in looking at users that are not logged in for churn analysis - their entries will be disregarded. 
The empty `Artist`, `Song` and `Length` values will not be removed, as the rest of their rows provide valuable data.

In [103]:
def clean_data(df):
    

+------+----+---------+------+-------------+--------+------+-----+--------+------+----+------------+---------+----+------+---+---------+------+
|artist|auth|firstName|gender|itemInSession|lastName|length|level|location|method|page|registration|sessionId|song|status| ts|userAgent|userId|
+------+----+---------+------+-------------+--------+------+-----+--------+------+----+------------+---------+----+------+---+---------+------+
+------+----+---------+------+-------------+--------+------+-----+--------+------+----+------------+---------+----+------+---+---------+------+



False