### ML with PySpark
+ Classify and Predict

In [1]:
# Load Pckgs
from pyspark import SparkContext

In [2]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
sc= SparkContext(master='local[2]')


In [3]:
# Spark UI
sc

In [4]:
# Load pkgs
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName("MLWithSpark").getOrCreate()

### WorkFlow
+ Data Preparation
+ Feature Engineering
+ Build Model
+ Evaluate

# Task 
+ Predict Customer Churn

In [7]:
# Load Dataset
df = spark.read.csv("telecom_dataset.csv", header=True, inferSchema=True)

In [8]:
# Preview Dataset
df.show()

+----------+------+---+--------------+--------------+------------+-----+
|CustomerID|Gender|Age|      Contract|MonthlyCharges|TotalCharges|Churn|
+----------+------+---+--------------+--------------+------------+-----+
|         1|Female| 25|Month-to-Month|          65.7|       156.5|   No|
|         2|  Male| 37|      One Year|          89.0|      2356.8|   No|
|         3|  Male| 52|      Two Year|         115.5|      5408.6|   No|
|         4|Female| 30|Month-to-Month|          75.9|       129.4|  Yes|
|         5|  Male| 45|      One Year|          98.2|      3142.0|   No|
|         6|Female| 55|      Two Year|          99.9|      6541.5|   No|
|         7|  Male| 32|Month-to-Month|          82.1|       267.7|  Yes|
|         8|Female| 28|Month-to-Month|          61.5|       346.9|   No|
|         9|  Male| 48|      One Year|         101.8|      5149.6|  Yes|
|        10|Female| 60|      Two Year|         108.1|      6742.8|  Yes|
|        11|  Male| 42|Month-to-Month|          78.

In [9]:
# Check for columns
print(df.columns)

['CustomerID', 'Gender', 'Age', 'Contract', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [10]:
# Re-arrange and select relevant columns
df = df.select('Gender', 'Contract', 'Churn', 'Age', 'MonthlyCharges', 'TotalCharges')

In [13]:
df.head()

Row(Gender='Female', Contract='Month-to-Month', Churn='No', Age=25, MonthlyCharges=65.7, TotalCharges=156.5)

In [14]:
# Check datatypes
df.dtypes

[('Gender', 'string'),
 ('Contract', 'string'),
 ('Churn', 'string'),
 ('Age', 'int'),
 ('MonthlyCharges', 'double'),
 ('TotalCharges', 'double')]

In [15]:
# Check for the schema
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- Churn: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)



In [16]:
# Descriptive summary
print(df.describe().show())

+-------+------+--------------+-----+------------------+-----------------+----------------+
|summary|Gender|      Contract|Churn|               Age|   MonthlyCharges|    TotalCharges|
+-------+------+--------------+-----+------------------+-----------------+----------------+
|  count|    20|            20|   20|                20|               20|              20|
|   mean|  null|          null| null|              39.7|            89.57|         2684.24|
| stddev|  null|          null| null|10.021555714926123|15.79510583760212|2389.05648771093|
|    min|Female|Month-to-Month|   No|                25|             61.5|           129.4|
|    max|  Male|      Two Year|  Yes|                60|            115.5|          6742.8|
+-------+------+--------------+-----+------------------+-----------------+----------------+

None


In [17]:
# Value Count group by contract
df.groupBy('Contract').count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-Month|    8|
|      One Year|    6|
|      Two Year|    6|
+--------------+-----+



#### Feature Engineering
+ Numerical Values
+ Vectorization
+ Scaling

In [18]:
import pyspark.ml

In [19]:
dir(pyspark.ml)

['Estimator',
 'Model',
 'Pipeline',
 'PipelineModel',
 'PredictionModel',
 'Predictor',
 'Transformer',
 'UnaryTransformer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'base',
 'classification',
 'clustering',
 'common',
 'evaluation',
 'feature',
 'fpm',
 'image',
 'linalg',
 'param',
 'pipeline',
 'recommendation',
 'regression',
 'stat',
 'tree',
 'tuning',
 'util',
 'wrapper']

In [20]:
# Load ML Pkgs
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [21]:
df.show()

+------+--------------+-----+---+--------------+------------+
|Gender|      Contract|Churn|Age|MonthlyCharges|TotalCharges|
+------+--------------+-----+---+--------------+------------+
|Female|Month-to-Month|   No| 25|          65.7|       156.5|
|  Male|      One Year|   No| 37|          89.0|      2356.8|
|  Male|      Two Year|   No| 52|         115.5|      5408.6|
|Female|Month-to-Month|  Yes| 30|          75.9|       129.4|
|  Male|      One Year|   No| 45|          98.2|      3142.0|
|Female|      Two Year|   No| 55|          99.9|      6541.5|
|  Male|Month-to-Month|  Yes| 32|          82.1|       267.7|
|Female|Month-to-Month|   No| 28|          61.5|       346.9|
|  Male|      One Year|  Yes| 48|         101.8|      5149.6|
|Female|      Two Year|  Yes| 60|         108.1|      6742.8|
|  Male|Month-to-Month|   No| 42|          78.9|       547.6|
|Female|      One Year|   No| 35|          94.7|      1950.2|
|  Male|      Two Year|   No| 41|          96.5|      4188.1|
|  Male|

In [23]:
# Convert Contract column to 0(Month to month) or 1 (One year) and 2 (Two year)
## Check unique values.
df.select('Contract').distinct().show()

+--------------+
|      Contract|
+--------------+
|Month-to-Month|
|      One Year|
|      Two Year|
+--------------+



In [24]:
# Convert the string to numerical code
# label encoding 
contractEncoder = StringIndexer(inputCol='Contract', outputCol='Package').fit(df)

In [25]:
df = contractEncoder.transform(df)

In [26]:
df.show(5)

+------+--------------+-----+---+--------------+------------+-------+
|Gender|      Contract|Churn|Age|MonthlyCharges|TotalCharges|Package|
+------+--------------+-----+---+--------------+------------+-------+
|Female|Month-to-Month|   No| 25|          65.7|       156.5|    0.0|
|  Male|      One Year|   No| 37|          89.0|      2356.8|    1.0|
|  Male|      Two Year|   No| 52|         115.5|      5408.6|    2.0|
|Female|Month-to-Month|  Yes| 30|          75.9|       129.4|    0.0|
|  Male|      One Year|   No| 45|          98.2|      3142.0|    1.0|
+------+--------------+-----+---+--------------+------------+-------+
only showing top 5 rows



In [27]:
# Encode Gender column 0(Male), 1(Female)
df.select('Gender').distinct().show()

+------+
|Gender|
+------+
|Female|
|  Male|
+------+



In [28]:
# Convert the string to numerical code
# label encoding 
genderEncoder = StringIndexer(inputCol='Gender', outputCol='Sex').fit(df)

In [29]:
df = genderEncoder.transform(df)

In [30]:
df.show()

+------+--------------+-----+---+--------------+------------+-------+---+
|Gender|      Contract|Churn|Age|MonthlyCharges|TotalCharges|Package|Sex|
+------+--------------+-----+---+--------------+------------+-------+---+
|Female|Month-to-Month|   No| 25|          65.7|       156.5|    0.0|1.0|
|  Male|      One Year|   No| 37|          89.0|      2356.8|    1.0|0.0|
|  Male|      Two Year|   No| 52|         115.5|      5408.6|    2.0|0.0|
|Female|Month-to-Month|  Yes| 30|          75.9|       129.4|    0.0|1.0|
|  Male|      One Year|   No| 45|          98.2|      3142.0|    1.0|0.0|
|Female|      Two Year|   No| 55|          99.9|      6541.5|    2.0|1.0|
|  Male|Month-to-Month|  Yes| 32|          82.1|       267.7|    0.0|0.0|
|Female|Month-to-Month|   No| 28|          61.5|       346.9|    0.0|1.0|
|  Male|      One Year|  Yes| 48|         101.8|      5149.6|    1.0|0.0|
|Female|      Two Year|  Yes| 60|         108.1|      6742.8|    2.0|1.0|
|  Male|Month-to-Month|   No| 42|     

In [31]:
# Convert the string to numerical code 0(No), 1(Yes)
# label encoding 
churnEncoder = StringIndexer(inputCol='Churn', outputCol='ChurnLabel').fit(df)

In [32]:
df = churnEncoder.transform(df)

In [33]:
df.show()

+------+--------------+-----+---+--------------+------------+-------+---+----------+
|Gender|      Contract|Churn|Age|MonthlyCharges|TotalCharges|Package|Sex|ChurnLabel|
+------+--------------+-----+---+--------------+------------+-------+---+----------+
|Female|Month-to-Month|   No| 25|          65.7|       156.5|    0.0|1.0|       0.0|
|  Male|      One Year|   No| 37|          89.0|      2356.8|    1.0|0.0|       0.0|
|  Male|      Two Year|   No| 52|         115.5|      5408.6|    2.0|0.0|       0.0|
|Female|Month-to-Month|  Yes| 30|          75.9|       129.4|    0.0|1.0|       1.0|
|  Male|      One Year|   No| 45|          98.2|      3142.0|    1.0|0.0|       0.0|
|Female|      Two Year|   No| 55|          99.9|      6541.5|    2.0|1.0|       0.0|
|  Male|Month-to-Month|  Yes| 32|          82.1|       267.7|    0.0|0.0|       1.0|
|Female|Month-to-Month|   No| 28|          61.5|       346.9|    0.0|1.0|       0.0|
|  Male|      One Year|  Yes| 48|         101.8|      5149.6|    

In [34]:
# Get the labels
churnEncoder.labels

['No', 'Yes']

In [35]:
genderEncoder.labels

['Male', 'Female']

In [36]:
contractEncoder.labels

['Month-to-Month', 'One Year', 'Two Year']

In [40]:
print(df.columns)

['Gender', 'Contract', 'Churn', 'Age', 'MonthlyCharges', 'TotalCharges', 'Package', 'Sex', 'ChurnLabel']


In [41]:
### Feature Selection
required_features = ['Age', 'MonthlyCharges', 'TotalCharges', 'Package', 'Sex', 'ChurnLabel']

In [42]:
df2 = df.select(required_features)

In [43]:
df2.show(5)

+---+--------------+------------+-------+---+----------+
|Age|MonthlyCharges|TotalCharges|Package|Sex|ChurnLabel|
+---+--------------+------------+-------+---+----------+
| 25|          65.7|       156.5|    0.0|1.0|       0.0|
| 37|          89.0|      2356.8|    1.0|0.0|       0.0|
| 52|         115.5|      5408.6|    2.0|0.0|       0.0|
| 30|          75.9|       129.4|    0.0|1.0|       1.0|
| 45|          98.2|      3142.0|    1.0|0.0|       0.0|
+---+--------------+------------+-------+---+----------+
only showing top 5 rows



In [44]:
df2 = df2.toPandas().replace('NA',0).astype(float)

In [45]:
type(df2)

pandas.core.frame.DataFrame

In [46]:
# Convert to PySpark Dataframe
new_df =  spark.createDataFrame(df2)

  for column, series in pdf.iteritems():


In [47]:
type(new_df)

pyspark.sql.dataframe.DataFrame

In [48]:
new_df.printSchema()

root
 |-- Age: double (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)
 |-- Package: double (nullable = true)
 |-- Sex: double (nullable = true)
 |-- ChurnLabel: double (nullable = true)



In [49]:
# Vectorizing
vec_assembler = VectorAssembler(inputCols=required_features, outputCol='features')

In [50]:
vec_df = vec_assembler.transform(new_df)

In [51]:
vec_df.show()

+----+--------------+------------+-------+---+----------+--------------------+
| Age|MonthlyCharges|TotalCharges|Package|Sex|ChurnLabel|            features|
+----+--------------+------------+-------+---+----------+--------------------+
|25.0|          65.7|       156.5|    0.0|1.0|       0.0|[25.0,65.7,156.5,...|
|37.0|          89.0|      2356.8|    1.0|0.0|       0.0|[37.0,89.0,2356.8...|
|52.0|         115.5|      5408.6|    2.0|0.0|       0.0|[52.0,115.5,5408....|
|30.0|          75.9|       129.4|    0.0|1.0|       1.0|[30.0,75.9,129.4,...|
|45.0|          98.2|      3142.0|    1.0|0.0|       0.0|[45.0,98.2,3142.0...|
|55.0|          99.9|      6541.5|    2.0|1.0|       0.0|[55.0,99.9,6541.5...|
|32.0|          82.1|       267.7|    0.0|0.0|       1.0|[32.0,82.1,267.7,...|
|28.0|          61.5|       346.9|    0.0|1.0|       0.0|[28.0,61.5,346.9,...|
|48.0|         101.8|      5149.6|    1.0|0.0|       1.0|[48.0,101.8,5149....|
|60.0|         108.1|      6742.8|    2.0|1.0|      

### Train, Test, Split


In [52]:
# Split dataset into training and testing data sets
train_df,test_df = vec_df.randomSplit([0.7, 0.3])

In [53]:
train_df.count()

16

#### Model building
+ Pyspark.ml: DataFrame
+ Pyspark.mllib: RDD / Legacy

In [54]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier

In [55]:
# Logistic Model
lr = LogisticRegression(featuresCol='features', labelCol='ChurnLabel')

In [56]:
lr_mode = lr.fit(train_df)

In [57]:
y_pred = lr_mode.transform(test_df)

In [58]:
print(y_pred.columns)

['Age', 'MonthlyCharges', 'TotalCharges', 'Package', 'Sex', 'ChurnLabel', 'features', 'rawPrediction', 'probability', 'prediction']


In [59]:
y_pred.select('ChurnLabel', 'rawPrediction', 'probability', 'prediction').show()

+----------+--------------------+--------------------+----------+
|ChurnLabel|       rawPrediction|         probability|prediction|
+----------+--------------------+--------------------+----------+
|       0.0|[17.8679331301987...|[0.99999998261978...|       0.0|
|       0.0|[17.9511885165234...|[0.99999998400818...|       0.0|
|       1.0|[-17.653463988255...|[2.15376340778007...|       1.0|
|       1.0|[-19.382195611167...|[3.82313291747622...|       1.0|
+----------+--------------------+--------------------+----------+



In [60]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### Model Evaluation

In [61]:
# How to check for Accuracy. Evaluate model
multi_evaluator = MulticlassClassificationEvaluator(labelCol='ChurnLabel', metricName='accuracy')

In [62]:
multi_evaluator.evaluate(y_pred)

1.0

In [None]:
# Precision, F1 Score, Recall: Classification Report

In [63]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [64]:
lr_metric = MulticlassMetrics(y_pred['ChurnLabel', 'prediction'].rdd)



In [65]:
print(lr_metric.precision(1.0))

1.0


In [66]:
print("Accuracy: ",lr_metric.accuracy)
print("Precision: ",lr_metric.precision(1.0))
print("Recall: ",lr_metric.recall(1.0))
print("F1Score: ",lr_metric.fMeasure(1.0))

Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1Score:  1.0


In [None]:
# Model prediction is 100% accurate