In [None]:
import os
import shutil

# Prepare DataFrame via Spark SQL

Because MLlib is a `DataFrame`-based machine learning API, we need to load and prepare the datasets as `DataFrame`s. Here we will use Spark SQL to load the data.

You can find the list of all Spark SQL features [here](http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html).

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Configure Spark to run in local mode ('local[*]' master).
conf = SparkConf().setMaster('local[*]')

# Build the session; getOrCreate() hands back an already-running session
# if this kernel has one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName('Python Spark SQL basic example')
    .config(conf=conf)
    # .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    .getOrCreate()
)

Here we will use the dataset from Kaggle: [Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud)

In [None]:
# Read the CSV file into a Spark DataFrame, using the first row as column names.
# No schema is requested here; a later cell casts the columns from string.
df = spark.read.option('header', True).csv('creditcard.csv')

In [None]:
# Show the schema (column names and types) of the loaded DataFrame
df.printSchema()

In [None]:
# Number of examples (rows) in the loaded DataFrame; shown as the cell output
df.count()

In [None]:
# Summary statistics for every column, converted to pandas and flipped
# so that each original column becomes one row of the displayed table.
summary_pdf = df.describe().toPandas()
summary_pdf.T

In [None]:
# Describe a single variable: summary statistics for column 'V1' only
df.describe('V1').show()

In [None]:
# Drop the 'Time' column; the result gets a new name so `df` is left untouched
clean_df = df.drop('Time')

In [None]:
# Get feature columns: every remaining column name except the 'Class' label
feature_columns = clean_df.drop('Class').columns
print(feature_columns)

In [None]:
# Convert from string to float (features) and integer ('Class') in ONE projection.
# Calling withColumn() once per column inside a loop adds a projection per call
# and can produce very large query plans; a single select() avoids that.
target_types = {name: 'float' for name in feature_columns}
target_types['Class'] = 'int'
clean_df = clean_df.select([
    # Columns without a target type are passed through unchanged.
    clean_df[name].cast(target_types[name]).alias(name)
    if name in target_types else clean_df[name]
    for name in clean_df.columns
])

In [None]:
# Display the DataFrame's repr as the cell output to check the result of the casts
clean_df

Let's aggregate the features we will use to make predictions into a single column. We can use `VectorAssembler`, a feature transformer that merges multiple columns into **a vector column**. All feature columns have to be merged into a single vector column so that they can be used by MLlib.

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack every feature column into one vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)

# transform() returns a new DataFrame; clean_df keeps its original columns.
feature_df = assembler.transform(clean_df)

## Training, Validation and Test Sets

In [None]:
# Random 80% / 10% / 10% split; the seed is fixed so the split can be reproduced.
split_weights = [0.8, 0.1, 0.1]
train_df, valid_df, test_df = feature_df.randomSplit(split_weights, seed=42)

# Class counts of each split, in train / validation / test order.
for split_df in (train_df, valid_df, test_df):
    split_df.groupBy('Class').count().show()

# Spark - Machine Learning Library (MLlib)

MLlib is Spark’s machine learning (ML) library. Its goal is to make practical machine learning scalable and easy. At a high level, it provides tools such as:

- **ML Algorithms**: common learning algorithms such as classification, regression, clustering, and collaborative filtering
- **Featurization**: feature extraction, transformation, dimensionality reduction, and selection
- **Pipelines**: tools for constructing, evaluating, and tuning ML Pipelines
- **Persistence**: saving and loading algorithms, models, and Pipelines
- **Utilities**: linear algebra, statistics, data handling, etc.

Ref: 
* https://spark.apache.org/docs/latest/ml-guide.html
* https://spark.apache.org/docs/latest/api/python/reference/pyspark.ml.html



## Training a Model

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Estimator: a decision tree that reads labels from 'Class' and inputs from 'features'.
dt_estimator = DecisionTreeClassifier(featuresCol='features', labelCol='Class')

# fit() returns a DecisionTreeClassificationModel; later cells use the
# fitted model under the name `model`.
model = dt_estimator.fit(train_df)

# from pyspark.ml import Pipeline
# dt = DecisionTreeClassifier(
#     labelCol='Class', 
#     featuresCol='features')
# pipeline = Pipeline(stages=[dt])
# model = pipeline.fit(train_df)

## Save and Load a Model

In [None]:
# Save DecisionTreeClassificationModel.
# The ML writer's overwrite() replaces an existing save at the target path, so
# no manual directory clean-up is needed, and the path is spelled out only once.
save_path = 'dt.model'
model.write().overwrite().save(save_path)

# # Save PipelineModel
# save_path = 'pipeline.model'
# if os.path.isdir(save_path):
#     shutil.rmtree(save_path)
# model.save('pipeline.model')

In [None]:
# Load the DecisionTreeClassificationModel back from the 'dt.model' path.
# NOTE(review): `load_model2` is not referenced by the later cells of this
# notebook -- the prediction cells still use the in-memory `model`.
from pyspark.ml.classification import DecisionTreeClassificationModel
load_model2 = DecisionTreeClassificationModel.load('dt.model')

# # Load PipelineModel
# from pyspark.ml import PipelineModel
# load_model = PipelineModel.load('pipeline.model')

## Prediction and Evaluation

In [None]:
# Make predictions on the validation split and on the test split.
pred_valid_df, pred_test_df = (
    model.transform(split_df) for split_df in (valid_df, test_df)
)

In [None]:
# List the column names of the validation prediction DataFrame
pred_valid_df.columns

In [None]:
# Show the 'prediction' and 'Class' columns side by side: validation first, then test.
for pred_df in (pred_valid_df, pred_test_df):
    pred_df.select('prediction', 'Class').show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy: compares the 'prediction' column against the 'Class' label.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Class',
    predictionCol='prediction')
valid_acc, test_acc = (
    acc_eval.evaluate(pred_df) for pred_df in (pred_valid_df, pred_test_df)
)
print(f'Valid acc: {valid_acc}')
print(f'Test acc: {test_acc}')

# Area under ROC: computed from the 'rawPrediction' column and the 'Class' label.
roc_eval = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='Class',
    rawPredictionCol='rawPrediction')
valid_roc, test_roc = (
    roc_eval.evaluate(pred_df) for pred_df in (pred_valid_df, pred_test_df)
)
print(f'Valid ROC: {valid_roc}')
print(f'Test ROC: {test_roc}')

# Working with Other ML Libraries

Typically, the incorporation of state-of-the-art ML techniques into Spark cannot keep up with the pace of ML research. This makes prototyping ML models with Spark time-consuming, as we need to re-implement recent ML techniques before they can work with Spark.

One potential solution is to use Spark to handle the data storage and data pre-processing, and then load a smaller dataset onto the local machine for prototyping with state-of-the-art ML libraries such as scikit-learn, TensorFlow, etc. Once we know which techniques work, we can spend time implementing only those few on the real big data in Spark.

The smaller dataset can be obtained via random sampling, selecting the recent data, etc.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Simple random sample of roughly 20% of the rows, without replacement.
# A fixed seed keeps the sample reproducible across "Restart & Run All",
# consistent with the seeded randomSplit() used for the Spark splits.
random_small_df = clean_df.sample(withReplacement=False, fraction=0.2, seed=42)

In [None]:
# Stratified sample: keep roughly 20% of the rows of each 'Class' value.
# sampleBy() always samples without replacement; the fixed seed makes the
# sample (and everything derived from it downstream) reproducible.
stratifed_small_df = clean_df.sampleBy('Class', fractions={0: 0.2, 1: 0.2}, seed=42)

In [None]:
# Class counts for the full data, the random sample and the stratified sample, in that order.
for sampled_df in (clean_df, random_small_df, stratifed_small_df):
    sampled_df.groupBy('Class').count().show()

In [None]:
# Convert the stratified Spark sample into a pandas DataFrame for use with local ML libraries
pddf = stratifed_small_df.toPandas()

In [None]:
# Display the pandas DataFrame as the cell output
pddf

In [None]:
# Split the frame into a float32 feature matrix (every column except 'Class')
# and an int32 label vector (the 'Class' column).
feature_frame = pddf.drop(columns=['Class'])
label_series = pddf['Class']
X = feature_frame.to_numpy().astype(np.float32)
y = label_series.to_numpy().astype(np.int32)

## Example: scikit-learn

Once the data is loaded into a `pandas.DataFrame`, we can reuse code similar to WS6.1 from Lecture 6.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sample, then halve the hold-out into validation and test
# (80% / 10% / 10% overall). Both splits are stratified on the label so every
# subset keeps the class ratio of the sample; an unstratified split can leave
# a rare class with very few (or no) rows in the small subsets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y,
    random_state=42,
    test_size=0.20,
    stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout,
    random_state=42,
    test_size=0.50,
    stratify=y_holdout)

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Validation set: {X_valid.shape}, {y_valid.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the sampled data.
# random_state is fixed because the forest is built with randomness; without
# it, every re-run of the notebook reports different metrics.
# NOTE: from this cell on, `model` refers to the scikit-learn forest, not the
# Spark decision-tree model trained earlier in the notebook.
model = RandomForestClassifier(
    n_estimators=50,
    max_depth=50,
    min_samples_leaf=10,
    random_state=42)

model = model.fit(X_train, y_train)

In [None]:
# Predicted labels for the training, validation and test feature matrices.
y_hat_train, y_hat_valid, y_hat_test = (
    model.predict(features) for features in (X_train, X_valid, X_test)
)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Confusion matrix, accuracy and F1-score for each split,
# reported in training / validation / test order.
split_results = [
    ('Training Set', y_train, y_hat_train),
    ('Validation Set', y_valid, y_hat_valid),
    ('Test Set', y_test, y_hat_test),
]
for position, (title, actual, predicted) in enumerate(split_results):
    if position > 0:
        print('')  # blank separator between reports (none after the last one)
    print(title)
    print(confusion_matrix(y_true=actual, y_pred=predicted))
    print(f'Accuracy: {accuracy_score(y_true=actual, y_pred=predicted):.2f}')
    print(f'F1-score: {f1_score(y_true=actual, y_pred=predicted):.2f}')