# This notebook looks at Titanic survival data using Spark ML

In [1]:
import os

from pprint import pprint
import warnings
warnings.filterwarnings('ignore')

from timeit import default_timer as timer

from kdp_connector import KdpConn

In [None]:
# https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
#find pyspark installation
import findspark
findspark.init()

#spark imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer,VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

ModuleNotFoundError: No module named 'pyspark'

In [4]:
# OAuth access token, taken from the ACCESS_TOKEN environment variable
# (None when the variable is not set).
jwt = os.environ.get('ACCESS_TOKEN')

## First Connect to Koverse

## Then retrieve the data into a pandas DataFrame (converted to a Spark DataFrame below)

In [None]:
# This example shows how to use the KDP Python Connector to read data from a KDP dataset
# into a pandas DataFrame.

# Get dataset_id from the URL in KDP4 when the dataset is selected.
################## Replace with your INFO #####################
email = 'spongebob@koverse.com'
# Do not hard-code a password in the notebook: read it from the environment,
# the same way the OAuth token is read from ACCESS_TOKEN. It is only needed
# when the email/password authentication line below is uncommented.
password = os.getenv('KDP_PASSWORD', '')
workspace_id = 'spongebob'
dataset_id = '12345678-1234-asd1-fgh2-378b59bf74ce'
###############################################################


host = 'https://api.dev.koverse.com'
batch_size = 100000
starting_record_id = ''
path_to_ca_file = ''


kdp_conn = KdpConn(path_to_ca_file=path_to_ca_file, host=host)
# Alternative to the ACCESS_TOKEN-based jwt: uncomment to authenticate with email/password.
#jwt = kdp_conn.create_authentication_token(email=email,password=password,workspace_id=workspace_id)

# Fetch the dataset into a pandas DataFrame; later cells use it as `pDF`.
pDF = kdp_conn.read_dataset_to_pandas_dataframe(dataset_id=dataset_id,
                                                jwt=jwt,
                                                starting_record_id=starting_record_id,
                                                batch_size=batch_size)

In [None]:
# Create (or reuse) the notebook's SparkSession, running locally with one worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("temp1")
    .getOrCreate()
)

In [None]:
# Turn the pandas frame into a Spark DataFrame, then print its schema
# and the first 10 rows.
df = spark.createDataFrame(data=pDF)
df.printSchema()
df.show(n=10)

## Explore with SQL

In [None]:
# Expose the Spark DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("titanicTemp")

# Average age per sex.
avg_age_by_sex_sql = '''
   SELECT sex, AVG(age) as avg_age
   FROM titanicTemp
   GROUP BY sex
   '''
result = spark.sql(avg_age_by_sex_sql)
result.show()

## Explore data using a dataframe or using Pandas

In [None]:
# Print the full (non-truncated) per-column summary of the pandas frame.
pDF.info(verbose=True)
# Summary statistics; as the last expression of the cell, this is what the notebook renders.
pDF.describe()

In [None]:
# Coerce the columns used for plotting. These assignments modify pDF in place;
# the feature-engineering cell later takes the mean of the numeric Age column.
pDF["Survived"] = pd.to_numeric(pDF["Survived"])
pDF["Age"] = pd.to_numeric(pDF["Age"])
pDF["Sex"] = pDF["Sex"].astype(str)

survived = 'Survived'
not_survived = 'not survived'

women = pDF[pDF['Sex'] == 'female']
men = pDF[pDF['Sex'] == 'male']

# One panel per sex; each panel overlays the age histogram of survivors and non-survivors.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot is the
# documented replacement); it is kept here so the figure is unchanged.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
for panel, (passengers, panel_title) in zip(axes, [(women, 'Female'), (men, 'Male')]):
    for outcome, outcome_label in [(1, survived), (0, not_survived)]:
        ages = passengers[passengers['Survived'] == outcome].Age.dropna()
        ax = sns.distplot(ages, bins=18, label=outcome_label, ax=panel, kde=False)
    ax.legend()
    ax = ax.set_title(panel_title)

In [None]:
# Age histograms in a grid: one row per passenger class, one column per survival outcome.
grid = sns.FacetGrid(data=pDF, row='Pclass', col='Survived', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', bins=20, alpha=0.5)
grid.add_legend();

## Create Features to be used in model

In [None]:
# Integer mean age (from the pandas frame), used to fill in missing ages.
mean = int(pDF["Age"].mean())
print(mean,type(mean))

# True where Age is NULL or NaN (evaluated after Age has been cast to int below).
age_is_missing = F.col("Age").isNull() | F.isnan(F.col("Age"))

# Derived integer features:
#   male         - 1 when Sex is 'male', else 0
#   embarked_num - 1 when Embarked is 'Q', else 0
#   age_new      - Age with missing values replaced by the mean age
df2 = (
    df
    .withColumn("male", F.when(F.col("Sex") == 'male', 1).otherwise(0).cast('int'))
    .withColumn("Age", F.col("Age").cast('int'))
    .withColumn("embarked_num", F.when(F.col("Embarked") == 'Q', 1).otherwise(0).cast('int'))
    .withColumn("age_new", F.when(age_is_missing, mean).otherwise(F.col("Age").cast('int')).cast('int'))
)

# The remaining model inputs and the label only need a cast to int.
for int_col in ("Parch", "Pclass", "SibSp", "Survived"):
    df2 = df2.withColumn(int_col, F.col(int_col).cast('int'))

In [None]:
# Sanity-check the engineered feature columns by printing their pandas summary.
feature_names = ['age_new','embarked_num', 'male', 'Parch', 'Pclass', 'SibSp']
features_pdf = df2.toPandas()[feature_names]
features_pdf.info()

In [None]:
# Feature columns fed to the model (also used as tick labels in the importance plot).
cols = ['age_new','embarked_num', 'male', 'Parch', 'Pclass', 'SibSp']

# Fixed seed so that re-running the notebook on the same data gives the same
# train/test split, the same forest and therefore the same reported error.
SEED = 42

# Pack the feature columns into the single vector column Spark ML expects.
vecassemb = VectorAssembler(inputCols=cols,outputCol='features')
df3 = vecassemb.transform(df2)

# 80/20 train/test split.
df_train, df_test = df3.randomSplit([0.8,0.2], seed=SEED)

# Fit a random forest on the training split and score the held-out split.
rf = RandomForestClassifier(featuresCol='features',labelCol='Survived', seed=SEED)
model = rf.fit(df_train)
result = model.transform(df_test)

# (prediction, true label) pairs for the test split.
predictionAndLabels = result.select("prediction", "Survived")

# Compute accuracy on the test split and report the test error (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(
    labelCol="Survived", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
# Score every passenger (training and test rows alike) with the fitted model
# and show the name, true label and prediction side by side.
all_results = model.transform(df3)
name_label_prediction = all_results.select('name','survived','prediction')
name_label_prediction.toPandas()

In [None]:
# Bar chart of the fitted model's feature importances, one bar per entry of `cols`.
importances = model.featureImportances

# Bar positions 0..n-1, one per importance value.
x_values = [*range(len(importances))]

plt.bar(x_values, importances, orientation='vertical')
plt.xticks(x_values, cols, rotation=40)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importances')

In [None]:
# Echo the `all_results` object (the output of model.transform(df3) from an earlier cell).
all_results

In [None]:
# Build the pandas frame to write back: features, passenger name, true label and prediction.
output_columns = ['age_new','embarked_num', 'male', 'Parch', 'Pclass', 'SibSp','Name','Survived','prediction']
dfOut = all_results.toPandas()[output_columns]

# Store Name with the pandas 'string' dtype, then print the frame summary.
dfOut['Name'] = dfOut['Name'].astype('string')
dfOut.info()

## Save back to Koverse

In [None]:
## Ingest data: replace the dataset ID below with your own, then uncomment the lines below to save the results back to Koverse.

#partitions_set = kdp_conn.ingest(dfOut, "5ffbfb5b-6394-4d60-836f-ef99fc582d06", jwt, batch_size)
#pprint('partitions: %s' % partitions_set)