In [1]:
import pyspark
from pyspark import SparkContext

try:
    sc
except NameError:
    sc = SparkContext()

In [2]:
spark = pyspark.sql.SparkSession.builder \
    .master("local") \
    .appName("Titanic") \
    .getOrCreate()

In [3]:
df = spark.read.csv('/Users/paul/Code/Columbia/PDL-Cloudproc/titanic/train.csv', header=True, inferSchema=True)

In [4]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [5]:
# df = df.drop('Name', 'Ticket')
# df.show(5)

+-----------+--------+------+------+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+------+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  male|22.0|    1|    0|   7.25| null|       S|
|          2|       1|     1|female|38.0|    1|    0|71.2833|  C85|       C|
|          3|       1|     3|female|26.0|    0|    0|  7.925| null|       S|
|          4|       1|     1|female|35.0|    1|    0|   53.1| C123|       S|
|          5|       0|     3|  male|35.0|    0|    0|   8.05| null|       S|
+-----------+--------+------+------+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [6]:
import pyspark.sql.functions as f

df = df.withColumn('Age', f.when(f.col("Age").isNull(), -1).otherwise(f.col("Age")))

In [7]:
# df = df.dropna()
df.show(5)

+-----------+--------+------+------+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+------+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  male|22.0|    1|    0|   7.25| null|       S|
|          2|       1|     1|female|38.0|    1|    0|71.2833|  C85|       C|
|          3|       1|     3|female|26.0|    0|    0|  7.925| null|       S|
|          4|       1|     1|female|35.0|    1|    0|   53.1| C123|       S|
|          5|       0|     3|  male|35.0|    0|    0|   8.05| null|       S|
+-----------+--------+------+------+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [8]:
df = df.withColumn('Sex',
    f.when(
        f.col('Sex') == 'male', 1
    ).when(
        f.col('Sex') == 'female', 0
    )
)
df.show(5)

+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  1|22.0|    1|    0|   7.25| null|       S|
|          2|       1|     1|  0|38.0|    1|    0|71.2833|  C85|       C|
|          3|       1|     3|  0|26.0|    0|    0|  7.925| null|       S|
|          4|       1|     1|  0|35.0|    1|    0|   53.1| C123|       S|
|          5|       0|     3|  1|35.0|    0|    0|   8.05| null|       S|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def male(x):
    return int(x == 'male')

is_male = udf(male, IntegerType())
df.withColumn("Sex", is_male("Sex")).show(5)

+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  0|22.0|    1|    0|   7.25| null|       S|
|          2|       1|     1|  0|38.0|    1|    0|71.2833|  C85|       C|
|          3|       1|     3|  0|26.0|    0|    0|  7.925| null|       S|
|          4|       1|     1|  0|35.0|    1|    0|   53.1| C123|       S|
|          5|       0|     3|  0|35.0|    0|    0|   8.05| null|       S|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [10]:
set(df.select('Embarked').collect())

{Row(Embarked='C'), Row(Embarked='Q'), Row(Embarked='S'), Row(Embarked=None)}

In [11]:
def embarked(x):
    return {
        'C': 1,
        'Q': 2,
        'S': 3,
        None: -1,
    }.get(x)

df.withColumn("Embarked", udf(embarked, IntegerType())('Embarked'))

df = df.withColumn('Embarked',
    f.when(
        f.col('Embarked') == 'C', 1
    ).when(
        f.col('Embarked') == 'Q', 2
    ).when(
        f.col('Embarked') == 'S', 3
    ).otherwise(-1)  
)
df.show(5)

+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  1|22.0|    1|    0|   7.25| null|       3|
|          2|       1|     1|  0|38.0|    1|    0|71.2833|  C85|       1|
|          3|       1|     3|  0|26.0|    0|    0|  7.925| null|       3|
|          4|       1|     1|  0|35.0|    1|    0|   53.1| C123|       3|
|          5|       0|     3|  1|35.0|    0|    0|   8.05| null|       3|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [12]:
df = df.withColumn('Cabin', f.lower(f.col('Cabin')))
df.show(5)

+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  1|22.0|    1|    0|   7.25| null|       3|
|          2|       1|     1|  0|38.0|    1|    0|71.2833|  c85|       1|
|          3|       1|     3|  0|26.0|    0|    0|  7.925| null|       3|
|          4|       1|     1|  0|35.0|    1|    0|   53.1| c123|       3|
|          5|       0|     3|  1|35.0|    0|    0|   8.05| null|       3|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [13]:
df = df.withColumn('Cabin', f.substring(f.col('Cabin'), 0, 1))
df.show(5)

+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  1|22.0|    1|    0|   7.25| null|       3|
|          2|       1|     1|  0|38.0|    1|    0|71.2833|    c|       1|
|          3|       1|     3|  0|26.0|    0|    0|  7.925| null|       3|
|          4|       1|     1|  0|35.0|    1|    0|   53.1|    c|       3|
|          5|       0|     3|  1|35.0|    0|    0|   8.05| null|       3|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [14]:
import string

def cabin_level(x):
    if x is None:
        return -1

    return string.ascii_lowercase.index(x)

df = df.withColumn("Cabin", udf(cabin_level, IntegerType())('Cabin'))
df.show(5)

+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|PassengerId|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|Cabin|Embarked|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
|          1|       0|     3|  1|22.0|    1|    0|   7.25|   -1|       3|
|          2|       1|     1|  0|38.0|    1|    0|71.2833|    2|       1|
|          3|       1|     3|  0|26.0|    0|    0|  7.925|   -1|       3|
|          4|       1|     1|  0|35.0|    1|    0|   53.1|    2|       3|
|          5|       0|     3|  1|35.0|    0|    0|   8.05|   -1|       3|
+-----------+--------+------+---+----+-----+-----+-------+-----+--------+
only showing top 5 rows



In [15]:
feature_cols = list(df.columns)
feature_cols.remove('Survived')
feature_cols.remove('PassengerId')
# feature_cols.remove('Cabin')
feature_cols

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

In [29]:
VectorAssembler?

In [16]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

features = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",  # PySpark uses this as the default column

).transform(df)
features

DataFrame[PassengerId: int, Survived: int, Pclass: int, Sex: int, Age: double, SibSp: int, Parch: int, Fare: double, Cabin: int, Embarked: int, features: vector]

In [17]:
train, test = features.randomSplit([1.0, 0.0], seed=2018)
# train, test = features.randomSplit([0.7, 0.3], seed=2018)

In [18]:
from pyspark.ml.classification import LogisticRegression

In [19]:
lr = LogisticRegression(maxIter=10, featuresCol='features', labelCol='Survived')

model = lr.fit(train)

In [20]:
model.coefficientMatrix, model.interceptVector

(DenseMatrix(1, 8, [-0.5246, -2.9063, -0.0312, -0.3224, -0.1403, 0.0087, 0.1996, -0.1354], 1),
 DenseVector([3.5279]))

In [21]:
predictions = model.transform(test)
predictions.select(*feature_cols).show(10)

+------+---+---+-----+-----+----+-----+--------+
|Pclass|Sex|Age|SibSp|Parch|Fare|Cabin|Embarked|
+------+---+---+-----+-----+----+-----+--------+
+------+---+---+-----+-----+----+-----+--------+



In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

print('Test Area Under ROC', BinaryClassificationEvaluator(labelCol='Survived').evaluate(predictions))

Test Area Under ROC 0.5


In [23]:
test_df = spark.read.csv('/Users/paul/Code/Columbia/PDL-Cloudproc/titanic/test.csv', header=True, inferSchema=True)
test_df = test_df.drop('Name', 'Ticket')
test_df = test_df.withColumn('Age', f.when(f.col("Age").isNull(), -1).otherwise(f.col("Age")))
test_df = test_df.withColumn('Sex',
    f.when(
        f.col('Sex') == 'male', 1
    ).when(
        f.col('Sex') == 'female', 0
    )
)
test_df = test_df.withColumn('Embarked',
    f.when(
        f.col('Embarked') == 'C', 1
    ).when(
        f.col('Embarked') == 'Q', 2
    ).when(
        f.col('Embarked') == 'S', 3
    ).otherwise(-1)  
)
# test_df = test_df.drop('Cabin')
test_df = test_df.withColumn('Cabin', f.lower(f.col('Cabin')))
test_df = test_df.withColumn('Cabin', f.substring(f.col('Cabin'), 0, 1))
test_df = test_df.withColumn("Cabin", udf(cabin_level, IntegerType())('Cabin'))

feature_cols = list(test_df.columns)
feature_cols.remove('PassengerId')
feature_cols

test = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
).transform(test_df.fillna(0))

predictions = model.transform(test)

In [24]:
# predictions.select(['PassengerId', 'prediction']).coalesce(1).write.csv('results.csv')

In [25]:
pddf = predictions.select(['PassengerId', 'prediction']).toPandas()
pddf.columns = ['PassengerId', 'Survived']
pddf

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,1.0
...,...,...
413,1305,0.0
414,1306,1.0
415,1307,0.0
416,1308,0.0


In [26]:
pddf.to_csv('results.csv', index=False, header=True)

In [28]:
 LogisticRegression?