In [1]:
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark Titanic Dataset") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [2]:
df = spark.read.csv('data/Titanic.csv',header=True, inferSchema = True)

In [3]:
df = df.drop('_c0')

In [4]:
# Drop Columns with Null Values
df = df.na.drop()

In [5]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|PC 17599|71.2833|  C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|  113803|   53.1| C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|   17463|51.8625|  E46|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1| PP 9549|   16.7|   G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|  113783|  26.55| C103|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
only showing top 5 rows



In [6]:
df.take(5)

[Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=7, Survived=0, Pclass=1, Name='McCarthy, Mr. Timothy J', Sex='male', Age=54.0, SibSp=0, Parch=0, Ticket='17463', Fare=51.8625, Cabin='E46', Embarked='S'),
 Row(PassengerId=11, Survived=1, Pclass=3, Name='Sandstrom, Miss. Marguerite Rut', Sex='female', Age=4.0, SibSp=1, Parch=1, Ticket='PP 9549', Fare=16.7, Cabin='G6', Embarked='S'),
 Row(PassengerId=12, Survived=1, Pclass=1, Name='Bonnell, Miss. Elizabeth', Sex='female', Age=58.0, SibSp=0, Parch=0, Ticket='113783', Fare=26.55, Cabin='C103', Embarked='S')]

In [7]:
df.describe

<bound method DataFrame.describe of DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]>

In [8]:
df.dtypes

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [9]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [10]:
# See the number of passengers under 25
df.filter(df.Age < 25).count()

48

In [11]:
# Show passengers under 25
df.filter(df.Age < 25).show(20)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+---------------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|          Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+---------------+--------+
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1| PP 9549|    16.7|             G6|       S|
|         28|       0|     1|Fortune, Mr. Char...|  male|19.0|    3|    2|   19950|   263.0|    C23 C25 C27|       S|
|         89|       1|     1|Fortune, Miss. Ma...|female|23.0|    3|    2|   19950|   263.0|    C23 C25 C27|       S|
|         98|       1|     1|Greenfield, Mr. W...|  male|23.0|    0|    1|PC 17759| 63.3583|        D10 D12|       C|
|        103|       0|     1|White, Mr. Richar...|  male|21.0|    0|    1|   35281| 77.2875|            D26|       S|
|        119|       0|     1|Baxter, Mr. Quigg...|  male

In [12]:
# Display the mean age and Std. dev.
from pyspark.sql.functions import mean, stddev

df.select(mean(df.Age), stddev(df.Age)).show()

+----------------+------------------+
|        avg(Age)|  stddev_samp(Age)|
+----------------+------------------+
|35.6744262295082|15.643865966849717|
+----------------+------------------+



In [13]:
# See the amount of people who embarked from S
df.filter(df.Embarked == 'S').count()

116

In [14]:
# Show the people who embarked from S
df.filter(df.Embarked == 'S').show(20)

+-----------+--------+------+--------------------+------+----+-----+-----+-----------+-------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------+-------+-----------+--------+
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|     113803|   53.1|       C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      17463|51.8625|        E46|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|    PP 9549|   16.7|         G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|     113783|  26.55|       C103|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|  male|34.0|    0|    0|     248698|   13.0|        D56|       S|
|         24|       1|     1|Sloper, Mr. Willi...|  male|28.0|    0|    

In [16]:
import pyspark.sql.functions as f

names_list = df.select(f.collect_list('Name')).first()
print(names_list[0][:5])

['Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'McCarthy, Mr. Timothy J', 'Sandstrom, Miss. Marguerite Rut', 'Bonnell, Miss. Elizabeth']


In [17]:
df.select('Sex').distinct().show()

+------+
|   Sex|
+------+
|female|
|  male|
+------+



In [18]:
print(df.filter(df['Sex'] == 'male').count())
print(df.filter(df['Sex'] == 'female').count())

95
88
