In [1]:
sc

<pyspark.context.SparkContext at 0x7fb728405630>

In [2]:
from pyspark.sql import SQLContext, Row
sqlc = SQLContext(sc)

In [3]:
!wget https://raw.githubusercontent.com/6chaoran/DataStory/master/Titanic-Spark/train.csv
!wget https://raw.githubusercontent.com/6chaoran/DataStory/master/Titanic-Spark/test.csv

data = sc.textFile('./train.csv')

--2017-02-02 11:39:46--  https://raw.githubusercontent.com/6chaoran/DataStory/master/Titanic-Spark/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61194 (60K) [text/plain]
Saving to: ‘train.csv.4’


2017-02-02 11:39:46 (659 KB/s) - ‘train.csv.4’ saved [61194/61194]

--2017-02-02 11:39:46--  https://raw.githubusercontent.com/6chaoran/DataStory/master/Titanic-Spark/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28629 (28K) [text/plain]
Saving to: ‘test.csv.4’


2017-02-02 11:39:46 (857 KB/s) - ‘test.csv.4’ saved [28629/28629]



In [4]:
data.take(5)

['PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked',
 '1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S',
 '2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C',
 '3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S',
 '4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S']

### Step 1
- Split the fields and make sure to not include the header in the dataset itself.
- Note some variables are categorical - use some transformation to encode them as numeric values
- You may choose NOT to use all variables
- Assembler all variables you chose to keep as a Dense Vector, so you finish this step with an RDD[Vector]

In [43]:
from pyspark.mllib.linalg import Vectors, Vector
from io import StringIO
import numpy as np
import csv
import re

## INSERT YOUR CODE HERE
def load_csv(line):
    return list(csv.reader(StringIO(line), delimiter=',', quotechar='"'))[0]

def readInt(x):
    try:
        return int(x)
    except:
        return np.NaN
    
def readFloat(x):
    try:
        return float(x)
    except:
        return np.NaN

def readSex(x):
    if x=='male':
        return 1
    else:
        return 0 

def parseElement(e):
    return Row( Survived=readInt(e[1]),
                Pclass=readInt(e[2]),
                Sex=readSex(e[4]),
                Age=readFloat(e[5]),
                SibSp=readInt(e[6]),
                Parch=readInt(e[7]),
                Fare=readFloat(e[9])
    )


rdd = data.filter(lambda l: l.find("Age") < 0)
rdd = rdd.map(load_csv).map(parseElement).cache()


print(rdd.count())

print(rdd.take(6))


891
[Row(Age=22.0, Fare=7.25, Parch=0, Pclass=3, Sex=1, SibSp=1, Survived=0), Row(Age=38.0, Fare=71.2833, Parch=0, Pclass=1, Sex=0, SibSp=1, Survived=1), Row(Age=26.0, Fare=7.925, Parch=0, Pclass=3, Sex=0, SibSp=0, Survived=1), Row(Age=35.0, Fare=53.1, Parch=0, Pclass=1, Sex=0, SibSp=1, Survived=1), Row(Age=35.0, Fare=8.05, Parch=0, Pclass=3, Sex=1, SibSp=0, Survived=0), Row(Age=nan, Fare=8.4583, Parch=0, Pclass=3, Sex=1, SibSp=0, Survived=0)]


In [44]:
titanic = rdd.map(Vectors.dense)
type(titanic)
titanic.take(5)

[DenseVector([22.0, 7.25, 0.0, 3.0, 1.0, 1.0, 0.0]),
 DenseVector([38.0, 71.2833, 0.0, 1.0, 0.0, 1.0, 1.0]),
 DenseVector([26.0, 7.925, 0.0, 3.0, 0.0, 0.0, 1.0]),
 DenseVector([35.0, 53.1, 0.0, 1.0, 0.0, 1.0, 1.0]),
 DenseVector([35.0, 8.05, 0.0, 3.0, 1.0, 0.0, 0.0])]

### Step 2
- Explore the features you have assembled
- You can use RDD operations as ***map*** and ***countByKey*** to check how many different values a given feature has and also its distribution
- You can use ***Statistics*** to obtain statistics regarding your features - for instance, the corresponding means
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: you can decompose the Dense vector using its ***values*** property and turning it into a list, change the desired value and reassemble it

In [45]:
from pyspark.mllib.stat import Statistics

## INSERT YOUR CODE HERE
sibsp = titanic.map(lambda w: (w[5],1))
print(sibsp.countByKey())

summary = Statistics.colStats(titanic)
print(summary.mean())
print(summary.numNonzeros())

defaultdict(<class 'int'>, {0.0: 608, 1.0: 209, 2.0: 28, 3.0: 16, 4.0: 18, 5.0: 5, 8.0: 7})
[         nan  32.20420797   0.38159371   2.30864198   0.64758698
   0.52300786   0.38383838]
[ 891.  876.  213.  891.  577.  283.  342.]


In [46]:
age = titanic.map(lambda w: w[0])
ages = age.collect()
mean_ages = np.round(np.nanmean(ages))
print(mean_ages)
age_no_nan = titanic.map(lambda w: w[0] if (not np.isnan(w[0])) else mean_ages)

30.0


### Step 3
- Make your RDD an RDD[LabeledPoint]
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the Binary Classification Metrics to evaluate your model on the training data
- How is your model performing? Try to tune its parameters

In [47]:
from pyspark.mllib.feature import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.evaluation import BinaryClassificationMetrics

### INSERT YOUR CODE HERE
def make_labeled(row):
    return LabeledPoint(row[-1],row[:-1])

titanic_labeled = titanic.map(make_labeled)
titanic_labeled.take(5)

[LabeledPoint(0.0, [22.0,7.25,0.0,3.0,1.0,1.0]),
 LabeledPoint(1.0, [38.0,71.2833,0.0,1.0,0.0,1.0]),
 LabeledPoint(1.0, [26.0,7.925,0.0,3.0,0.0,0.0]),
 LabeledPoint(1.0, [35.0,53.1,0.0,1.0,0.0,1.0]),
 LabeledPoint(0.0, [35.0,8.05,0.0,3.0,1.0,0.0])]

In [48]:
model = RandomForest.trainClassifier(titanic_labeled, 
                                     numClasses=2, 
                                     categoricalFeaturesInfo={},
                                     numTrees=3, 
                                     featureSubsetStrategy="auto",
                                     impurity='gini', 
                                     maxDepth=4, 
                                     maxBins=32)

#print(model.toDebugString())

In [49]:
labels = titanic_labeled.map(lambda x: x.label)
features = titanic_labeled.map(lambda x: x.features)

predictions = model.predict(features)

labelsAndPredictions = labels.zip(predictions)

testErr = labelsAndPredictions.filter(lambda args: args[0] != args[1]).count() / float(titanic_labeled.count())
print('Train Error = ' + str(testErr))

Train Error = 0.1717171717171717


### Step 4
- Apply the data processing/transforming steps to the testing data
    - hint: it could be useful to revisit your past code and refactor it as function(s) taking an RDD as parameter and performing the operations on it
- Make predictions for the test data
- Save it as ***submission.csv*** and submit it to Kaggle
- What was your score?

In [50]:
### INSERT YOUR CODE HERE

test = sc.textFile('./test.csv')

def load_csv(line):
    return list(csv.reader(StringIO(line), delimiter=',', quotechar='"'))[0]

def readInt(x):
    try:
        return int(x)
    except:
        return np.NaN
    
def readFloat(x):
    try:
        return float(x)
    except:
        return np.NaN

def readSex(x):
    if x=='male':
        return 1
    else:
        return 0 

def parseElement(e):
    return Row( Survived=0,
                Pclass=readInt(e[1]),
                Sex=readSex(e[3]),
                Age=readFloat(e[4]),
                SibSp=readInt(e[5]),
                Parch=readInt(e[6]),
                Fare=readFloat(e[7])
    )


rdd2 = test.filter(lambda l: l.find("Age") < 0)
rdd2 = rdd2.map(load_csv).map(parseElement).cache()

['PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked', '892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q']
[Row(Age=34.5, Fare=330911.0, Parch=0, Pclass=3, Sex=1, SibSp=0, Survived=0)]


In [51]:
titanic2 = rdd2.map(Vectors.dense)
titanic2.take(2)

[DenseVector([34.5, 330911.0, 0.0, 3.0, 1.0, 0.0, 0.0]),
 DenseVector([47.0, 363272.0, 0.0, 3.0, 0.0, 1.0, 0.0])]

In [79]:
titanic_labeled2 = titanic2.map(make_labeled)

labels2 = titanic_labeled2.map(lambda x: x.label)
features2 = titanic_labeled2.map(lambda x: x.features)

predictions2 = model.predict(features2)

In [78]:
import pandas as pd
pd.DataFrame(list(zip(range(892,1310), 
             predictions2.map(int).collect())), 
             columns=['PassengerId','Survived']).to_csv('submission.csv', index=False)

## Result = 77,033%

## Solutions

In [88]:
# Getting the header
header = data.first()
# Removing the header
data = data.filter(lambda row: row != header)

# Parsing function - takes a row (string) and testing flag (boolean)
# It already includes the option of processing a testing file
def process_row(row, testing):
    # If it is testing, "Survived" is not available in the file
    # csv.reader is a very easy way to handle delimiters and quotes, but it only takes files - not string variables
    # Comes StringIO to the rescue, since it allows you to treat a string variable as if it was a file
    if testing:
        passenger_id, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked \
        = list(csv.reader(StringIO(row), delimiter=',', quotechar='"'))[0]
        survived = np.NaN
    # If it is training, "Survived" is available
    else:
        passenger_id, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked \
        = list(csv.reader(StringIO(row), delimiter=',', quotechar='"'))[0]
        survived = int(survived)
    # Encode the "Sex" variable as binary
    sex = 1 if sex == 'male' else 0
    # Encode the "Embarked" variable as 3 binary variables, one for each point of embark
    cherbourg = 1 if embarked == 'C' else 0
    queenstown = 1 if embarked == 'Q' else 0
    southampton = 1 if embarked == 'S' else 0
    # Try to cast age as float, making it NaN if it fails
    try:
        age = float(age)
    except:
        age = np.NaN
    # Try to cast fare as float, making it NaN if it fails
    try:
        fare = float(fare)
    except:
        fare = np.NaN
    # Assemble all variables as a dense vector
    # Some variables are being casted at this moment - in this file, there were no NA values for these variables
    # In a real-world problem, it could be the case we do not know ahead if there will be NA values or not
    # So it is recommended to handle all of them as TRY...EXCEPT blocks
    return Vectors.dense([survived, int(pclass), sex, age, int(sibsp), int(parch), fare, cherbourg, queenstown, southampton])

# Assembling a list of the features, so it is easier to match the names with their statistics later
features = ['Survived','PClass','sex','age','sibsp','parch','fare','cherbourg','queenstown','southampton']

# Apply the parsing function to the training test (testing=False)
rows = data.map(lambda row: process_row(row, False))

# Defining a function to load and parse the data for a given filename and testing flag
def load_data(filename, testing):
    data = sc.textFile(filename)
    header = data.first()
    data = data.filter(lambda row: row != header)
    rows = data.map(lambda row: process_row(row, testing))
    return rows

In [85]:
from pyspark.mllib.stat import Statistics

# Exploring the SibSp variable - getting the count for each different value
sibsp = rows.map(lambda row: (row[4], 1))
print(sibsp.countByKey())

# Exploring the PClass variable - getting the count for each different value
pclass = rows.map(lambda row: (row[1], 1))
print(pclass.countByKey())

# Computing statistics over the RDD of rows (obtained at the end of Step 1)
statistics = Statistics.colStats(rows)

# The names of the features used in Step 1
features = ['Survived','PClass','sex','age','sibsp','parch','fare','cherbourg','queenstown','southampton']

# Checking the number of non-zeros for each feature
print(list(zip(features, statistics.numNonzeros())))

# Checking the mean for each feature
# In this case, there is at leat one NaN value and therefore it affects the mean
print(list(zip(features, statistics.mean())))

# So, let's compute the mean of all numeric values
# First, select only the "Age" feature
age = rows.map(lambda row: row[3])
# Then, filter out all NaN values
age = age.filter(lambda age: not np.isnan(age))
# And calculate the mean age
ageMean = age.mean()
print(ageMean)

# Now it is time to replace all NaN values of age with its mean
# So we slice the row.values, keeping the first 3, then doing the replacements at the 4th position
# and appending the remaining values
rowsWithAge = rows.map(lambda row: Vectors.dense(list(row.values[:3]) + [ageMean if np.isnan(row.values[3]) else row.values[3]] + list(row.values[4:])))

# Defining a function to fill the missing age values using its mean
def fill_age(rows):
    age = rows.map(lambda row: row[3])
    age = age.filter(lambda age: not np.isnan(age))
    ageMean = age.mean()
    rowsWithAge = rows.map(lambda row: Vectors.dense(list(row.values[:3]) + [ageMean if np.isnan(row.values[3]) else row.values[3]] + list(row.values[4:])))
    return rowsWithAge

# Computing statistics once again on the new RDD to make sure there are no NaN values anymore
statistics2 = Statistics.colStats(rowsWithAge)

# Now, the "Age" feature has a numeric mean
print(list(zip(features, statistics2.mean())))

# Compute the correlations between all variables - Survived is the first one, so we check its correlations
print(list(zip(features, Statistics.corr(rowsWithAge)[0])))

defaultdict(<class 'int'>, {0.0: 608, 1.0: 209, 2.0: 28, 3.0: 16, 4.0: 18, 5.0: 5, 8.0: 7})
defaultdict(<class 'int'>, {1.0: 216, 2.0: 184, 3.0: 491})
[('Survived', 342.0), ('PClass', 891.0), ('sex', 577.0), ('age', 891.0), ('sibsp', 283.0), ('parch', 213.0), ('fare', 876.0), ('cherbourg', 168.0), ('queenstown', 77.0), ('southampton', 644.0)]
[('Survived', 0.38383838383838381), ('PClass', 2.308641975308642), ('sex', 0.6475869809203143), ('age', nan), ('sibsp', 0.52300785634118974), ('parch', 0.38159371492704819), ('fare', 32.204207968574657), ('cherbourg', 0.18855218855218855), ('queenstown', 0.086419753086419748), ('southampton', 0.72278338945005616)]
29.6991176471
[('Survived', 0.38383838383838381), ('PClass', 2.308641975308642), ('sex', 0.6475869809203143), ('age', 29.699117647058813), ('sibsp', 0.52300785634118974), ('parch', 0.38159371492704819), ('fare', 32.204207968574657), ('cherbourg', 0.18855218855218855), ('queenstown', 0.086419753086419748), ('southampton', 0.72278338945005

In [110]:
# Make the row into a LabeledPoint - the label is the first element
rowsLabeled = rowsWithAge.map(lambda row: LabeledPoint(row[0], Vectors.dense(row[1:])))

# Train a simple and standard RandomForest classifier with 10 trees
model = RandomForest.trainClassifier(rowsLabeled, 
                                     numClasses=2, 
                                     categoricalFeaturesInfo={},
                                     numTrees=10, 
                                     featureSubsetStrategy="auto",
                                     impurity='gini', 
                                     maxDepth=4, 
                                     maxBins=32)

# Split the labeled points into its labels and features
labels = rowsLabeled.map(lambda x: x.label)
features = rowsLabeled.map(lambda x: x.features)

# Make predictions for the features using the RandomForest model
# and then zip it together with its corresponding labels
predictionAndLabels = model.predict(features).zip(labels)

# Evaluate the metrics for the predictions made for the training set
metrics = BinaryClassificationMetrics(predictionAndLabels)

print("Area under ROC = %s" % metrics.areaUnderROC)

Area under ROC = 0.7438884095484614


In [112]:
filename = 'test.csv'

# Load the test data set, with testing=True
rows = load_data(filename, True)

# Fill the NA values for age
rowsWithAge = fill_age(rows)

# Make it an RDD of LabeledPoints
rowsLabeled = rowsWithAge.map(lambda row: LabeledPoint(row[0], Vectors.dense(row[1:])))

# Split the LabeledPoints into its labels and features
labels = rowsLabeled.map(lambda x: x.label)
features = rowsLabeled.map(lambda x: x.features)

# Make predictions for the test set
predictions = model.predict(features)

# Generate a submission file for Kaggle
pd.DataFrame(list(zip(range(892,1310), 
                 predictions.map(int).collect())), 
             columns=['PassengerId','Survived']).to_csv('submission.csv', index=False)