In [1]:
sc

<pyspark.context.SparkContext at 0x7fa98c0a0bd0>

In [2]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

### Step 1
- Load the train and test sets
- Check the schema, the variables have their right types?
- If not, how to correctly load the datasets?

In [3]:
train = sqlc.read.csv("train.csv", header=True)

In [4]:
test = sqlc.read.csv("test.csv", header=True)

In [5]:
train.show()

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25|     |       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925|     |       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05|     |       S|
|          6|       0|     3|    Moran, Mr. James|  male|   |    0|    0|          33087

In [6]:
train.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
train.createOrReplaceTempView("passengers")
sqlc.sql("select * from passengers where age =''").count()

177

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

customSchema = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Survived", DoubleType(), True),
                           StructField("Pclass", IntegerType(), True), 
                           StructField("Name", StringType(), True),
                           StructField("Sex", StringType(), True),
                           StructField("Age", DoubleType(), True),
                           StructField("SibSp", IntegerType(), True),
                           StructField("Parch", IntegerType(), True),
                           StructField("Ticket", StringType(), True),
                           StructField("Fare", DoubleType(), True),
                           StructField("Cabin", StringType(), True),
                           StructField("Embarked", StringType(), True)])

customSchema2 = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Pclass", IntegerType(), True), 
                           StructField("Name", StringType(), True),
                           StructField("Sex", StringType(), True),
                           StructField("Age", DoubleType(), True),
                           StructField("SibSp", IntegerType(), True),
                           StructField("Parch", IntegerType(), True),
                           StructField("Ticket", StringType(), True),
                           StructField("Fare", DoubleType(), True),
                           StructField("Cabin", StringType(), True),
                           StructField("Embarked", StringType(), True)])

In [9]:
train = sqlc.read.csv("train.csv", header=True, schema=customSchema)
test = sqlc.read.csv("test.csv", header=True, schema=customSchema2)

In [10]:
train.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: double (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [11]:
test.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: ***na*** property of DataFrames provide several methods of handling NA values

In [12]:
train_desc = train.describe().toPandas().set_index('summary')
train_desc

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.3838383838383838,2.308641975308642,29.69911764705882,0.5230078563411896,0.3815937149270482,32.2042079685746
stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,14.526497332334037,1.1027434322934315,0.8060572211299488,49.69342859718089
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
print {col:train.stat.corr('Survived',col) for col in ['Pclass','Age','SibSp','Parch','Fare']}

{'Fare': 0.2573065223849626, 'Age': 0.010539215871285682, 'SibSp': -0.0353224988857356, 'Pclass': -0.3384810359610151, 'Parch': 0.08162940708348339}


In [14]:
print {col:train.where(train[col].isNull()).count() for col in train.columns}

{'Fare': 0, 'Name': 0, 'Embarked': 0, 'Age': 177, 'Parch': 0, 'Pclass': 0, 'Sex': 0, 'Survived': 0, 'SibSp': 0, 'PassengerId': 0, 'Ticket': 0, 'Cabin': 0}


In [15]:
ageMean = float(train_desc.loc['mean']['Age'])

In [16]:
trainFilled = train.na.fill({'Age': ageMean})
testFilled = test.na.fill({'Age': ageMean})

### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [17]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
indexer = StringIndexer().setInputCol("Embarked").setOutputCol("nEmbarked")
indexed1 = indexer.fit(trainFilled).transform(trainFilled)

indexer = StringIndexer().setInputCol("Sex").setOutputCol("nSex")
indexed2 = indexer.fit(indexed1).transform(indexed1)

encoder = OneHotEncoder().setInputCol("nEmbarked").setOutputCol("vEmbarked")
encoded1 = encoder.transform(indexed2)

encoder = OneHotEncoder().setInputCol("nSex").setOutputCol("vSex")
encoded2 = encoder.transform(encoded1)

In [18]:
encoded1.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+----+-------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|nEmbarked|nSex|    vEmbarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+----+-------------+
|          1|     0.0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25|     |       S|      0.0| 0.0|(3,[0],[1.0])|
|          2|     1.0|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|      1.0| 1.0|(3,[1],[1.0])|
|          3|     1.0|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925|     |       S|      0.0| 1.0|(3,[0],[1.0])|
|          4|     1.0|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|      0.0| 1.0|(3,[0],

In [19]:
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

assembler = VectorAssembler(inputCols=['Pclass','Age','SibSp','Parch','Fare','vSex','vEmbarked'], outputCol='vFeatures')
assembled = assembler.transform(encoded2)

assembled2 = assembled.select("Survived","vFeatures")

### Step 4
- In Step 5, you will apply a normalization Estimator
- BUT, it does not accept feature vectors of the Sparse type
- So, it is neccessary to apply an User Defined Function to make all features vectors of type VectorUDT
- In this step, you only have to replace ***YOUR DATAFRAME*** and ***NEW DATAFRAME*** with your variables

In [20]:
from pyspark.sql.functions import UserDefinedFunction
from pyspark.ml.linalg import VectorUDT, Vectors

to_vec = UserDefinedFunction(lambda x: Vectors.dense(x.toArray()), VectorUDT())

assembled3 = assembled2.select("Survived", to_vec("vFeatures").alias("features"))

### Step 5
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 4

In [21]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler().setInputCol("features").setOutputCol("scaledFeat").setWithStd(True).setWithMean(True)
scalerModel = scaler.fit(assembled3)
scaled = scalerModel.transform(assembled3)

### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the Binary Classification Evaluator to evaluate your model on the training data
- How is your model performing? Try to tune its parameters

In [22]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

rfC = RandomForestClassifier().setLabelCol("Survived") \
                                .setFeaturesCol("scaledFeat") \
                                .setNumTrees(50)

In [23]:
model = rfC.fit(scaled)

In [24]:
predictions = model.transform(scaled)
predictions.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+
|Survived|            features|          scaledFeat|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------------------+--------------------+----------+
|     0.0|[3.0,22.0,1.0,0.0...|[0.82691281652436...|[45.1444549359466...|[0.90288909871893...|       0.0|
|     1.0|[1.0,38.0,1.0,0.0...|[-1.5652278312782...|[2.07990312533043...|[0.04159806250660...|       1.0|
|     1.0|[3.0,26.0,0.0,0.0...|[0.82691281652436...|[26.7242611681227...|[0.53448522336245...|       0.0|
|     1.0|[1.0,35.0,1.0,0.0...|[-1.5652278312782...|[5.35564640559562...|[0.10711292811191...|       1.0|
|     0.0|[3.0,35.0,0.0,0.0...|[0.82691281652436...|[44.1763574665143...|[0.88352714933028...|       0.0|
|     0.0|[3.0,29.699117647...|[0.82691281652436...|[43.6161033303125...|[0.87232206660625...|       0.0|
|     0.0|[1.0,54.0,0.0,0.0...|[-1.56522783127

In [25]:
model.featureImportances.toArray()

array([ 0.12343815,  0.07198568,  0.03726658,  0.04504931,  0.1717449 ,
        0.50233138,  0.0200575 ,  0.01623386,  0.01189264])

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator().setLabelCol("Survived") \
                            .setRawPredictionCol("rawPrediction") \
                            .setMetricName("areaUnderROC")

roc = evaluator.evaluate(predictions)

In [27]:
roc

0.904864240138903

### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: you can use Pipelines to chain several Estimators/Transformers
    - warning: unfortunately, it is not possible to include the UDF from Step 4 in the Pipeline
- Make predictions using the model previously trained and the transformed test data
- Save it as ***submission.csv*** and submit it to Kaggle
- What was your score?

In [28]:
testFilled.createOrReplaceTempView('test')

In [29]:
{col: sqlc.sql("select * from test where " + col + " is null").count() for col in testFilled.columns}

{'Age': 0,
 'Cabin': 0,
 'Embarked': 0,
 'Fare': 1,
 'Name': 0,
 'Parch': 0,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 0,
 'Ticket': 0}

In [30]:
sqlc.sql("select * from test where Fare is null").toPandas()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [31]:
trainFilled.createOrReplaceTempView('train')
avgFare = sqlc.sql("select mean(Fare) from train where Pclass = 3").take(1)[0][0]
print avgFare

13.6755501018


In [32]:
testFilled = testFilled.na.fill({'Fare': avgFare})

In [33]:
from pyspark.ml import Pipeline

def process_features(train, test):
    indexer1 = StringIndexer().setInputCol("Embarked").setOutputCol("nEmbarked")
    indexer2 = StringIndexer().setInputCol("Sex").setOutputCol("nSex")
    encoder1 = OneHotEncoder().setInputCol("nEmbarked").setOutputCol("vEmbarked")
    encoder2 = OneHotEncoder().setInputCol("nSex").setOutputCol("vSex")
    assembler = VectorAssembler(inputCols=['Pclass','Age','SibSp','Parch','Fare','vSex','vEmbarked'], outputCol='vFeatures')

    pipeFeat = Pipeline().setStages([indexer1, indexer2, encoder1, encoder2, assembler])
    
    model = pipeFeat.fit(train)
    trainFeat = model.transform(train).select("Survived", to_vec("vFeatures").alias("features"))   
    scaler = StandardScaler().setInputCol("features").setOutputCol("scaledFeat").setWithStd(True).setWithMean(True).fit(trainFeat)
    
    testFeat = model.transform(test).select(to_vec("vFeatures").alias("features"))
    
    return scaler.transform(trainFeat), scaler.transform(testFeat)

In [34]:
trainFeat, testFeat = process_features(trainFilled, testFilled)

In [35]:
rfC = RandomForestClassifier().setLabelCol("Survived") \
                                .setFeaturesCol("scaledFeat") \
                                .setNumTrees(50)
model = rfC.fit(trainFeat)

predictions = model.transform(testFeat)

In [36]:
predictions.show()

+--------------------+--------------------+--------------------+--------------------+----------+
|            features|          scaledFeat|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|[3.0,34.5,0.0,0.0...|[0.82691281652436...|[44.7826553865377...|[0.89565310773075...|       0.0|
|[3.0,47.0,1.0,0.0...|[0.82691281652436...|[29.3444347833123...|[0.58688869566624...|       0.0|
|[2.0,62.0,0.0,0.0...|[-0.3691575073769...|[43.4280104170114...|[0.86856020834022...|       0.0|
|[3.0,27.0,0.0,0.0...|[0.82691281652436...|[43.9161577225803...|[0.87832315445160...|       0.0|
|[3.0,22.0,1.0,1.0...|[0.82691281652436...|[21.5720901613269...|[0.43144180322653...|       1.0|
|[3.0,14.0,0.0,0.0...|[0.82691281652436...|[36.1721767632747...|[0.72344353526549...|       0.0|
|[3.0,30.0,0.0,0.0...|[0.82691281652436...|[20.4702808904097...|[0.40940561780819...|       1.0|
|[2.0,26.0,1.0,1.0...|[-0.3691

In [37]:
df_predictions = predictions.select("prediction").toPandas().reset_index()
df_predictions['index'] = df_predictions['index'] + 892
df_predictions.columns = ['PassengerId', 'Survived']

df_predictions.to_csv('submission.csv', index=False)

## Result = 75.598%