In [1]:
import os
import pprint

import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# json_normalize is exposed at the top level since pandas 1.0 and the old
# pandas.io.json location was removed in pandas 2.0; fall back for older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

In [2]:
# Connection string for the course cluster. Set the COURSE_CLUSTER_URI environment
# variable to point the notebook at another deployment or login without editing
# this cell; an unset or empty variable falls back to the URI below.
# NOTE(review): the fallback embeds a username and password. It looks like a shared
# course login -- confirm it is meant to be public, otherwise remove the default.
course_cluster_uri = os.environ.get("COURSE_CLUSTER_URI") or (
    "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
)
course_client = MongoClient(course_cluster_uri)

In [3]:
# Handle to the `titanic` collection of the `coursera-agg` database; the
# aggregations below all run against it.
titanic = course_client.get_database('coursera-agg').get_collection('titanic')

In [20]:
# Stage that determines the possible values for gender: one output document per
# distinct `gender`, with `gd_p` holding that same value collected into a set.
unique_gender_stage = {
    "$group": dict(
        _id="$gender",
        gd_p={"$addToSet": "$gender"},
    )
}

In [21]:
# Keep only passengers with a numeric age and a non-empty embarkation port,
# then group what is left by gender.
possible_gender_values = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_gender_stage,
])

In [22]:
# Show the distinct gender values. Unpacking consumes the cursor, so the
# aggregate cell must be re-run before this cell can print them again.
pprint.pprint([*possible_gender_values])

[{'_id': 'female', 'gd_p': ['female']}, {'_id': 'male', 'gd_p': ['male']}]


In [60]:
# Stage that determines the possible values for point_of_embarkation: one output
# document per distinct port, with `emb_p` holding that same value collected into a set.
unique_point_of_embarkation_stage = {
    "$group": dict(
        _id="$point_of_embarkation",
        emb_p={"$addToSet": "$point_of_embarkation"},
    )
}

In [61]:
# Same passenger filter as for gender (numeric age, non-empty embarkation port),
# this time grouped by point of embarkation.
possible_point_of_embarkation_values = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_point_of_embarkation_stage,
])

In [62]:
# Show the distinct point_of_embarkation values. Unpacking consumes the cursor,
# so the aggregate cell must be re-run before this cell can print them again.
pprint.pprint([*possible_point_of_embarkation_values])

[{'_id': 'Q', 'emb_p': ['Q']},
 {'_id': 'C', 'emb_p': ['C']},
 {'_id': 'S', 'emb_p': ['S']}]


In [63]:
# Recode the two categorical fields as integers and pass the other model inputs through:
#   gender               -> 0 for 'female', 1 for any other value ('male' in this data)
#   point_of_embarkation -> 1 for 'Q', 2 for 'C', 3 for 'S', 0 for anything else
gender_and_point_of_embarkation_conversion_stage = {
    "$project": {
        "gender": {
            "$cond": {
                "if": {"$eq": ["$gender", "female"]},
                "then": 0,
                "else": 1,
            }
        },
        "point_of_embarkation": {
            "$switch": {
                # One branch per known port; codes are assigned in listing order from 1.
                "branches": [
                    {"case": {"$eq": ["$point_of_embarkation", port]}, "then": code}
                    for code, port in enumerate(("Q", "C", "S"), start=1)
                ],
                "default": 0,
            }
        },
        # Fields carried over unchanged.
        **{
            field: 1
            for field in (
                "survived",
                "age",
                "siblings_spouse",
                "parents_children",
                "fare_paid",
                "class",
            )
        },
    }
}

In [64]:
# Build the modelling data set: filter to passengers with a numeric age and a
# non-empty embarkation port, recode the categorical fields, then drop `_id` and
# the identifying fields (ticket number, name, passenger id, cabin).
cursor = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    gender_and_point_of_embarkation_conversion_stage,
    {"$project": dict(_id=0, ticket_number=0, name=0, passenger_id=0, cabin=0)},
])

In [65]:
# Drain the cursor so every projected passenger document is held in memory.
titanic_data = [document for document in cursor]

In [66]:
# Turn the list of passenger documents into a DataFrame (one row per document).
df = json_normalize(data=titanic_data)

In [67]:
# Feature matrix: every column except `survived`, which is the value to predict.
df_x = df.drop(columns=['survived'])
# Peek at the first raw record to see which fields the pipeline produced.
titanic_data[0]

{'survived': 0,
 'class': 1,
 'age': 54,
 'siblings_spouse': 0,
 'parents_children': 0,
 'fare_paid': 51.8625,
 'gender': 1,
 'point_of_embarkation': 3}

In [68]:
# Target vector: just the `survived` column (the value we want to predict).
df_y = df.loc[:, 'survived']

In [69]:
# Create a Least Squares Linear Regression object (unfitted; it is trained on the
# training split in a later cell).
# NOTE(review): `survived` looks like a 0/1 label (see the sample record), and the
# recorded predictions include values below 0 and above 1 -- presumably acceptable
# for this exercise; confirm before reading the outputs as probabilities.
reg = linear_model.LinearRegression()

In [70]:
# Hold out 20% of the rows as a test set and train on the remaining 80%; the
# fixed random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    random_state=0,
    test_size=0.2,
)

In [71]:
# Train the linear model on the training split only.
reg.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [72]:
# Predict `survived` for the held-out test rows with the trained model.
reg.predict(X=x_test)

array([ 0.53481602,  0.14153787,  0.72993988,  0.01062921,  0.03936156,
        0.01111901,  0.48413016,  0.43278259,  0.98123008,  0.89476175,
        0.98556713,  0.17010035,  0.06498627,  0.55358109,  1.05344084,
        0.51507309,  0.1886268 ,  0.19911502,  0.11286982,  0.26802997,
        0.29481791,  0.33464505,  0.72993988,  0.01212517,  0.21349833,
        0.78851342,  0.01864553,  0.17010035,  0.19932795,  0.28762187,
        0.45926882,  0.33079814, -0.04693916,  0.65519805,  0.4287377 ,
        0.76317127,  0.39338335,  0.85943367,  0.04084423,  1.03959807,
        0.63798302,  0.38117047,  0.08408563,  0.00492912,  0.51417451,
        0.05160561,  0.10562098,  0.19475017,  0.2846546 ,  0.89926243,
        0.14158888,  0.30545889, -0.01674171,  0.72089238,  0.81422505,
        0.28279807,  0.19074326,  0.09122569,  0.65453578,  0.17752866,
        0.0054391 ,  0.21592215,  0.84372051,  0.17043819,  0.08932148,
        0.14842775,  0.1128047 ,  0.65160003,  0.17083937,  0.71

In [73]:
# Mean squared error of the test-set predictions. Expect a value of roughly
# 0.13-0.15 (a plain squared-error figure, not a percentage).
mean_squared_error(y_true=y_test, y_pred=reg.predict(X=x_test))

0.13599207477491293

In [74]:
# Hypothetical passenger used to sanity-check the fitted model. Values are keyed
# by feature name and then arranged in df_x's column order, so the row lines up
# with the training features even if the projection ever emits fields in a
# different order.
fake_passenger_fields = {
    "class": 1,
    "age": 25,
    "siblings_spouse": 1,
    "parents_children": 0,
    "fare_paid": 45,
    "gender": 1,                # 1 is the code assigned to 'male'
    "point_of_embarkation": 2,  # 2 is the code assigned to 'C'
}

# Selecting by df_x.columns raises KeyError if a feature above is missing or misspelled.
fake_passenger = pd.DataFrame([fake_passenger_fields])[list(df_x.columns)]

In [75]:
# Predicted `survived` value for the hypothetical passenger; compare it with the
# expected answer to verify the exercise was completed correctly.
reg.predict(X=fake_passenger)

array([0.49283211])