In [295]:
import os
import pprint

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [296]:
# Connection string for the course cluster. When the COURSE_CLUSTER_URI
# environment variable is set it takes precedence, so credentials can be
# supplied without editing the notebook; otherwise the course-provided
# default below is used.
course_cluster_uri = os.environ.get(
    "COURSE_CLUSTER_URI",
    "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin",
)
course_client = MongoClient(course_cluster_uri)

In [297]:
# Handle to the `titanic` collection inside the `coursera-agg` database
titanic = course_client.get_database('coursera-agg').get_collection('titanic')

In [298]:
# $group stage keyed on the gender field: yields one document per distinct
# gender value, with `count` holding how many passengers share that value.
unique_gender_stage = {"$group": {"_id": "$gender", "count": {"$sum": 1}}}

In [299]:
# Keep only passengers with a numeric age and a non-empty point_of_embarkation,
# then group them with unique_gender_stage.
possible_gender_values = titanic.aggregate(
    [
        {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
        unique_gender_stage,
    ]
)

In [300]:
# Show every distinct gender value together with its document count
pprint.pprint([group for group in possible_gender_values])

[{'_id': 'male', 'count': 453}, {'_id': 'female', 'count': 259}]


In [301]:
# $group stage keyed on point_of_embarkation: yields one document per distinct
# port value, with `count` holding how many passengers share that value.
unique_point_of_embarkation_stage = {
    "$group": {"_id": "$point_of_embarkation", "count": {"$sum": 1}}
}

In [302]:
# Keep only passengers with a numeric age and a non-empty point_of_embarkation,
# then group them with unique_point_of_embarkation_stage.
possible_point_of_embarkation_values = titanic.aggregate(
    [
        {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
        unique_point_of_embarkation_stage,
    ]
)

In [303]:
# Show every distinct point_of_embarkation value together with its document count
pprint.pprint([group for group in possible_point_of_embarkation_values])

[{'_id': 'S', 'count': 554},
 {'_id': 'C', 'count': 130},
 {'_id': 'Q', 'count': 28}]


In [304]:
# $addFields stage that replaces the two categorical fields with integer codes
# so they can serve as numeric regression features:
#   gender:               'female' -> 0, 'male' -> 1
#   point_of_embarkation: 'C' -> 0, 'Q' -> 1, 'S' -> 2
# Any value outside these lists maps to -1 for BOTH fields; a string fallback
# would put text into an otherwise numeric column.
gender_and_point_of_embarkation_conversion_stage = {
    "$addFields": {
        "gender": {
            "$switch": {
                "branches": [
                    {"case": {"$eq": ["$gender", "male"]}, "then": 1},
                    {"case": {"$eq": ["$gender", "female"]}, "then": 0},
                ],
                "default": -1,
            }
        },
        "point_of_embarkation": {
            "$switch": {
                "branches": [
                    {"case": {"$eq": ["$point_of_embarkation", "C"]}, "then": 0},
                    {"case": {"$eq": ["$point_of_embarkation", "Q"]}, "then": 1},
                    {"case": {"$eq": ["$point_of_embarkation", "S"]}, "then": 2},
                ],
                "default": -1,
            }
        },
    }
}

In [305]:
# Build the modelling dataset: filter to passengers with a numeric age and a
# non-empty point_of_embarkation, encode the categorical fields as integers,
# then exclude the identifier-like fields listed in the $project stage.
cursor = titanic.aggregate(
    [
        {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
        gender_and_point_of_embarkation_conversion_stage,
        {
            "$project": dict.fromkeys(
                ["_id", "ticket_number", "name", "passenger_id", "cabin"], 0
            )
        },
    ]
)

In [306]:
# Drain the cursor into a list so the documents can be indexed and reused
titanic_data = [passenger for passenger in cursor]

In [307]:
# Print one raw document as a sanity check, then load every document into a
# DataFrame and preview its first rows
print(titanic_data[1])
df = pd.json_normalize(data=titanic_data)
df.head(20)

{'survived': 0, 'class': 3, 'gender': 1, 'age': 22, 'siblings_spouse': 1, 'parents_children': 0, 'fare_paid': 7.25, 'point_of_embarkation': 2}


Unnamed: 0,survived,class,gender,age,siblings_spouse,parents_children,fare_paid,point_of_embarkation
0,0,3,1,35.0,0,0,8.05,2
1,0,3,1,22.0,1,0,7.25,2
2,1,3,0,4.0,1,1,16.7,2
3,1,3,0,27.0,0,2,11.1333,2
4,1,1,0,35.0,1,0,53.1,2
5,0,3,1,20.0,0,0,8.05,2
6,1,1,0,58.0,0,0,26.55,2
7,0,3,1,39.0,1,5,31.275,2
8,1,2,0,55.0,0,0,16.0,2
9,0,3,0,14.0,0,0,7.8542,2


In [308]:
# Feature matrix: every column except `survived` (the inputs we correlate against)
df_x = df.drop(columns=['survived'])

In [309]:
# Target vector: the `survived` column, i.e. the value we want to predict
df_y = df.loc[:, 'survived']

In [310]:
# Create a least squares linear regression estimator with its default settings;
# it is fitted on the training split in a later cell
reg = linear_model.LinearRegression()

In [311]:
# Split our dataset into a training set (90%) and a test set (10%):
# test_size=0.1 holds out 10% of the rows, and random_state=0 makes the
# split reproducible between runs.
# NOTE(review): this comment previously said 80%/20%; if that was the intent,
# test_size should be 0.2 (which would change the results recorded below).
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.1, random_state=0)

In [312]:
# Train the regression on the training features and their survival labels
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [313]:
# Predicted survival scores for the held-out test rows
reg.predict(X=x_test)

array([ 1.07576116e-01,  9.00124879e-01,  5.61776100e-01,  6.38424867e-01,
       -1.48991234e-03, -4.71394311e-05,  5.33123418e-02,  6.26097010e-01,
        2.29904260e-01,  7.23803072e-01,  1.02804580e+00,  1.13631932e-01,
        5.59945954e-01,  3.77145726e-02,  8.32971118e-01,  4.55971722e-01,
        1.79940255e-01,  3.71573238e-01,  3.09530502e-01,  2.16114922e-01,
        4.48591756e-01,  8.45662048e-01,  6.20394093e-01,  2.28346056e-01,
        3.86775365e-01,  8.36001692e-01,  4.14280937e-01,  2.28347084e-01,
        1.62225393e-01,  1.00826169e-01,  6.14158463e-01, -1.92762178e-02,
       -8.67625675e-02,  6.50134695e-01,  5.78131227e-01,  1.18438602e-02,
        9.77288666e-02,  1.37736927e-01,  3.32766586e-01,  5.24501738e-01,
        8.73666310e-01,  1.49936283e-01,  6.49301518e-01,  1.13631932e-01,
        1.13607456e-01,  4.62686738e-01,  5.05901757e-01,  2.17585297e-01,
        1.25733382e-01,  2.97734110e-01,  6.89357914e-01,  6.50934264e-01,
        5.33324831e-02,  

In [314]:
# Mean squared error of the test-set predictions (should be roughly 0.13-0.15;
# this is an absolute error value, not a percentage)
mean_squared_error(y_true=y_test, y_pred=reg.predict(X=x_test))

0.1474278062587369

In [315]:
# Hypothetical passenger to score. Each value is tied to its feature name
# instead of sitting in a bare positional list, where a wrong column order
# would silently produce a wrong prediction:
#   class 1, male (gender = 1), age 25, one sibling/spouse,
#   no parents/children, fare 45, embarked at 'C' (point_of_embarkation = 0)
# The keys follow the order of the training columns (the document fields left
# after `survived` is dropped), so the row lines up with the fitted model.
fake_passenger = pd.DataFrame(
    [
        {
            "class": 1,
            "gender": 1,
            "age": 25,
            "siblings_spouse": 1,
            "parents_children": 0,
            "fare_paid": 45,
            "point_of_embarkation": 0,
        }
    ]
)

In [316]:
# Predicted survival score for the hypothetical passenger; compare this value
# against the expected answer to verify the exercise
reg.predict(X=fake_passenger)

array([0.50297376])