In [1]:
from pandas.io.json import json_normalize
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pprint

In [2]:
course_cluster_uri = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_client = MongoClient(course_cluster_uri)

In [4]:
titanic = course_client['coursera-agg']['titanic']

list(titanic.find({}).limit(5))

[{'_id': ObjectId('59f3769387ef3716f7157ac5'),
  'passenger_id': 5,
  'survived': 0,
  'class': 3,
  'name': 'Allen, Mr. William Henry',
  'gender': 'male',
  'age': 35,
  'siblings_spouse': 0,
  'parents_children': 0,
  'ticket_number': 373450,
  'fare_paid': 8.05,
  'cabin': '',
  'point_of_embarkation': 'S'},
 {'_id': ObjectId('59f3769387ef3716f7157ac4'),
  'passenger_id': 1,
  'survived': 0,
  'class': 3,
  'name': 'Braund, Mr. Owen Harris',
  'gender': 'male',
  'age': 22,
  'siblings_spouse': 1,
  'parents_children': 0,
  'ticket_number': 'A/5 21171',
  'fare_paid': 7.25,
  'cabin': '',
  'point_of_embarkation': 'S'},
 {'_id': ObjectId('59f3769387ef3716f7157aca'),
  'passenger_id': 11,
  'survived': 1,
  'class': 3,
  'name': 'Sandstrom, Miss. Marguerite Rut',
  'gender': 'female',
  'age': 4,
  'siblings_spouse': 1,
  'parents_children': 1,
  'ticket_number': 'PP 9549',
  'fare_paid': 16.7,
  'cabin': 'G6',
  'point_of_embarkation': 'S'},
 {'_id': ObjectId('59f3769387ef3716f7157

In [5]:
# Replace {} with a stage to determine the possible values for gender.
unique_gender_stage = {
    "$group": {
        "_id": "$gender"
    }
}

In [6]:
possible_gender_values = titanic.aggregate([
    {
        "$match": {
            "age": {"$type": "number"},
            "point_of_embarkation": {"$ne": ""}
        }
    },
    unique_gender_stage
])

In [7]:
# Print the distinct list of values for the gender field
pprint.pprint(list(possible_gender_values))

[{'_id': 'male'}, {'_id': 'female'}]


In [8]:
# Replace {} with a stage to determine the possible values for point_of_embarkation
unique_point_of_embarkation_stage = {
    "$group": {
        "_id": "$point_of_embarkation"
    }
}

In [9]:
possible_point_of_embarkation_values = titanic.aggregate([
    {
        "$match": {
            "age": {"$type": "number"},
            "point_of_embarkation": {"$ne": ""}
        }
    },
    unique_point_of_embarkation_stage
])

In [10]:
# Print the distinct list of values for the point_of_embarkation field
pprint.pprint(list(possible_point_of_embarkation_values))

[{'_id': 'S'}, {'_id': 'C'}, {'_id': 'Q'}]


In [29]:
# Given the possible values for point_of_embarkation and gender replace {} with a stage that
# will convert those field values to an integer.
# e.g., For the gender field convert 'female' to 0 and 'male' to 1
gender_and_point_of_embarkation_conversion_stage = {
    "$project": {
        "_id": 1,
        "ticket_number": 1,
        "name": 1,
        "passenger_id": 1,
        "cabin": 1,
        "survived": 1,
        "class": 1,
        "age": 1,
        "siblings_spouse": 1,
        "parents_children": 1,
        "fare_paid": 1,
        "gender": {
            "$cond": [ { "$eq": [ "$gender", "female" ] }, 1, 0 ]
        },
        "point_of_embarkation": {
            "$cond": [ { "$eq": [ "$point_of_embarkation", "S" ] }, 1, 
                     { "$cond": [ { "$eq": [ "$point_of_embarkation", "C" ] }, 2, 
                              { "$cond": [ { "$eq": [ "$point_of_embarkation", "Q" ] }, 3, 0 ]
                              }]
                     },]
        }
    }
}

In [30]:
cursor = titanic.aggregate([
    {
        "$match": {
            "age": {"$type": "number"},
            "point_of_embarkation": {"$ne": ""}
        }
    },
    gender_and_point_of_embarkation_conversion_stage,
    {
        "$project": {
            "_id": 0,
            "ticket_number": 0,
            "name": 0,
            "passenger_id": 0,
            "cabin": 0
        }
    }
])

In [31]:
# Exhaust our cursor into a list
titanic_data = list(cursor)

In [32]:
# Load our dataset into a DataFrame
df = json_normalize(titanic_data)

df.tail()

  df = json_normalize(titanic_data)


Unnamed: 0,survived,class,age,siblings_spouse,parents_children,fare_paid,gender,point_of_embarkation
707,0,2,27.0,0,0,13.0,0,1
708,1,1,19.0,0,0,30.0,1,1
709,0,3,39.0,0,5,29.125,1,3
710,1,1,26.0,0,0,30.0,0,2
711,0,3,32.0,0,0,7.75,0,3


In [33]:
# Pull out the survived column (only the data we want to correlate against)
df_x = df.drop(['survived'], axis=1)

In [53]:
# Only the survived column (the value we want to predict)
df_y = df['survived']

df_y

0      0
1      0
2      1
3      1
4      1
      ..
707    0
708    1
709    0
710    1
711    0
Name: survived, Length: 712, dtype: int64

In [35]:
# Create a Least Squares Linear Regression object
reg = linear_model.LinearRegression()

In [36]:
# Split our dataset into a training set (80%) and a test set (20%)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)

In [76]:
# Fit a linear model to our training data
reg.fit(x_train, y_train)

LinearRegression()

In [77]:
# Check our test set against our trained linear model
reg.predict(x_test)

array([ 0.11617702,  0.87153389,  0.59414763,  0.63423417,  0.01087813,
        0.01750318,  0.06383078,  0.62229847,  0.24885281,  0.67601157,
        0.99611887,  0.12202312,  0.55851268,  0.02757737,  0.8365545 ,
        0.46741338,  0.15605042,  0.34671959,  0.32586636,  0.22807672,
        0.44031932,  0.856371  ,  0.62404433,  0.20277336,  0.39922451,
        0.80068775,  0.41696124,  0.20277451,  0.16895739,  0.09294056,
        0.622996  , -0.03315727, -0.09344652,  0.64547388,  0.61809328,
        0.03926457,  0.09574356,  0.14527433,  0.34813906,  0.56726978,
        0.86719741,  0.15706532,  0.65173015,  0.12202312,  0.12199556,
        0.46368418,  0.63833539,  0.2100709 ,  0.13370386,  0.27020448,
        0.68470669,  0.64637431,  0.03692714,  0.6164581 ,  0.00520431,
        0.56699508,  0.08118305,  0.07836717,  0.84164808,  0.91727615,
        0.61885279,  0.48856664,  0.85852166,  0.20796128,  0.64561859,
        0.76922659,  0.80630917,  0.11662033,  0.19401396,  0.74

In [78]:
# Calculate mean squared error (should be ~0.13-0.15%)
mean_squared_error(y_test, reg.predict(x_test))

0.1313665753940645

In [83]:
# age: 25,
# class: 1,
# fare_paid: 45,l 
# gender: Y, (replace Y with the integer you assigned for 'male')
# parents_children: 0,
# point_of_embarkation: Z, (replace Z with the integer you assigned for 'C')
# siblings_spouse: 1

fake_passenger = [[25, 0, 45, 0, 0, 0, 0]]

In [84]:
# Use this output to verify your completion of this exercise
reg.predict(fake_passenger)

array([-6.22958105])