In [1]:
%%capture
!pip install pymongo pprint dateparser matplotlib pandas sklearn numpy seaborn

In [2]:
import pymongo
import pprint
import dateparser
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

In [3]:
course_cluster_uri = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_client = pymongo.MongoClient(course_cluster_uri)
titanic = course_client['coursera-agg']['titanic']

In [4]:
initial_project = {
    "$project": {
        "_id": 0,
        "name": 0,
        "point_of_embarkation": 0,
        "ticket_number": 0,
        "passenger_id": 0,
        "cabin": 0,
    }
}

In [53]:
# todo - correct the age.
# *HINT* -- If the $type of "$age" is a string, set it to 0
age_correction =  {
    "$addFields": { 
        "age": { 
            "$cond": [ {"$eq": [ "$age", "" ] }, 0, "$age"]
        } 
    }
}

# fare_paid_correction =  {
#     "$addFields": { 
#         "fare_paid": { 
#             "$cond": [ {"isNumber": "$fare_paid"}, "$fare_paid", 0]
#         } 
#     }
# }

In [10]:
# todo - one hot encode gender_female. 1 if female, 0 if male
one_hot_female = {
    "$addFields": { 
        "gender_female": { 
            "$cond": [ {"$eq": [ "$gender", "female" ] }, 1, 0]
        } 
    }
}

In [11]:
# todo - the inverse of above. 1 if male, 0 if female
one_hot_male = { 
    "$addFields": { 
        "gender_male": { 
            "$cond": [ {"$eq": [ "$gender", "male" ] }, 1, 0]
        } 
    }
}

In [None]:
# encoding_stage = {
#     "$addFields": {
#         "gender_female": one_hot_female,
#         "gender_male": one_hot_male,
#         "age": age_correction
#     }
# }

In [17]:
final_project = {
    "$project": {
        "gender": 0
    }
}

In [54]:
pipeline = [initial_project, age_correction, one_hot_male, one_hot_female, final_project]

In [56]:
df = pd.DataFrame.from_dict(list(titanic.aggregate(pipeline)))
df[df['age'] == 0]

Unnamed: 0,survived,class,age,siblings_spouse,parents_children,fare_paid,gender_male,gender_female
9,1,2,0.0,0,0,13.0000,1,0
10,1,3,0.0,0,0,7.2250,0,1
21,0,3,0.0,0,0,7.8958,1,0
22,1,3,0.0,0,0,7.8792,0,1
23,1,1,0.0,1,0,146.5208,0,1
...,...,...,...,...,...,...,...,...
858,0,3,0.0,0,0,7.2292,1,0
863,0,3,0.0,8,2,69.5500,0,1
868,0,3,0.0,0,0,9.5000,1,0
878,0,3,0.0,0,0,7.8958,1,0


In [57]:
X = df.drop('survived', axis=1)

In [58]:
y = df['survived']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [60]:
dtree = DecisionTreeClassifier()

In [61]:
%%capture
dtree.fit(X_train, y_train)

In [62]:
predictions = dtree.predict(X_test)

In [63]:
print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

[[63 17]
 [ 8 46]]


              precision    recall  f1-score   support

           0       0.89      0.79      0.83        80
           1       0.73      0.85      0.79        54

    accuracy                           0.81       134
   macro avg       0.81      0.82      0.81       134
weighted avg       0.82      0.81      0.82       134



In [64]:
rfc = RandomForestClassifier(n_estimators=20)

In [65]:
%%capture
rfc.fit(X_train, y_train)

In [66]:
rfc_pred = rfc.predict(X_test)

In [67]:
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred, target_names=['test', 'predictions']))

[[67 13]
 [11 43]]


              precision    recall  f1-score   support

        test       0.86      0.84      0.85        80
 predictions       0.77      0.80      0.78        54

    accuracy                           0.82       134
   macro avg       0.81      0.82      0.81       134
weighted avg       0.82      0.82      0.82       134



In [68]:
iterations = 1000
dtree_avg_accuracy = 0
rfc_avg_accuracy = 0
for _ in range(iterations):
    dtree.fit(X_train, y_train)
    dtree_avg_accuracy += dtree.score(X_test, y_test)
    rfc.fit(X_train, y_train)
    rfc_avg_accuracy += rfc.score(X_test, y_test)
    
print(f"""
After {iterations} iterations:
  Single Decision Tree accuracy: {dtree_avg_accuracy / iterations}
  Random Forest accuracy:        {rfc_avg_accuracy / iterations}
  
  Lab Answer:  dtree={round(dtree_avg_accuracy / iterations, 2)}, rfc={round(rfc_avg_accuracy / iterations, 2)}
""")


After 1000 iterations:
  Single Decision Tree accuracy: 0.8143731343283569
  Random Forest accuracy:        0.8133432835820885
  
  Lab Answer:  dtree=0.81, rfc=0.81

