## Compare Firebase Comm Logs with MongoDB Data

### Pull MongoDB data


In [1]:
import os
import datetime as dt
import pandas as pd
import csv

In [2]:
# add the 'src' directory as one where we can import modules
import sys
import pathlib # __file__ isn't recognized in Jupyter, so we need this to get the root path
path = pathlib.Path.cwd()
PROJ_ROOT = path.parent 
src_dir = str(PROJ_ROOT / 'src')
PROJ_ROOT = str(PROJ_ROOT)
sys.path.append(src_dir)
# from data import make_dataset

In [3]:
from data import user_df_setup

interim_data_file_path = os.path.join(PROJ_ROOT,
                                 "data",
                                 "interim",
                                 "full_users_df.pkl")
# users_df = pd.read_pickle(interim_data_file_path)
raw_data_file_path = os.path.join(PROJ_ROOT,
                             "data",
                             "raw",
                             "users_df.pkl")
raw_users_df = pd.read_pickle(raw_data_file_path)

all_users_df = user_df_setup.user_df_setup(raw_data_file_path, interim_data_file_path)

users_df = all_users_df
# raw_users_df
# raw_users_df[['username', 'timeCreated']].sort_values('timeCreated', ascending = False).head(20)
all_users_df.sort_values('date_created', ascending = False).head(20)
# all_users_df.index
# all_users_df.head(20)

Interim users_df not found, generating new


Unnamed: 0_level_0,date_created,refresh_time,userId,unrated_threshold,risky_threshold,supportive_threshold
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pinky618,2019-03-20,,5c92a052683da8001107a632,0.0,1.0,3.0
doyle coleman,2019-03-19,,5c90fb30939ba200110ff237,0.0,1.0,3.0
driverseat,2019-03-19,,5c90fa54939ba200110ff229,0.0,1.0,3.0
blonco,2019-03-19,,5c90f9ea683da8001107a5af,0.0,1.0,3.0
joeltortega,2019-03-19,,5c90f95c939ba200110ff226,0.0,1.0,3.0
matt,2019-03-14,,5c8a99d8ed5e0a0011e30297,0.0,1.0,3.0
fred13,2019-03-13,,5c8995cced5e0a0011e30269,0.0,1.0,3.0
mat cm,2019-03-13,,5c8918bd0daecd0011da07c7,0.0,1.0,3.0
shelly,2019-03-12,,5c8845c8ccded300116f6b8c,0.0,1.0,3.0
shad christie,2019-03-11,,5c86b37accded300116f6b53,0.0,1.0,3.0


In [4]:
current_users = all_users_df[all_users_df['date_created'] > (dt.date.today() - dt.timedelta(30))]
user_ids = all_users_df['userId'].tolist()
print(len(user_ids))
# user_ids

135


### Force-pulling Contacts data from MongoDB

In [5]:
from data import database_query

raw_data_path = os.path.join(PROJ_ROOT,
                             "data",
                             "raw")
current_users_df = all_users_df[all_users_df['date_created'] > (dt.date.today() - dt.timedelta(30))]
user_ids = current_users_df['userId'].tolist()
database_query.make_raw_contacts_df(database_query.mongo_connect(),
                                   raw_data_path,
                                   user_ids)

### Force-creating Contacts interim data

In [6]:
from data import contacts_df_setup

interim_data_path = os.path.join(PROJ_ROOT,
                                 "data",
                                 "interim")
contacts_dict = {}

usernames = current_users.index
usernames = ['pinky618']
for username in usernames: 
    interim_contact_data_file_path = os.path.join(interim_data_path, 'contacts_df_' + username + '.pkl')
    if os.path.isfile(interim_contact_data_file_path):
        contacts_df = pd.read_pickle(interim_contact_data_file_path)
    else:
        contacts_df = contacts_df_setup.contacts_df_setup(username,
                                                          current_users_df,
                                                          raw_data_path,
                                                          interim_data_path)
    contacts_dict[username] = contacts_df

In [7]:
contacts_dict['pinky618']

Unnamed: 0_level_0,_id,score,relationship
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5c92a26c683da8001107a635,b'5c92a26c683da8001107a635',-1.0,family
5c92a26f939ba200110ff2bb,b'5c92a26f939ba200110ff2bb',-1.0,family
5c92a271939ba200110ff2bc,b'5c92a271939ba200110ff2bc',-1.0,friend
5c92a277683da8001107a636,b'5c92a277683da8001107a636',-1.0,family
5c92a27a683da8001107a637,b'5c92a27a683da8001107a637',-1.0,risky
5c92a287939ba200110ff2bd,b'5c92a287939ba200110ff2bd',-1.0,friend
5c92b8ce939ba200110ff2be,b'5c92b8ce939ba200110ff2be',-1.0,none
5c93cc96683da8001107a63e,b'5c93cc96683da8001107a63e',-1.0,none
5c94efb2939ba200110ff2c8,b'5c94efb2939ba200110ff2c8',-1.0,none
5c953bdf939ba200110ff2d0,b'5c953bdf939ba200110ff2d0',-1.0,none


In [8]:
import matplotlib.pyplot as plt

f, axarr = plt.subplots(len(usernames),figsize=(10,30), sharex = True, sharey = True)
plt.tight_layout()

for count, e in enumerate(usernames):  
    unrated_threshold = users_df.loc[e, 'unrated_threshold']
    risky_threshold = users_df.loc[e, 'risky_threshold']
    supportive_threshold = users_df.loc[e, 'supportive_threshold']
    
    contacts_df = contacts_dict[e].sort_values('score', ascending = False)
    unrated = contacts_df['score'][contacts_df['score'] < unrated_threshold]
    risky = contacts_df['score'][(contacts_df['score'] >= unrated_threshold)
                                                     & (contacts_df['score'] <= risky_threshold)]
    neutral = contacts_df['score'][(contacts_df['score'] > risky_threshold)
                                                     & (contacts_df['score'] < supportive_threshold)]
    supportive = contacts_df['score'][(contacts_df['score'] >= supportive_threshold)]
    
    axarr[count].bar(supportive.index, supportive, 0.7, color = '#00cc00')
    axarr[count].bar(neutral.index, neutral, 0.7, color = 'b')
    axarr[count].bar(risky.index, risky, 0.7, color = '#ff6600')
    axarr[count].bar(unrated.index, unrated, 0.7, color = '#C0C0C0')
    axarr[count].set_title(e)
    
    # unrelated, pulling the counts
    current_users.loc[e, 'unrated_contacts'] = len(unrated)
    current_users.loc[e, 'risky_contacts'] = len(risky)
    current_users.loc[e, 'neutral_contacts'] = len(neutral)
    current_users.loc[e, 'supportive_contacts'] = len(supportive)
    current_users.loc[e, 'rated_contacts'] = len(contacts_df) - current_users.loc[e, 'unrated_contacts']
plt.show()

TypeError: 'AxesSubplot' object does not support indexing

In [None]:
# current_users[['date_created', 'unrated_contacts', 'risky_contacts', 'neutral_contacts', 'supportive_contacts', 'rated_contacts']]
# current_users

In [None]:
import numpy as np

for username in usernames: 
    if username in notification_dict.keys():
        notifications_df = notification_dict[username]
        current_users.loc[username, 'notifications'] = len(notifications_df.index)
    else:
        current_users.loc[username, 'notifications'] = 0