# SIMCCT User Analytics

This Jupyter Notebook is for doing some quick and dirty tests for running data 
analytics operations on the MongoDB and Redis data persistence. The purpose of 
which is to then put into a pipeline and then create endpoints for each type 
of query and analysis that can be done.

In [1]:
# Plotly imports

import chart_studio
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
from plotly.subplots import make_subplots

chart_studio.tools.set_credentials_file(
    username='codeninja55', 
    api_key='mLp691cLJDdKaNgJykR4'
)

chart_studio.tools.set_config_file(
    world_readable=True,
    sharing='public'
)

In [2]:
# imports
import datetime
from os import environ as env
from pymongo import MongoClient
import pandas as pd
from redis import Redis
import json 

In [3]:
conn = MongoClient(env.get('MONGO_URI'))

db_name = 'arc_dev'
collection = 'users'
db = conn[db_name]

In [4]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'arc_dev')

In [5]:
cursor = db[collection].find(
    {
        'profile': {'$exists': True}
    }, 
    projection={'password': 0, '_id': False}
)
df = pd.DataFrame(list(cursor))
df.head()

Unnamed: 0,email,first_name,last_name,profile,admin_profile,simulations_count,saved_alloys,active,admin,disable_admin,verified,created,last_updated,ratings,login_data,last_login
0,andrew@neuraldev.io,Andrew,Che,"{'aim': 'Development', 'highest_education': 'B...",{'position': 'Full Stack Developer at NeuralDe...,1,"[{'_id': 5da42fbe2325089b915e89e5, 'name': 'Io...",True,True,False,False,2019-10-14 08:20:14.481,2019-10-14 08:20:42.636,"[{'rating': 4, 'created_date': 2019-10-14 08:2...",[{'created_datetime': 2019-10-14 08:20:14.6380...,2019-10-14 08:20:42.592
1,pbx@ansto.gov.au,Philip,Bendeich,"{'aim': 'Research', 'highest_education': 'PhD'...","{'position': 'Researcher at ANSTO', 'mobile_nu...",1,"[{'_id': 5da42fbe2325089b915e8a07, 'name': 'Ar...",True,True,False,False,2019-10-14 08:20:14.481,2019-10-14 08:20:22.567,"[{'rating': 4, 'created_date': 2019-10-14 08:2...",[{'created_datetime': 2019-10-14 08:20:14.6380...,NaT
2,omz@ansto.gov.au,Ondrej,Muransky,"{'aim': 'Research', 'highest_education': 'PhD'...","{'position': 'Researcher at ANSTO', 'mobile_nu...",1,"[{'_id': 5da42fbe2325089b915e8a11, 'name': 'Ad...",True,True,False,False,2019-10-14 08:20:14.481,2019-10-14 08:20:22.690,"[{'rating': 4, 'created_date': 2019-10-14 08:2...",[{'created_datetime': 2019-10-14 08:20:14.6380...,NaT
3,ironman@avengers.io,Tony,Stark,"{'aim': 'Experimentation', 'highest_education'...",{'position': 'Genius Playboy Billionaire Phila...,1,"[{'_id': 5da42fbe2325089b915e8a1f, 'name': 'Re...",True,True,False,False,2019-10-14 08:20:14.481,2019-10-14 08:20:22.922,"[{'rating': 4, 'created_date': 2019-10-14 08:2...",[{'created_datetime': 2019-10-14 08:20:14.6380...,NaT
4,black_widow@marvel.io,Natasha,Romanoff,"{'aim': 'Engineering Work', 'highest_education...","{'position': 'Russian Superagent', 'mobile_num...",1,"[{'_id': 5da42fbe2325089b915e8a23, 'name': 'Hi...",True,True,False,False,2019-10-14 08:20:14.481,2019-10-14 08:20:22.986,"[{'rating': 3, 'created_date': 2019-10-14 08:2...",[{'created_datetime': 2019-10-14 08:20:14.6380...,NaT


## User Profile Data

In [None]:
pipeline = [
    {'$unwind': '$profile'},
    {'$project': {'profile': 1, '_id': False}},
]

res = db[collection].aggregate(pipeline)
list(res)

In [25]:
pipeline = [
    {'$unwind': '$profile'},
    {'$project': {
        'aim': '$profile.aim', 
        'highest_education': '$profile.highest_education', 
        'sci_tech_exp': '$profile.sci_tech_exp', 
        'phase_transform_exp': '$profile.phase_transform_exp',
        '_id': 0
        }
    },
]

res = db[collection].aggregate(pipeline)
profile_df = pd.DataFrame(list(res))
profile_df['aim'].value_counts()

Experimentation     62
Engineering Work    20
Research            18
Development          1
Name: aim, dtype: int64

In [26]:
list(profile_df['aim'].unique())

['Development', 'Research', 'Experimentation', 'Engineering Work']

In [None]:
list(profile_df['aim'].value_counts())

In [None]:
# layout = go.Layout(
#     title='User Profile Aim',
#     xaxis=dict(title='User Aims'),
#     yaxis=dict(title='Count')
# )

# fig = go.Figure(layout=layout)
fig = make_subplots(
    rows=2, 
    cols=2,
    subplot_titles=[
        'Aim', 
        'Highest Education', 
        'Science Tech. Experience',
        'Phase Transform Experience'
    ]
)

trace_aim = go.Bar(x=list(profile_df['aim'].unique()), y=list(profile_df['aim'].value_counts()))
trace_edu = go.Bar(x=list(profile_df['highest_education'].unique()), y=list(profile_df['highest_education'].value_counts()))
trace_sci = go.Bar(x=list(profile_df['sci_tech_exp'].unique()), y=list(profile_df['sci_tech_exp'].value_counts()))
trace_pha = go.Bar(x=list(profile_df['phase_transform_exp'].unique()), y=list(profile_df['phase_transform_exp'].value_counts()))

fig.add_trace(trace_aim, row=1, col=1)
fig.add_trace(trace_edu, row=1, col=2)
fig.add_trace(trace_sci, row=2, col=1)
fig.add_trace(trace_pha, row=2, col=2)

fig.update_layout(
    # height=800,
    # width=1200,
    showlegend=False,
    title_text="User Profile Answers"
)

py.iplot(fig, filename='user_profile_bar')
# pio.write_image(fig, file='user_profile_aim.png')

## Count

In [11]:
# Total user count
db[collection].estimated_document_count()

215

In [20]:
# Total saved simulations count
db['saved_simulations'].estimated_document_count()

1144

In [18]:
# Total feedback count
db['feedback'].estimated_document_count()

1530

In [19]:
# Total shares

db['shared_simulations'].estimated_document_count()

436

In [16]:
# Total simulations
pipeline = [
    {
        '$group': {
            '_id': None,
            'total': {
                '$sum': '$simulations_count'
            }
        }
    }
]

cursor = db[collection].aggregate(pipeline)
# count_df = pd.DataFrame(list(cursor))
# count_df
list(cursor)[0]['total']

7074

In [17]:
# Total saved user alloys

pipeline = [
    {
        '$group': {
            '_id': None,
            'total': {
                '$sum': {'$size': '$saved_alloys'}
            }
        }
    }
]

cursor = db[collection].aggregate(pipeline)
list(cursor)[0]['total']

2395

In [22]:
# Total ratings average

pipeline = [
    {'$unwind': '$ratings'},
    {
        '$group': {
            '_id': None,
            'total': {'$sum': {'$size': '$saved_alloys'}},
            'average': {'$avg': {'$sum': '$ratings.rating'}}  
        }
    }
]

cursor = db[collection].aggregate(pipeline)
list(cursor)[0]

{'_id': None, 'average': 2.9813953488372094}

## Live Login Data

In [34]:
cursor = db[collection].find(
    {
        'last_login': {'$exists': 1}
    }, 
    projection={'password': 0, '_id': False}
)
df = pd.DataFrame(list(cursor))
df.head()

Unnamed: 0,email,first_name,last_name,profile,admin_profile,simulations_count,saved_alloys,active,admin,disable_admin,verified,created,last_updated,ratings,login_data,last_login
0,andrew@neuraldev.io,Andrew,Che,"{'aim': 'Development', 'highest_education': 'B...",{'position': 'Full Stack Developer at NeuralDe...,2,"[{'_id': 5da3e495e220d454057c43f2, 'name': 'Io...",True,True,False,False,2019-10-14 02:59:33.196,2019-10-14 03:01:26.622,"[{'rating': 4, 'created_date': 2019-10-14 02:5...",[{'created_datetime': 2019-10-14 02:59:33.3650...,2019-10-14 03:01:20.240
1,ironman@avengers.io,Tony,Stark,"{'aim': 'Experimentation', 'highest_education'...",{'position': 'Genius Playboy Billionaire Phila...,1,"[{'_id': 5da3e495e220d454057c442c, 'name': 'Re...",True,True,False,False,2019-10-14 02:59:33.196,2019-10-14 03:06:49.422,"[{'rating': 4, 'created_date': 2019-10-14 02:5...",[{'created_datetime': 2019-10-14 02:59:33.3650...,2019-10-14 03:06:49.408
2,spidey@avengers.io,Peter,Parker,"{'aim': 'Engineering Work', 'highest_education...","{'position': 'Web guy', 'mobile_number': '+780...",1,"[{'_id': 5da3e495e220d454057c4467, 'name': 'Ch...",True,True,False,False,2019-10-14 02:59:33.196,2019-10-14 03:06:55.369,"[{'rating': 3, 'created_date': 2019-10-14 02:5...",[{'created_datetime': 2019-10-14 02:59:33.3650...,2019-10-14 03:06:55.321


In [None]:
pipeline = [
  {'$unwind': '$login_data'},
  {'$project': {'_id': 0, 'login_data': 1, 'email': 1}},
  {'$sort': {'login_data.created_datetime': 1}}
]

res = db[collection].aggregate(pipeline)
# login_df = pd.DataFrame(list(res))
list(res)

In [None]:
pipeline = [
  {'$unwind': '$login_data'},
  {'$project': {
      '_id': 0, 
      'created_datetime': '$login_data.created_datetime', 
    }
  },
]

res = db[collection].aggregate(pipeline)
list(res)

In [None]:
# Using graph_objects
import plotly.graph_objects as go

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv')

df['Date'].head(n=10)

In [None]:
df['AAPL.High'].head(n=10)

In [None]:
fig = go.Figure([go.Scatter(x=df['Date'], y=df['AAPL.High'])])
fig.show()

In [None]:
import plotly.graph_objects as go
import datetime

x = [datetime.datetime(year=2013, month=10, day=4),
     datetime.datetime(year=2013, month=11, day=5),
     datetime.datetime(year=2013, month=12, day=6)]

fig = go.Figure(data=[go.Scatter(x=x, y=[1, 3, 6])])
# Use datetime objects to set xaxis range
fig.update_layout(xaxis_range=[datetime.datetime(2013, 10, 17),
                               datetime.datetime(2013, 11, 20)])
fig.show()

In [None]:
pipeline = [
  {'$unwind': '$login_data'},
  {'$project': {
      '_id': 0, 
      'timestamp': '$login_data.created_datetime', 
      'user': '$email',
    }
  },
]

res = db[collection].aggregate(pipeline)


# dt_idx = pd.to_datetime()

df = pd.DataFrame(list(res))

df['timestamp'] = pd.to_datetime(df['timestamp'])
# df.set_index('timestamp', inplace=True)

df = df.groupby(pd.Grouper(key='timestamp', freq='1min')).count().dropna()

# df = df.groupby(pd.Grouper(key='timestamp', freq='60s'))
# res = (pd.DataFrame(df.index[1:]) - pd.DataFrame(df.index[:-1]))
# df = df.to_frame().reset_index()
# df.resample('T').count()

# res['timestamp'].value_counts()
df

In [None]:
fig = go.Figure()

trace = go.Scatter(x=df.index, y=df['user'])

fig.add_trace(trace)

fig.update_layout(
  showlegend=False,
  title_text="Logged in Users",
  xaxis_range=[
    datetime.datetime(2019, 10, 4),
    datetime.datetime(2019, 10, 5)
  ],
  xaxis_rangeslider_visible=True
)

py.iplot(fig, filename='user_login_timestamps')

## Logged In User Map

In [12]:
redis_uri = env.get('REDIS_URI')
client = Redis(redis_uri)
client

Redis<ConnectionPool<Connection<host=None,port=6379,db=0>>>

In [13]:
keys = client.keys(pattern=u'session*')
keys

[b'session:5d985c0b181db01c07f1464d.13a831ef8df04ab48b6dc2b0fbc58337']

In [24]:
for byte_key in keys:
  key = byte_key.decode('utf-8')
  print()
  sess_store = json.loads(client.get(key))
  print(sess_store)


{'jwt': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE1NzI4NTgxNjQsImlhdCI6MTU3MDI2NjE2NCwic3ViIjoiNWQ5ODVjMGIxODFkYjAxYzA3ZjE0NjRkIiwicm9sZSI6ImFkbWluIn0.9lfd5asdK4PKm1rMzeJ0_fcHpuE7wXjix_9qYm4QRHE', 'ip_address': '172.19.0.1', 'is_admin': True, 'user_id': '5d985c0b181db01c07f1464d', 'state': None, 'country': None, 'simulation': '{"configurations": {"is_valid": false, "method": "Li98", "grain_size": 8.0, "nucleation_start": 1.0, "nucleation_finish": 99.9, "auto_calculate_ms": true, "ms_temp": 0.0, "ms_rate_param": 0.0, "auto_calculate_bs": true, "bs_temp": 0.0, "auto_calculate_ae": true, "ae1_temp": 0.0, "ae3_temp": 0.0, "start_temp": 900, "cct_cooling_rate": 10}, "alloy_store": {"alloy_option": "single", "alloys": {"parent": null, "weld": null, "mix": null}}, "results": {}}'}


In [125]:
gapminder = px.data.gapminder().query("year == 2007")

t = gapminder['iso_alpha'].value_counts().to_dict()
t['AUS']

1

In [81]:
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

fig = go.Figure(data=go.Choropleth(
    locations = df['CODE'],
    z = df['GDP (BILLIONS)'],
    text = df['COUNTRY'],
    colorscale = 'Blues',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_tickprefix = '$',
    colorbar_title = 'GDP<br>Billions US$',
))

# fig.update_layout(
#     title_text='2014 Global GDP',
#     geo=dict(
#         showframe=False,
#         showcoastlines=False,
#         projection_type='equirectangular'
#     ),
#     annotations = [dict(
#         x=0.55,
#         y=0.1,
#         xref='paper',
#         yref='paper',
#         text='Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
#             CIA World Factbook</a>',
#         showarrow = False
#     )]
# )

# fig.show()

df['CODE'].value_counts()

VGB    2
GAB    1
TGO    1
SVK    1
GHA    1
      ..
ARG    1
QAT    1
SYC    1
PRY    1
NLD    1
Name: CODE, Length: 221, dtype: int64

In [141]:
pipeline = [
  {'$unwind': '$login_data'},
  {'$project': {
      '_id': 0, 
      'created_datetime': '$login_data.created_datetime',
      'ip_address': '$login_data.ip_address',
      'state': '$login_data.state',
      'country': '$login_data.country',
      'iso_code': '$login_data.country_iso_code',
      'continent': '$login_data.continent',
      'accuracy_radius': '$login_data.accuracy_radius',
      'timezone': '$login_data.timezone',
      'latitude': {'$arrayElemAt': [ '$login_data.geo_point.coordinates', 0 ]},
      'longitude': {'$arrayElemAt': [ '$login_data.geo_point.coordinates', 1 ]},
    }
  },
]

res = db[collection].aggregate(pipeline)
df = pd.DataFrame(list(res))

df.dropna(subset=['country', 'continent'], axis=0, inplace=True)
# cnt = df['country'].value_counts().to_dict()
# df['count'] = df[ df['country'] == cnt[] ]

df = df.groupby(
  ['latitude', 'longitude', 'country', 'continent']
).size().to_frame('count').reset_index()

# print(df.count())

df.head(n=10)

Unnamed: 0,latitude,longitude,country,continent,count
0,-37.8974,144.7444,Australia,Oceania,51
1,-37.8159,144.9669,Australia,Oceania,49
2,-37.8061,145.0015,Australia,Oceania,55
3,-37.7855,145.1246,Australia,Oceania,49
4,-37.7452,144.9641,Australia,Oceania,100
5,-34.9444,138.5926,Australia,Oceania,48
6,-33.9479,151.202,Australia,Oceania,51
7,-33.8777,151.2187,Australia,Oceania,32
8,-33.8591,151.2002,Australia,Oceania,95
9,-33.8426,150.9987,Australia,Oceania,45


In [162]:
df['country'].tolist()

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [93]:
s_df = df[df['country'] == 'Singapore']
s_df['ip_address'].value_counts()

32.60.52.20    55
Name: ip_address, dtype: int64

In [94]:
# df_group.set_index('continent', inplace=True)
df_group.reset_index()
df_group.head()

Unnamed: 0_level_0,created_datetime,ip_address,continent,accuracy_radius,timezone,latitude,longitude,state
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Australia,946,946,946,946,946,946,946,846
Singapore,55,55,55,55,55,55,55,0


In [122]:
# fig = go.Figure()

# trace = go.Scatter(x=df.index, y=df['user'])

# fig.add_trace(trace)

# fig.update_layout(
#   showlegend=False,
#   title_text="Logged in Users",
#   xaxis_range=[
#     datetime.datetime(2019, 10, 4),
#     datetime.datetime(2019, 10, 5)
#   ],
#   xaxis_rangeslider_visible=True
# )

# py.iplot(fig, filename='user_login_timestamps')

fig = px.scatter_geo(
  df,
  locations='iso_code',
  color='continent',
  hover_name='country',
  size='count',
  projection="natural earth"
)

py.iplot(fig, filename='user_login_map')

In [163]:
# mapbox_access_token = open(
#   "/home/codeninja/Arclytics/arclytics_sim/.mapbox_token"
# ).read()
mapbox_access_token = 'pk.eyJ1IjoiY29kZW5pbmphNTUiLCJhIjoiY2sxZG5kb2JvMDV3dzNsbXV6dmhwd2xkaCJ9.3yH0KfKaMVn0MHNqgq7g5g'

fig = go.Figure(go.Densitymapbox(
        lat=df['latitude'],
        lon=df['longitude'],
        z=df['count'],
        radius=10,
#         mode='markers',
#         marker=go.scattermapbox.Marker(
#             size=8,
#             color='rgb(254, 67, 54)',
#             opacity=0.8
#         ),
        text=df['count'],
    ))

fig.update_layout(
    hovermode='closest',
    mapbox=go.layout.Mapbox(
        accesstoken=mapbox_access_token,
#         bearing=0,
        
        center=go.layout.mapbox.Center(
            lat=0,
            lon=180
        ),
#         pitch=0,
        zoom=1
    )
)

fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

# fig.show()
py.iplot(fig, filename='user_login_map')

SyntaxError: invalid syntax (<ipython-input-163-c0b895662ed7>, line 17)