# Postgresql to Python: JSON Data

<font color=red>Mr Fugu Data Science</font>

# (◕‿◕✿)

# Purpose & Outcome:

+ Create a new table to store our Highly Nested JSON data
    + Send nested data to postgresql for querying
    + Query from Python to Postgresql


+ Outcome: learning techniques to handle JSON data within postgresql as well as interface with Python.

    + Swagger points, well just because you can do it! why not?


`----------------------------------`


If you do not want to create init or config files: do something similar to this

```python
import psycopg2
conn = psycopg2.connect("dbname=test user=postgres")  # Connect to an existing database
cur = conn.cursor()  # Open a cursor to perform database operations
```

---------------------------------

In [1]:
import psycopg2             # python->psql connection

# from psycopg2.extras import Json
import psycopg2.extras
import psycopg2.extensions

import pandas as pd         # create dataframes 
import calendar             # convert int to month

import json
# Import the 'config' function from the config_user_dta.py file:
from config_user_dta import config

In [2]:
# Get the config params from config() (imported from config_user_dta.py).
# The result is unpacked with ** below, so it is presumably a dict of
# connection settings (host, dbname, user, ...) -- confirm against that file.
params_ = config()

# Connect to the Postgres_DB:
conn = psycopg2.connect(**params_)

# Create new_cursor allowing us to write Python to execute PSQL:
# (this module-level cursor is the one the query cells reuse)
cur = conn.cursor()

# With autocommit on, psycopg2 runs each statement outside an explicit
# transaction, so it takes effect immediately. Leave it off when several
# statements must succeed or roll back together (see the psycopg2 docs).
conn.autocommit = True  # read documentation understanding when to Use & NOT use (TRUE)

In [36]:
# Create a Table to store JSON data:

def create_staging_table(cursor):
    """Recreate the ``nested_tweets`` staging table from scratch.

    Drops any existing ``nested_tweets`` table, then creates it again as an
    UNLOGGED table with a serial primary key ``ID`` and a ``jsonb`` column
    named ``retweeted``. Both statements are sent in one ``execute`` call.

    Args:
        cursor: an open database cursor (anything exposing ``execute(sql)``).
    """
    staging_ddl = """
        DROP TABLE IF EXISTS nested_tweets;
        CREATE UNLOGGED TABLE nested_tweets (
        ID serial NOT NULL PRIMARY KEY,
     retweeted jsonb );"""
    cursor.execute(staging_ddl)
# ID serial NOT NULL PRIMARY KEY


In [37]:
# Send the Schema to PSQL
# A short-lived cursor is opened just for the DDL; per the psycopg2 docs the
# `with` block closes that cursor on exit (the connection itself stays open).
with conn.cursor() as cursor:
    create_staging_table(cursor)

# Function to deal with the JSON file: I convert it to a DataFrame and then send it off to PSQL
(mainly because I was having issues sending the data).

In [38]:

# Read the tweet dump from disk into a DataFrame.
twts=pd.read_json('nested_tweets04.json')
# Plain assignment: `df` and `twts` now refer to the same DataFrame object.
df=twts
# Add (or overwrite) a 'retweeted' column using the frame itself as the value.
# NOTE(review): assigning a whole DataFrame to one column presumably relies on
# the frame having a single column at this point -- confirm against the file.
df['retweeted']=df
# Keep every column from position 1 onward, i.e. drop the first column.
df=df.iloc[:,1:]

# df=df.iloc[:,1:]
def fcn(df, table, cur):
    """Replace the contents of ``table`` with the rows of ``df``.

    When ``df`` has at least one row, the table is truncated (so re-running
    the load does not create duplicates) and every row of ``df`` is inserted
    with ``psycopg2.extras.execute_batch``. The work is then committed on the
    connection that owns ``cur``. An empty ``df`` leaves the table untouched.

    All statements run on the cursor that was passed in, and the commit goes
    through ``cur.connection``, so the function does not depend on a
    module-level ``conn`` and does not open an extra cursor that is never
    closed.

    Args:
        df: pandas DataFrame whose column names match the target columns.
        table: name of the target table. It is interpolated into the SQL
            text as-is (identifiers cannot be bound as ``%s`` parameters),
            so only pass trusted names.
        cur: open psycopg2 cursor used for the truncate and the inserts.
    """
    if len(df) > 0:
        column_names = [str(name) for name in df]
        # "col1,col2,..." for the INSERT column list
        columns = ",".join(column_names)

        # one %s placeholder per column; psycopg2 binds the values itself
        placeholders = ",".join(["%s"] * len(column_names))

        insert_stmt = "INSERT INTO {} ({}) VALUES({})".format(
            table, columns, placeholders
        )
        cur.execute("truncate " + table + ";")  # avoid uploading duplicate data
        psycopg2.extras.execute_batch(cur, insert_stmt, df.values)
    # Commit on the cursor's own connection rather than a global one.
    cur.connection.commit()

# Load the prepared DataFrame into the nested_tweets table, passing the
# module-level cursor `cur`.
fcn(df,'nested_tweets',cur)




# Investigate Our Data:

In [39]:
# cur.execute('select retweeted from nested_tweets')

# Fetch every row (id plus the jsonb `retweeted` column) to eyeball what was
# stored; the jsonb values come back as Python dicts/lists (see output below).
cur.execute("""
SELECT *
FROM nested_tweets;
""")
cur.fetchall()

[(1, {'retweeted': ['Nothing_retweeted']}),
 (2,
  {'retweeted': [{'id': 1282484990459080705,
     'geo': None,
     'lang': 'fr',
     'text': 'Indigenous rights. LGBTQ. Racism. Climate change. Orange Man Bad. https://t.co/eOouenEsqf',
     'user': {'id': 38885578,
      'url': 'https://t.co/x5G93GBB51',
      'lang': None,
      'name': 'Gary Lamphier',
      'id_str': '38885578',
      'entities': {'url': {'urls': [{'url': 'https://t.co/x5G93GBB51',
          'indices': [0, 23],
          'display_url': 'lamphier.ca',
          'expanded_url': 'http://www.lamphier.ca/'}]},
       'description': {'urls': []}},
      'location': 'Edmonton, Alberta',
      'verified': False,
      'following': None,
      'protected': False,
      'time_zone': None,
      'created_at': 'Sat May 09 16:57:33 +0000 2009',
      'utc_offset': None,
      'description': 'Owner Lamphier Communications. Former biz columnist, lifelong news junkie, sports fan, beer lover, proud hubby, dad & grandad.',
      'ge

# Query: `find only non-retweeted entries with row ID`

In [40]:
# Cast the jsonb column to text and keep only rows whose text contains the
# 'Nothing_retweeted' marker; the whole row (id + jsonb) is returned.
cur.execute("""
select *
from  nested_tweets
where retweeted::text like'%Nothing_retweeted%';
""")
cur.fetchall()

[(1, {'retweeted': ['Nothing_retweeted']}),
 (3, {'retweeted': ['Nothing_retweeted']}),
 (4, {'retweeted': ['Nothing_retweeted']}),
 (8, {'retweeted': ['Nothing_retweeted']}),
 (16, {'retweeted': ['Nothing_retweeted']}),
 (17, {'retweeted': ['Nothing_retweeted']}),
 (20, {'retweeted': ['Nothing_retweeted']}),
 (21, {'retweeted': ['Nothing_retweeted']}),
 (23, {'retweeted': ['Nothing_retweeted']}),
 (36, {'retweeted': ['Nothing_retweeted']}),
 (37, {'retweeted': ['Nothing_retweeted']}),
 (43, {'retweeted': ['Nothing_retweeted']}),
 (45, {'retweeted': ['Nothing_retweeted']})]

# Query: using text matching, to find any entry with (created_at)

In [41]:
# Same text-cast approach: keep rows whose JSON text contains the substring
# 'created' anywhere (this also matches keys such as 'created_at').
cur.execute("""
select *
from  nested_tweets
where retweeted::text like'%created%';
""")
cur.fetchall()

[(2,
  {'retweeted': [{'id': 1282484990459080705,
     'geo': None,
     'lang': 'fr',
     'text': 'Indigenous rights. LGBTQ. Racism. Climate change. Orange Man Bad. https://t.co/eOouenEsqf',
     'user': {'id': 38885578,
      'url': 'https://t.co/x5G93GBB51',
      'lang': None,
      'name': 'Gary Lamphier',
      'id_str': '38885578',
      'entities': {'url': {'urls': [{'url': 'https://t.co/x5G93GBB51',
          'indices': [0, 23],
          'display_url': 'lamphier.ca',
          'expanded_url': 'http://www.lamphier.ca/'}]},
       'description': {'urls': []}},
      'location': 'Edmonton, Alberta',
      'verified': False,
      'following': None,
      'protected': False,
      'time_zone': None,
      'created_at': 'Sat May 09 16:57:33 +0000 2009',
      'utc_offset': None,
      'description': 'Owner Lamphier Communications. Former biz columnist, lifelong news junkie, sports fan, beer lover, proud hubby, dad & grandad.',
      'geo_enabled': False,
      'screen_name': 'lam

# Find all entries where you have (created_at)

In [75]:
# jsonb_array_elements() expands the array stored under the 'retweeted' key
# into one row per element; -> 'created_at' then pulls that key as jsonb.
# Casting the jsonb value to text keeps the JSON double quotes (visible in
# the output below); elements without that key come back as None.
cur.execute("""
select (jsonb_array_elements(retweeted->'retweeted')->'created_at')::text as trx_id 
from nested_tweets;
""")
cur.fetchall()

[(None,),
 ('"Mon Jul 13 01:20:23 +0000 2020"',),
 (None,),
 (None,),
 ('"Sat Jul 11 18:55:25 +0000 2020"',),
 ('"Sun Jul 12 22:52:00 +0000 2020"',),
 ('"Sun Jul 12 19:02:50 +0000 2020"',),
 (None,),
 ('"Fri Jul 10 22:11:45 +0000 2020"',),
 ('"Mon Sep 30 03:53:00 +0000 2019"',),
 ('"Sun Jul 12 15:48:20 +0000 2020"',),
 ('"Mon Jul 13 00:37:16 +0000 2020"',),
 ('"Mon Jul 13 00:59:00 +0000 2020"',),
 ('"Mon Jul 13 01:15:32 +0000 2020"',),
 ('"Mon Jul 13 01:15:11 +0000 2020"',),
 (None,),
 (None,),
 ('"Sun Jul 12 23:34:27 +0000 2020"',),
 ('"Sun Jul 12 11:53:07 +0000 2020"',),
 (None,),
 (None,),
 ('"Sun Jul 12 23:34:27 +0000 2020"',),
 (None,),
 ('"Wed Jul 08 20:07:48 +0000 2020"',),
 ('"Sun Jul 12 15:48:20 +0000 2020"',),
 ('"Sun Jul 12 12:21:53 +0000 2020"',),
 ('"Mon Jul 13 00:58:01 +0000 2020"',),
 ('"Sun Jul 12 14:18:00 +0000 2020"',),
 ('"Sun Jul 12 19:33:22 +0000 2020"',),
 ('"Sun Jul 12 12:01:25 +0000 2020"',),
 ('"Mon Jul 13 01:15:11 +0000 2020"',),
 ('"Sun Jul 12 17:22:12 +0000 

# Find the nested `entities` portion before we retrieve `urls`

Workflow: 

`'retweeted': [{'entities': {'urls': [{'url': 'the_html'}]}}]`

In [109]:
# For each element of the 'retweeted' array, return the row id together with
# the element's 'entities' object rendered as text (None when it is absent).
cur.execute("""
select id, (jsonb_array_elements(retweeted->'retweeted')->'entities')::text as trx_id 
from nested_tweets;
""")
cur.fetchall()

[(1, None),
 (2,
  '{"urls": [{"url": "https://t.co/eOouenEsqf", "indices": [66, 89], "display_url": "twitter.com/carmrunco/stat…", "expanded_url": "https://twitter.com/carmrunco/status/1282483761406500865"}], "symbols": [], "hashtags": [], "user_mentions": []}'),
 (3, None),
 (4, None),
 (5,
  '{"urls": [{"url": "https://t.co/WcN9JPCBKG", "indices": [117, 140], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1282025722664517632"}], "symbols": [], "hashtags": [], "user_mentions": []}'),
 (6,
  '{"urls": [{"url": "https://t.co/pazSNK0JM6", "indices": [117, 140], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1282447649183084545"}], "symbols": [], "hashtags": [], "user_mentions": []}'),
 (7,
  '{"urls": [{"url": "https://t.co/6NRHU7Cc5J", "indices": [117, 140], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1282389974667743234"}], "symbols": []

# Start parsing to get into `urls`

In [113]:
# One level deeper: from each array element take 'entities', then its 'urls'
# array, and return that array as text next to the row id.
cur.execute("""
select id, 
(jsonb_array_elements(retweeted->'retweeted')->'entities'->'urls')::text as trx_id 
from nested_tweets;
""")
cur.fetchall()


[(1, None),
 (2,
  '[{"url": "https://t.co/eOouenEsqf", "indices": [66, 89], "display_url": "twitter.com/carmrunco/stat…", "expanded_url": "https://twitter.com/carmrunco/status/1282483761406500865"}]'),
 (3, None),
 (4, None),
 (5,
  '[{"url": "https://t.co/WcN9JPCBKG", "indices": [117, 140], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1282025722664517632"}]'),
 (6,
  '[{"url": "https://t.co/pazSNK0JM6", "indices": [117, 140], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1282447649183084545"}]'),
 (7,
  '[{"url": "https://t.co/6NRHU7Cc5J", "indices": [117, 140], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1282389974667743234"}]'),
 (8, None),
 (9,
  '[{"url": "https://t.co/rYOR1utbdB", "indices": [116, 139], "display_url": "twitter.com/i/web/status/1…", "expanded_url": "https://twitter.com/i/web/status/1281712740621520896"}]'),
 (10,

# Finally, get the urls nested within:

In [188]:
# Walk the full path: array element -> 'entities' -> 'urls' -> first entry
# (index 0) -> 'url'. Every step uses ->, so the value is still jsonb and the
# text cast keeps the surrounding double quotes (see the output below).
cur.execute("""
select id, 
(jsonb_array_elements(retweeted->'retweeted')->'entities'->'urls'->0->'url')::text as trx_id
from nested_tweets;
""")
cur.fetchall()


# [r[1] for r in cur.fetchall()]

[(1, None),
 (2, '"https://t.co/eOouenEsqf"'),
 (3, None),
 (4, None),
 (5, '"https://t.co/WcN9JPCBKG"'),
 (6, '"https://t.co/pazSNK0JM6"'),
 (7, '"https://t.co/6NRHU7Cc5J"'),
 (8, None),
 (9, '"https://t.co/rYOR1utbdB"'),
 (10, '"https://t.co/7SgbXu6fIZ"'),
 (11, '"https://t.co/DSHM4UsCOP"'),
 (12, '"https://t.co/OcIpAM85Qz"'),
 (13, '"https://t.co/qnit23AIjZ"'),
 (14, '"https://t.co/11QSJy08hx"'),
 (15, '"https://t.co/umiP9PIpWF"'),
 (16, None),
 (17, None),
 (18, '"https://t.co/1hW2ml8fTZ"'),
 (19, '"https://t.co/o4MhxxJ4u4"'),
 (20, None),
 (21, None),
 (22, '"https://t.co/1hW2ml8fTZ"'),
 (23, None),
 (24, '"https://t.co/e5s0wNxWww"'),
 (25, '"https://t.co/DSHM4UsCOP"'),
 (26, None),
 (27, '"https://t.co/SVgojvj8vf"'),
 (28, '"https://t.co/N9SlIR2cdg"'),
 (29, '"https://t.co/3yUEwhaMZD"'),
 (30, '"https://t.co/KHS3JYIyiK"'),
 (31, '"https://t.co/umiP9PIpWF"'),
 (32, '"https://t.co/aTbC7Y1nWH"'),
 (33, '"https://t.co/Ku7dkrrzvi"'),
 (34, '"https://t.co/r8U2nxOiFc"'),
 (35, '"https:/

# Retrieve more than one field from a deeply nested structure (a list of dictionaries that themselves contain lists of dictionaries)

In [196]:
# Return two fields from the same nested path: the short 'url' and the
# 'expanded_url' of the first 'urls' entry. Each column expands the
# 'retweeted' array with its own jsonb_array_elements() call.
cur.execute("""
select id,
(jsonb_array_elements(retweeted->'retweeted')->'entities'->'urls'->0->'url')::text as trx_id
,(jsonb_array_elements(retweeted->'retweeted')->'entities'->'urls'->0->'expanded_url')::text
from nested_tweets;
""")
cur.fetchall()



# (jsonb_array_elements(retweeted->'retweeted')->'entities'->'urls')::text

[(1, None, None),
 (2,
  '"https://t.co/eOouenEsqf"',
  '"https://twitter.com/carmrunco/status/1282483761406500865"'),
 (3, None, None),
 (4, None, None),
 (5,
  '"https://t.co/WcN9JPCBKG"',
  '"https://twitter.com/i/web/status/1282025722664517632"'),
 (6,
  '"https://t.co/pazSNK0JM6"',
  '"https://twitter.com/i/web/status/1282447649183084545"'),
 (7,
  '"https://t.co/6NRHU7Cc5J"',
  '"https://twitter.com/i/web/status/1282389974667743234"'),
 (8, None, None),
 (9,
  '"https://t.co/rYOR1utbdB"',
  '"https://twitter.com/i/web/status/1281712740621520896"'),
 (10,
  '"https://t.co/7SgbXu6fIZ"',
  '"https://twitter.com/i/web/status/1178518077643866117"'),
 (11,
  '"https://t.co/DSHM4UsCOP"',
  '"https://twitter.com/i/web/status/1282341027328569345"'),
 (12,
  '"https://t.co/OcIpAM85Qz"',
  '"https://twitter.com/i/web/status/1282474137412407298"'),
 (13,
  '"https://t.co/qnit23AIjZ"',
  '"https://twitter.com/i/web/status/1282479606667378688"'),
 (14,
  '"https://t.co/11QSJy08hx"',
  '"https:

+ What else can be done to further this exercise?
    + First: we could figure out how to iterate through and find nested keys.
    + Second: find a way to nest the array queries or run as nested queries to store data
    + Third: if this is a frequent operation make a function or store as a new table 

# <font color=red>Like</font>, Share &
# <font color=red>SUB</font>scribe

`--------------------------------------`

# Citations & Help:

# ◔̯◔

https://www.postgresqltutorial.com/postgresql-json/

https://stackoverflow.com/questions/49960410/import-json-file-into-postgres-table-with-single-json-column

https://dba.stackexchange.com/questions/172746/query-postgres-json-array-for-values-in-text

https://stackoverflow.com/questions/34012146/query-elements-in-a-nested-array-of-a-json-object-in-postgresql-9-4-or-9-5

https://dba.stackexchange.com/questions/150783/query-against-json-array-in-postgresql

https://html.developreference.com/article/10287780/How+to+query+objects+in+nested+json+in+postgres (obscure but helped)

https://medium.com/@sivakumarvadivelu/querying-json-with-postgresql-9e72f808bdda