In [1]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read the connection settings from a local JSON file; the keys used below are
# 'user', 'password', 'host', 'port' and 'database'.
with open("config.json") as config_file:
    config = json.load(config_file)

# Use the "postgresql" dialect name: the legacy "postgres" alias was deprecated
# and is no longer accepted by SQLAlchemy 1.4+.
# NOTE(review): the credentials are interpolated verbatim; values containing
# characters such as '@', ':' or '/' would need URL-escaping first - confirm
# that the configured values are safe.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    config['user'],
    config['password'],
    config['host'],
    config['port'],
    config['database'],
)

# Every connection opened by this engine gets its search_path set to this
# schema, so later cells can reference tables without a schema prefix.
dbschema = 'augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [2]:
# List the column names of the `contributors` table.
# information_schema.columns covers every schema the role can see, so filtering
# on table_name alone can mix in columns of a same-named table elsewhere.
# current_schema() is the first schema on the session's search_path (set when
# the engine was created), which pins the lookup to the intended table.
# ordinal_position returns the columns in their declared order.
affil_query = """
SELECT
   column_name
FROM
   information_schema.columns
WHERE
   table_name = 'contributors'
   AND table_schema = current_schema()
ORDER BY
   ordinal_position
;
"""
affil = pd.read_sql_query(affil_query, con=engine)
print(affil)

               column_name
0                 cntrb_id
1              cntrb_login
2              cntrb_email
3            cntrb_company
4         cntrb_created_at
5               cntrb_type
6               cntrb_fake
7            cntrb_deleted
8               cntrb_long
9                cntrb_lat
10      cntrb_country_code
11             cntrb_state
12              cntrb_city
13          cntrb_location
14         cntrb_canonical
15              gh_user_id
16                gh_login
17                  gh_url
18             gh_html_url
19              gh_node_id
20           gh_avatar_url
21          gh_gravatar_id
22        gh_followers_url
23        gh_following_url
24            gh_gists_url
25          gh_starred_url
26    gh_subscriptions_url
27    gh_organizations_url
28            gh_repos_url
29           gh_events_url
30  gh_received_events_url
31                 gh_type
32           gh_site_admin
33             tool_source
34            tool_version
35             data_source
3

In [3]:
# List the column names of the `contributors_aliases` table.
# information_schema.columns covers every schema the role can see, so filtering
# on table_name alone can mix in columns of a same-named table elsewhere.
# current_schema() is the first schema on the session's search_path (set when
# the engine was created), which pins the lookup to the intended table.
# ordinal_position returns the columns in their declared order.
affilx_query = """
SELECT
   column_name
FROM
   information_schema.columns
WHERE
   table_name = 'contributors_aliases'
   AND table_schema = current_schema()
ORDER BY
   ordinal_position
;
"""
affilx = pd.read_sql_query(affilx_query, con=engine)
print(affilx)

            column_name
0              cntrb_id
1            cntrb_a_id
2       canonical_email
3           alias_email
4          cntrb_active
5   cntrb_last_modified
6           tool_source
7          tool_version
8           data_source
9  data_collection_date


In [4]:
# Fetch every (canonical_email, alias_email) pair stored in
# contributors_aliases and print the resulting frame.
# The SQL text has no placeholders, so a plain string literal is sufficient.
affilx2_query = (
    """
SELECT
   canonical_email, alias_email
FROM
   contributors_aliases
;
    """
)
affilx2 = pd.read_sql_query(sql=affilx2_query, con=engine)
print(affilx2)

                        canonical_email                  alias_email
0                mikeb@squaremobius.net           mikeb@rabbitmq.com
1                mikeb@squaremobius.net             mikeb@lshift.net
2     jean-sebastien.pedron@dumbbell.fr  jean-sebastien@rabbitmq.com
3              michael@clojurewerkz.org         michael@rabbitmq.com
4                           sam@soff.es                  sam@soff.es
...                                 ...                          ...
2628      paul.johnston@portswigger.net               pcj@pubref.org
2629              kevin.clark@gmail.com        kevin.clark@gmail.com
2630              nfirvine@nfirvine.com        nfirvine@nfirvine.com
2631              jasonyoung@google.com        jasonyoung@google.com
2632              loic.porte@bibabox.fr        loic.porte@bibabox.fr

[2633 rows x 2 columns]


In [5]:
# Fetch the domain -> affiliation mapping rows from contributor_affiliations
# and print the resulting frame.
# The SQL text has no placeholders, so a plain string literal is sufficient.
affil2_query = (
    """
SELECT
   ca_domain, ca_affiliation
FROM
   contributor_affiliations
;
    """
)
affil2 = pd.read_sql_query(sql=affil2_query, con=engine)
print(affil2)

     ca_domain ca_affiliation
0  goggins.com        goggins


In [6]:
# Distinct company strings that mention "pivotal".
# ILIKE makes the match case-insensitive; plain LIKE is case-sensitive in
# PostgreSQL and would skip spellings such as "Pivotal" or "PIVOTAL".
# SELECT DISTINCT states the de-duplication intent directly, and ORDER BY keeps
# the listing stable between runs.
# NOTE(review): the doubled '%%' is kept from the original - presumably it is
# the escaped form of a literal '%' for the driver's %-style parameter
# handling; verify against the pandas/SQLAlchemy versions in use.
affil3_query = """
SELECT DISTINCT
   cntrb_company
FROM
   contributors
WHERE
   cntrb_company ILIKE '%%pivotal%%'
ORDER BY
   cntrb_company
;
"""
affil3 = pd.read_sql_query(affil3_query, con=engine)
print(affil3)

                                        cntrb_company
0                          @cloudfoundry @pivotal-cf 
1                                  https://pivotal.io
2                          http://www.pivotallabs.com
3                                            @pivotal
4                                           @pivotal 
5                                             pivotal
6                               pivotal almost vmware
7                                         @pivotal-cf
8                                        @pivotal-cf 
9                           @pivotal-cf @cloudfoundry
10                         @pivotal-cf @cloudfoundry 
11                         @pivotal-cf @greenplum-db 
12               @pivotal-cf @pivotal-cf-experimental
13                      @pivotal-cf @pivotalservices 
14                                @pivotal-cf @vmware
15                          @pivotal / @cloudfoundry 
16                             @pivotal @cloudfoundry
17                          

In [7]:
# Distinct (email, company) pairs whose company string mentions "pivotal".
# ILIKE makes the match case-insensitive; plain LIKE is case-sensitive in
# PostgreSQL and would skip spellings such as "Pivotal" or "PIVOTAL".
# SELECT DISTINCT states the de-duplication intent directly, and ORDER BY keeps
# the listing stable between runs.
# NOTE(review): the doubled '%%' is kept from the original - presumably it is
# the escaped form of a literal '%' for the driver's %-style parameter
# handling; verify against the pandas/SQLAlchemy versions in use.
affil4_query = """
SELECT DISTINCT
   cntrb_email, cntrb_company
FROM
   contributors
WHERE
   cntrb_company ILIKE '%%pivotal%%'
ORDER BY
   cntrb_email, cntrb_company
;
"""
affil4 = pd.read_sql_query(affil4_query, con=engine)
# Lift pandas' row/column display limits for this print only, so the whole
# result is shown. The output contains contributor e-mail addresses - clear it
# before sharing the notebook.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(affil4)

                                       cntrb_email  \
0                           abhayashenoy@gmail.com   
1                                abrown@pivotal.io   
2                             acbdesigns@gmail.com   
3                              aedstrom@pivotal.io   
4                           ajaganathan@pivotal.io   
5           ajaganathan@pivotals-MacBook-Pro.local   
6                         alexey.nesterov@live.com   
7                       alex@localhost.localdomain   
8                           alfusinigo.j@gmail.com   
9                          andrewedstrom@gmail.com   
10                               andymoe@gmail.com   
11                 aniruth.parthasarathy@gmail.com   
12                              aslynko@pivotal.io   
13                             astandke@pivotal.io   
14                              asuraci@pivotal.io   
15                         asuraci@pivotallabs.com   
16                           aussielunix@gmail.com   
17           austinbrown@Aus

In [8]:
# Distinct contributor e-mail addresses containing "hept".
# ILIKE makes the match case-insensitive; plain LIKE is case-sensitive in
# PostgreSQL and would skip addresses written with capital letters.
# SELECT DISTINCT states the de-duplication intent directly, and ORDER BY keeps
# the listing stable between runs.
# NOTE(review): the doubled '%%' is kept from the original - presumably it is
# the escaped form of a literal '%' for the driver's %-style parameter
# handling; verify against the pandas/SQLAlchemy versions in use.
affil5_query = """
SELECT DISTINCT
   cntrb_email
FROM
   contributors
WHERE
   cntrb_email ILIKE '%%hept%%'
ORDER BY
   cntrb_email
;
"""
affil5 = pd.read_sql_query(affil5_query, con=engine)
# Lift pandas' row/column display limits for this print only, so the whole
# result is shown. The output contains contributor e-mail addresses - clear it
# before sharing the notebook.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(affil5)

           cntrb_email
0      adam@hepton.org
1     chuck@heptio.com
2   dcooley@heptio.com
3      eric@heptio.com
4       joe@heptio.com
5     jorge@heptio.com
6  nicholas@heptio.com
7     nolan@heptio.com
