In [1]:
import sqlalchemy as sa
from lagoon.db.connection import get_session
from lagoon.db import schema as sch
from lagoon.ml.common import utils
from lagoon.ml.config import *
import numpy as np
import pandas as pd
import arrow
from tqdm import tqdm
import matplotlib.pyplot as plt

sess = get_session().__enter__()

## Things present in the graph

In [2]:
## Entities
entities = sess.query(sch.Entity) #this is a query object
print(entities.count())
# len(entities) is invalid, however, indexing works
print(entities[:3])

# entities_list = entities.all() #the all() converts a query to a list
# print(len(entities_list))
# NOTE: dealing with lists takes a lot of time, so stick to queries.
# NOTE: When iterating, i.e. `for entity in entities`, time taken is the same for a) queries and b) queries converted to lists using .all(). So, stick to queries.

1000450
[<lagoon.db.schema.Entity 1: EntityTypeEnum.git_commit bb3fdcfe95>, <lagoon.db.schema.Entity 2: EntityTypeEnum.git_commit a8ae7a5613>, <lagoon.db.schema.Entity 3: EntityTypeEnum.git_commit 276a3a6a16>]


In [3]:
## Observations
observations = sess.query(sch.Observation)
print(observations.count())

4051745


In [4]:
## Computed attributes
computed_attrs = sess.query(sch.ComputedAttrs)
print(computed_attrs.count())

929921


In [37]:
# Properties
e = entities[0]
for attr in dir(e):
    if not attr.startswith('__'):
        print(f"'{attr}': {getattr(e,attr)}")
print("_______________________________________________________")
ob = observations[0]
for attr in dir(ob):
    if not attr.startswith('__'):
        print(f"'{attr}': {getattr(ob,attr)}")
print("_______________________________________________________")
ca = computed_attrs[0]
for attr in dir(ca):
    if not attr.startswith('__'):
        print(f"'{attr}': {getattr(ca,attr)}")

'_sa_class_manager': <ClassManager of <class 'lagoon.db.schema.Entity'> at 10e2e45e0>
'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x13990be20>
'_sa_registry': <sqlalchemy.orm.decl_api.registry object at 0x10e295a60>
'asdict': <bound method DataClassMixin.asdict of <lagoon.db.schema.Entity 1: EntityTypeEnum.git_commit bb3fdcfe95>>
'attrs': {'time': 1636140101.0, 'message': 'Python 3.9.8\n', 'commit_sha': 'bb3fdcfe95b9aeed13b7201ffbc634752ad8ecc9'}
'batch': Batch(id=1, resource='ingest-git-github.com/python/cpython.git', ingest_time=datetime.datetime(2021, 11, 24, 21, 42, 27, 889401), revision=None)
'batch_id': 1
'computed_attrs': SELECT computed_attrs.id AS computed_attrs_id, attrs_base.id AS attrs_base_id, attrs_base.super_type AS attrs_base_super_type, attrs_base.attrs AS attrs_base_attrs, computed_attrs.batch_id AS computed_attrs_batch_id, computed_attrs.obj_id AS computed_attrs_obj_id 
FROM attrs_base JOIN computed_attrs ON computed_attrs.id = attrs_base.id 


## Fused Entities and Observations

In [2]:
## Get fused entities and observations
fused_entities = sess.query(sch.FusedEntity)
print(fused_entities.count())
fused_observations = sess.query(sch.FusedObservation)
print(fused_observations.count())

986139
4051738


In [11]:
fe = fused_entities[0]
for attr in dir(fe):
    if not attr.startswith('__'):
        try:
            print(f"'{attr}': {getattr(fe,attr)}")
        except:
            pass
print("_______________________________________________________")
fob = fused_observations[0]
for attr in dir(fob):
    if not attr.startswith('__'):
        print(f"'{attr}': {getattr(fob,attr)}")

'_sa_class_manager': <ClassManager of <class 'lagoon.db.schema_fused.FusedEntity'> at 10e325ae0>
'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x1399cde80>
'_sa_registry': <sqlalchemy.orm.decl_api.registry object at 0x10e295a60>
'asdict': <bound method DataClassMixin.asdict of <lagoon.db.schema_fused.FusedEntity 82710: EntityTypeEnum.git_commit dab46f7250>>
'attrs': {'time': 1336997840.0, 'message': 'null merge\n', 'commit_sha': 'dab46f72501750dba0c36e26d8b6679d0d9d5f54'}
'attrs_sources': [<lagoon.db.schema.Entity 82710: EntityTypeEnum.git_commit dab46f7250>, ComputedAttrs(id=13054398, super_type=<SuperTypeEnum.computed_attrs: 'computed_attrs'>, attrs={}, batch_id=29, obj_id=82710)]
'fusions': [EntityFusion(id_lowest=82710, id_other=82710, comment=None)]
'id': 82710
'metadata': MetaData()
'name': dab46f7250
'obs_hops': <bound method FusedEntity.obs_hops of <lagoon.db.schema_fused.FusedEntity 82710: EntityTypeEnum.git_commit dab46f7250>>
'registry': <sqlalchemy.orm

## Hops

In [13]:
# Get observations k hops out
some_entity = sess.query(sch.FusedEntity).get(82710)
for k in [1,2,3]:
    print(len(some_entity.obs_hops(k))) #these are lists
print(some_entity.obs_hops(1)[0])

2
19021
107004
<lagoon.db.schema_fused.FusedObservation 541694: (ObservationTypeEnum.created@2012-05-14, <lagoon.db.schema_fused.FusedEntity 132225: EntityTypeEnum.person Martin v. Löwis <martin@v.loewis.de>>, <lagoon.db.schema_fused.FusedEntity 82710: EntityTypeEnum.git_commit dab46f7250>)>


In [18]:
# Restrict by time
obs = some_entity.obs_hops(2, time_min=arrow.get('2012-01-01').datetime, time_max=arrow.get('2012-12-31').datetime)
print(len(obs))

706


In [32]:
# obs_hops is a better way instead of getting sources and destinations separately

commits = sess.query(sch.FusedEntity).where(sch.FusedEntity.type == sch.EntityTypeEnum.git_commit)
commit = commits[1]
# print(commit.obs_as_src).count() #this will NOT work since `'FusedEntity.obs_as_src' is not available due to lazy='raise'`
# Likewise for dst

# Instead, do this:
obs_as_src = sess.query(sch.FusedObservation).where(sch.FusedObservation.src == commit)
obs_as_dst = sess.query(sch.FusedObservation).where(sch.FusedObservation.dst == commit)
print(obs_as_src.count() + obs_as_dst.count())

# That should be equal to this
print(len(commit.obs_hops(1)))

# Since they are equivalent, it is better to just use obs_hops instead of obs_as_src and obs_as_dst

5
5


## Batches

In [2]:
batches = sess.query(sch.Batch).all()
print(batches)

[Batch(id=1, resource='ingest-git-github.com/python/cpython.git', ingest_time=datetime.datetime(2021, 11, 24, 21, 42, 27, 889401), revision=None), Batch(id=26, resource='ocean-python.pck', ingest_time=datetime.datetime(2021, 11, 25, 7, 3, 49, 724241), revision=None), Batch(id=27, resource='ingest-python-peps', ingest_time=datetime.datetime(2021, 11, 25, 8, 15, 18, 124416), revision=None), Batch(id=28, resource='link-python-peps', ingest_time=datetime.datetime(2021, 11, 25, 11, 34, 45, 457041), revision=None), Batch(id=33, resource='toxicity_badwords', ingest_time=datetime.datetime(2021, 12, 1, 19, 32, 57, 604580), revision=None), Batch(id=50, resource='toxicity_nlp', ingest_time=datetime.datetime(2022, 2, 17, 19, 0, 51, 402608), revision=None), Batch(id=57, resource='hibp-breaches', ingest_time=datetime.datetime(2022, 2, 23, 23, 47, 57, 820284), revision=None)]


In [None]:
## OUTDATED
# Get number of entities and observations of each type in each batch
from lagoon.db.schema import EntityTypeEnum, ObservationTypeEnum
with get_session() as sess:
    for batch_id in [3,25]:
        print(f'Batch id {batch_id}:')
        for typ in EntityTypeEnum:
            entities = sess.query(sch.FusedEntity).where(sch.FusedEntity.batch_id==batch_id).where(sch.FusedEntity.type==typ)
            print(f'{typ}: {entities.count()}')
        for typ in ObservationTypeEnum:
            obs = sess.query(sch.FusedObservation).where(sch.FusedObservation.batch_id==batch_id).where(sch.FusedObservation.type==typ)
            print(f'{typ}: {obs.count()}')

In [None]:
## OUTDATED
# Check if there is batch overlap
for batch_id in [3,25]:
    print(f'Batch id {batch_id}:')
    entity_ids = set()
    obs = sess.query(sch.FusedObservation).where(sch.FusedObservation.batch_id==batch_id)
    for ob in obs:
        entity_ids.add(ob.src_id)
        entity_ids.add(ob.dst_id)
    for entity_id in entity_ids:
        entity = sess.query(sch.FusedEntity).get(entity_id)
        if entity.batch_id != batch_id:
            print(f'Mismatch: {entity.id}')

# RESULT: There is no overlap 

In [35]:
# Get start and end times for observations in a batch
# Alternatively, delete the batch portion to get this for the whole graph

print('Cpython...')
obs = (
    sess.query(sch.FusedObservation)
    .where(sch.FusedObservation.batch_id == 1)
)
times = sorted([ob.time for ob in obs])
print(times[0])
print(times[-1])

print('OCEAN...')
obs = (
    sess.query(sch.FusedObservation)
    .where(sch.FusedObservation.batch_id == 26)
)
times = sorted([ob.time for ob in obs])
print(times[0])
print(times[-1])

Cpython...
1990-08-09 14:25:15
2021-11-08 16:51:01
OCEAN...
1995-03-16 06:08:16
2021-05-01 03:07:50


## Attributes

In [51]:
# All attributes for all entities
keys = sess.query(sa.func.jsonb_object_keys(sch.FusedEntity.attrs)).distinct()
sorted(keys.all())

[('badwords_ex_googleInstantB_any',),
 ('badwords_ex_mrezvan94Harassment_Appearance',),
 ('badwords_ex_mrezvan94Harassment_Generic',),
 ('badwords_ex_mrezvan94Harassment_Intelligence',),
 ('badwords_ex_mrezvan94Harassment_Politics',),
 ('badwords_ex_mrezvan94Harassment_Racial',),
 ('badwords_ex_mrezvan94Harassment_Sexual',),
 ('badwords_ex_swearing_any',),
 ('body_text',),
 ('commit_sha',),
 ('computed_badwords_googleInstantB_any',),
 ('computed_badwords_mrezvan94Harassment_Appearance',),
 ('computed_badwords_mrezvan94Harassment_Generic',),
 ('computed_badwords_mrezvan94Harassment_Intelligence',),
 ('computed_badwords_mrezvan94Harassment_Politics',),
 ('computed_badwords_mrezvan94Harassment_Racial',),
 ('computed_badwords_mrezvan94Harassment_Sexual',),
 ('computed_badwords_swearing_any',),
 ('created',),
 ('email',),
 ('message',),
 ('name',),
 ('number',),
 ('origin_filename',),
 ('replaces',),
 ('requires',),
 ('status',),
 ('subject',),
 ('superseded_by',),
 ('time',),
 ('title',),


In [52]:
# All attributes for all message entities
mkeys = sess.query(sa.func.jsonb_object_keys(sch.FusedEntity.attrs)).where(sch.FusedEntity.type == sch.EntityTypeEnum.message).distinct()
sorted(mkeys.all())

[('badwords_ex_googleInstantB_any',),
 ('badwords_ex_mrezvan94Harassment_Appearance',),
 ('badwords_ex_mrezvan94Harassment_Generic',),
 ('badwords_ex_mrezvan94Harassment_Intelligence',),
 ('badwords_ex_mrezvan94Harassment_Politics',),
 ('badwords_ex_mrezvan94Harassment_Racial',),
 ('badwords_ex_mrezvan94Harassment_Sexual',),
 ('badwords_ex_swearing_any',),
 ('body_text',),
 ('computed_badwords_googleInstantB_any',),
 ('computed_badwords_mrezvan94Harassment_Appearance',),
 ('computed_badwords_mrezvan94Harassment_Generic',),
 ('computed_badwords_mrezvan94Harassment_Intelligence',),
 ('computed_badwords_mrezvan94Harassment_Politics',),
 ('computed_badwords_mrezvan94Harassment_Racial',),
 ('computed_badwords_mrezvan94Harassment_Sexual',),
 ('computed_badwords_swearing_any',),
 ('origin_filename',),
 ('subject',),
 ('time',)]

In [116]:
# Get number of lines changed by looking at the observations between a commit and a file
commit = sess.query(sch.FusedEntity).get(61516)
print(utils.get_neighboring_entities(sess, commit).all()) #this gives id of a file changed as 141768
print()
obs = commit.obs_hops(1) #this gives id of the obs linking to file as 447223
print(obs)
print()
ob = sess.query(sch.FusedObservation).get(447223)
print(ob.attrs)

[<lagoon.db.schema_fused.FusedEntity 130934: EntityTypeEnum.person Gregory P. Smith <greg@krypto.org>>, <lagoon.db.schema_fused.FusedEntity 141768: EntityTypeEnum.file Lib/bsddb/dbtables.py>]

[<lagoon.db.schema_fused.FusedObservation 447223: (ObservationTypeEnum.modified@2007-10-18, <lagoon.db.schema_fused.FusedEntity 61516: EntityTypeEnum.git_commit f8a2a0b5a9>, <lagoon.db.schema_fused.FusedEntity 141768: EntityTypeEnum.file Lib/bsddb/dbtables.py>)>, <lagoon.db.schema_fused.FusedObservation 447221: (ObservationTypeEnum.created@2007-10-18, <lagoon.db.schema_fused.FusedEntity 130934: EntityTypeEnum.person Gregory P. Smith <greg@krypto.org>>, <lagoon.db.schema_fused.FusedEntity 61516: EntityTypeEnum.git_commit f8a2a0b5a9>)>, <lagoon.db.schema_fused.FusedObservation 447222: (ObservationTypeEnum.committed@2007-10-18, <lagoon.db.schema_fused.FusedEntity 130934: EntityTypeEnum.person Gregory P. Smith <greg@krypto.org>>, <lagoon.db.schema_fused.FusedEntity 61516: EntityTypeEnum.git_commit f8

## Toxicity analysis

In [53]:
# Check if flagged_abuse is ever not None
fused_entities = sess.query(sch.FusedEntity)
count = 0
for fe in fused_entities:
    if fe.attrs.get('flagged_abuse'):
        print(fe.attrs)
        print("______________________________________")
        count += 1
    if count==2:
        break

# RESULT: flagged_abuse is always None

In [70]:
messages = sess.query(sch.FusedEntity).where(sch.FusedEntity.type==sch.EntityTypeEnum.message)

# No toxicity
message = sess.query(sch.FusedEntity).get(9801186)
print(message)
print()
print(message.fusions)
print()
print(message.attrs)
print()
print(message.attrs_sources)
print()
print([elem.attrs for elem in message.attrs_sources])

print("______________________________________")

# Toxicity present
message = sess.query(sch.FusedEntity).get(9801187)
print(message)
print()
print(message.fusions)
print()
print(message.attrs)
print()
print(message.attrs_sources)
print()
print([elem.attrs for elem in message.attrs_sources])

#NOTE: The first element of `attrs_sources` contains details of attrs at the time of creation of the FusedEntity, like 'subject' and 'body_text' for messages. The subsequent elements have attrs that were added later, like badwords counts.

<lagoon.db.schema_fused.FusedEntity 9801186: EntityTypeEnum.message Message <200006291349.IAA09962@cj20424-a.reston1.va.home.com>>

[EntityFusion(id_lowest=9801186, id_other=9801186, comment=None)]


[<lagoon.db.schema.Entity 9801186: EntityTypeEnum.message Message <200006291349.IAA09962@cj20424-a.reston1.va.home.com>>, ComputedAttrs(id=13883339, super_type=<SuperTypeEnum.computed_attrs: 'computed_attrs'>, attrs={}, batch_id=29, obj_id=9801186)]

______________________________________
<lagoon.db.schema_fused.FusedEntity 9801187: EntityTypeEnum.message Message <3F5A97F7.7080700@ocf.berkeley.edu>>

[EntityFusion(id_lowest=9801187, id_other=9801187, comment=None)]

{'time': 1062901751.0, 'subject': 'Re: [Python-Dev] Changing select.select to accept iterables', 'body_text': 'Tim Peters wrote:\n\n> [Brett, about <http://www.python.org/sf/798046>]\n> \n> [Guido]\n> \n>>I seem to recall that that code has a long history of being hairy\n>>and full of platform-specific issues, and I\'d rather n

In [71]:
# Look at the ComputedAttrs objects

# Blank
ca_blank = sess.query(sch.ComputedAttrs).get(13883339)
print(ca_blank)

# Not blank
ca_toxic = sess.query(sch.ComputedAttrs).get(13611848)
print(ca_toxic)

ComputedAttrs(id=13883339, super_type=<SuperTypeEnum.computed_attrs: 'computed_attrs'>, attrs={}, batch_id=29, obj_id=9801186)
ComputedAttrs(id=13611848, super_type=<SuperTypeEnum.computed_attrs: 'computed_attrs'>, attrs={'badwords_ex_googleInstantB_any': ['hairy'], 'computed_badwords_googleInstantB_any': 2}, batch_id=29, obj_id=9801187)


In [74]:
commits = sess.query(sch.FusedEntity).where(sch.FusedEntity.type==sch.EntityTypeEnum.git_commit)

# No toxicity
commit = commits[0]
print(commit)
print()
print(commit.fusions)
print()
print(commit.attrs)
print()
print(commit.attrs_sources)
print()
print([elem.attrs for elem in commit.attrs_sources])

print("______________________________________")

# Toxicity present
for commit in commits:
    if commit.attrs_sources[1].attrs:
        print(commit)
        print()
        print(commit.fusions)
        print()
        print(commit.attrs)
        print()
        print(commit.attrs_sources)
        print()
        print([elem.attrs for elem in commit.attrs_sources])
        break

<lagoon.db.schema_fused.FusedEntity 61516: EntityTypeEnum.git_commit f8a2a0b5a9>

[EntityFusion(id_lowest=61516, id_other=61516, comment=None)]

{'time': 1192696460.0, 'message': 'Fix a weird bug in dbtables: if it chose a random rowid string that contained\nNULL bytes it would cause the database all sorts of problems in the future\nleading to very strange random failures and corrupt dbtables.bsdTableDb dbs.\n', 'commit_sha': 'f8a2a0b5a9edc5769b2da40c36c49eed4c5c1b33'}

[<lagoon.db.schema.Entity 61516: EntityTypeEnum.git_commit f8a2a0b5a9>, ComputedAttrs(id=13791559, super_type=<SuperTypeEnum.computed_attrs: 'computed_attrs'>, attrs={}, batch_id=29, obj_id=61516)]

[{'time': 1192696460.0, 'message': 'Fix a weird bug in dbtables: if it chose a random rowid string that contained\nNULL bytes it would cause the database all sorts of problems in the future\nleading to very strange random failures and corrupt dbtables.bsdTableDb dbs.\n', 'commit_sha': 'f8a2a0b5a9edc5769b2da40c36c49eed4c5c1b3

## PEPs

In [None]:
# First check which mailing lists are included in OCEAN
messages = sess.query(sa.func.jsonb_extract_path_text(sch.FusedEntity.attrs,'origin_filename')).where(sch.FusedEntity.type == sch.EntityTypeEnum.message).distinct()
print([message[0] for message in messages.all()])

In [2]:
peps = sess.query(sch.FusedEntity).where(sch.FusedEntity.type==sch.EntityTypeEnum.pep)
peps.count()

570

In [3]:
# Properties of a PEP
pep = peps.first()
for attr in dir(pep):
    if not attr.startswith('__'):
        try:
            print(f"'{attr}': {getattr(pep,attr)}")
        except:
            pass

'_sa_class_manager': <ClassManager of <class 'lagoon.db.schema_fused.FusedEntity'> at 10eac65e0>
'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x141052d30>
'_sa_registry': <sqlalchemy.orm.decl_api.registry object at 0x10dcb9dc0>
'asdict': <bound method DataClassMixin.asdict of <lagoon.db.schema_fused.FusedEntity 12919798: EntityTypeEnum.pep PEP 13>>
'attrs': {'url': 'https://www.python.org/dev/peps/pep-0013', 'type': 'Informational', 'title': 'Python Language Governance', 'number': 13, 'status': 'Active', 'created': '16-Dec-2018', 'replaces': [], 'requires': [], 'superseded_by': []}
'attrs_sources': [<lagoon.db.schema.Entity 12919798: EntityTypeEnum.pep PEP 13>]
'fusions': [EntityFusion(id_lowest=12919798, id_other=12919798, comment=None)]
'id': 12919798
'metadata': MetaData()
'name': PEP 13
'obs_hops': <bound method FusedEntity.obs_hops of <lagoon.db.schema_fused.FusedEntity 12919798: EntityTypeEnum.pep PEP 13>>
'registry': <sqlalchemy.orm.decl_api.registry objec

In [4]:
# How many good/bad/other PEPs are there?
good = 0
bad = 0
other = 0
for pep in peps:
    if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['good'])):
        good += 1
    if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['bad'])):
        bad += 1
    if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['other'])):
        other += 1
print(f'Good = {good}, Bad = {bad}, Other = {other}')

Good = 335, Bad = 203, Other = 32


In [5]:
# How many good/bad/other PEPs are there in the last 5 years and the time before that?
good_old, good_new = 0, 0
bad_old, bad_new = 0, 0
other_old, other_new = 0, 0
for pep in peps:
    if int(pep.attrs['created'][7:11]) < 2016:
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['good'])):
            good_old += 1
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['bad'])):
            bad_old += 1
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['other'])):
            other_old += 1
    elif 2016 <= int(pep.attrs['created'][7:11]) <= 2020:
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['good'])):
            good_new += 1
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['bad'])):
            bad_new += 1
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['other'])):
            other_new += 1
print(f'Beginning of time to end of 2015: Good = {good_old}, Bad = {bad_old}, Other = {other_old}')
print(f'2016-2020: Good = {good_new}, Bad = {bad_new}, Other = {other_new}')

Beginning of time to end of 2015: Good = 236, Bad = 158, Other = 3
2016-2020: Good = 91, Bad = 42, Other = 17


In [63]:
# How many good/bad/other multi-author PEPs are there?
good = 0
bad = 0
other = 0
for pep in peps:
    authors = utils.get_pep_authors(pep)
    if len(authors)>1:
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['good'])):
            good += 1
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['bad'])):
            bad += 1
        if pep.attrs['status'].lower().startswith(tuple(PEP_STATUSES['other'])):
            other += 1
print(f'Good = {good}, Bad = {bad}, Other = {other}')

Good = 104, Bad = 44, Other = 11


## NLP

In [1]:
# Basics
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Explanation of the tokenizer output:
## 'input_ids' are the numerical representations of the tokens
## 'token_type_ids' is all zeros for single sentences, or zeros and ones for pairs of sentences
## 'attention_mask' is all ones except for unimportant tokens like [PAD], for which it is zero

# Deal with a list of sentences
sentences = ["Hi, who are you?", "Wow!! THIS IS so GRT & good :-). A, B, and I are amazed.", "Byee"]
encoded_inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors = 'pt')
print(encoded_inputs)
for single_input_ids in encoded_inputs['input_ids']:
    decoded_input = tokenizer.decode(single_input_ids)
    print(decoded_input)

print("__________________")

# See tokens for a single sentence - it splits punctuation, and splits weird words (including words in caps) it can't understand by pre-pending ##
tokenized_sequence = tokenizer.tokenize("Wow!! THIS IS so GRT & good :-). A, B, and I are amazed.")
print(tokenized_sequence)

tokenized_sequence = tokenizer.tokenize("Hi, who are you? I am Optimus Prime.")
print(tokenized_sequence)

print("__________________")

# Deal with pairs of sentences
encoded_inputs = tokenizer("Hi, who are you?", "I am Optimus Prime.", return_tensors = 'pt')
print(encoded_inputs)
for single_input_ids in encoded_inputs['input_ids']:
    decoded_input = tokenizer.decode(single_input_ids)
    print(decoded_input)

{'input_ids': tensor([[  101,  8790,   117,  1150,  1132,  1128,   136,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101, 11750,   106,   106,   157,  3048,  6258, 19432,  1177,   144,
         10460,   111,  1363,   131,   118,   114,   119,   138,   117,   139,
           117,  1105,   146,  1132, 16603,   119,   102],
        [  101, 17774,  1162,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_ma

In [5]:
# Brad's example
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from torch.nn.functional import softmax
from torch import sigmoid

list_of_comments = ['I hate Python', 'I love Python', 'You are an ABSOLUTE idiot', '']

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
test_encodings = tokenizer(list_of_comments, truncation=True, padding=True, return_tensors = 'pt')
print(test_encodings)

print("_________________")

model = AutoModelForSequenceClassification.from_pretrained(os.path.join(NLP_MODELS_FOLDER,'tox_classifier'), num_labels = 2)
result = model(**test_encodings)
print(result)
result_softmax = softmax(result.logits, dim=-1) # (not toxic, toxic)
print(result_softmax)

print("_________________")

model = AutoModelForSequenceClassification.from_pretrained(os.path.join(NLP_MODELS_FOLDER,'tox_regression'), num_labels=1)
result = model(**test_encodings)
result_sigmoid = sigmoid(result.logits)
print(result_sigmoid) # higher indicates toxicity

{'input_ids': tensor([[  101,   146,  4819, 23334,   102,     0,     0,     0,     0,     0],
        [  101,   146,  1567, 23334,   102,     0,     0,     0,     0,     0],
        [  101,  1192,  1132,  1126, 20066, 13901, 16830,  2036, 10696,   102],
        [  101,   102,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
_________________
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1961, -0.2846],
        [ 3.0440, -4.1388],
        [-2.8806,  2.9183],
        [ 1.7171, -2.0363]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[6.1790e-01, 3.8210e-01],
        [9.9924e-01, 7.5892e-04]

In [8]:
# Pipelines
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I feel like you are doing an acceptable job.")
result

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


[{'label': 'POSITIVE', 'score': 0.9994511008262634}]

In [2]:
# Custom model
from transformers import BertTokenizer, BertForSequenceClassification

inputs = ['here is a comment', 'I love Python', 'weak typing makes me sad']

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors = 'pt')
print(tokenized_inputs)

print("_________________")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
outputs = model(**tokenized_inputs)
print(outputs)

{'input_ids': tensor([[  101,  2182,  2003,  1037,  7615,   102,     0],
        [  101,  1045,  2293, 18750,   102,     0,     0],
        [  101,  5410, 22868,  3084,  2033,  6517,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]])}
_________________


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2683, -0.0348],
        [-0.3979, -0.1225],
        [-0.3856, -0.1495]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Have I Been Pwned

In [2]:
## Get emails in the DB which are None, and those that are neither string nor None
objs = sess.query(sch.Entity).where(sch.Entity.type == sch.EntityTypeEnum.person)
not_str = []
not_str_not_none = []
for obj in objs:
    if type(obj.attrs['email']) != str:
        if obj.attrs['email'] is None:
            not_str.append(obj.id)
        else:
            not_str_not_none.append(obj.id)
print(not_str)
print(not_str_not_none)

[8716868, 8756695, 8757964, 8758490, 8822102, 8860772, 8862943, 8865325, 8882593, 8889408, 8890205, 8932637, 8933240, 8955639, 8980383, 8981958, 9036815, 9058850, 9059340, 9060210, 9060212, 9061248, 9086764, 9092212, 9122017, 9148994, 9177112, 9195200, 9197798, 9220664, 9226749, 9227546, 9228893, 9249343, 9255687, 9257335, 9276650, 9341404, 9349998, 9350637, 9406701, 9406760, 9408095, 9447860, 9483238, 9517818, 9517870, 9546186, 9554304, 9627014, 9627571, 9629918, 9632476, 9633532, 9657695, 9663098, 9666054, 9692409, 9694406, 9695764, 9725733, 9725639, 9729051, 9760565, 9760568, 9805904, 9849074, 9883134, 9884404, 9887444, 9889083, 9891141, 9927053, 9930911, 9933872, 9978572, 9982083, 9981567, 10015487, 10019171, 10021559, 10106657, 10107248, 10112327, 10113121, 10150490, 10153221, 10193197, 10234926, 10236603, 10237774, 10289357, 10292594, 10391350, 10447722, 10502240, 10545352, 10545671, 10549182, 10551719, 10590470, 10591448, 10649776, 10650446, 10709895, 10711066, 10712690, 1077092