In [1]:
from sqlalchemy.orm import sessionmaker

In [2]:
# Establish MariaDB connection
from db.mariadb_connector import engine as mariadb_engine

Session = sessionmaker(bind=mariadb_engine)

INFO:db.mariadb_connector:Connecting to MariaDB at localhost:2001 with user mariadb-user
INFO:db.mariadb_connector:Connection to MariaDB successful! Databases: ['igcl', 'information_schema']
INFO:db.mariadb_connector:SQLAlchemy engine for MariaDB created successfully.


In [3]:
# Create Session
session = Session()

In [4]:
from models.base import Base
from models.author import Author
from models.division import Division
from models.faculty import Faculty
from models.institute import Institute
from models.sdg_prediction import SDGPrediction
from models.sdg_label import SDGLabel
from models.sdg_label_history import SDGLabelHistory
from models.sdg_label_decision import SDGLabelDecision
from models.sdg_user_label import SDGUserLabel
from models.dim_red import DimRed
from models.publication import Publication

In [15]:
publications_mariadb = session.query(Publication).all()

[<Publication(publication_id=1, title=HEE-GER: a systematic review of German economic evaluations of health care published 1990-2004, oai_identifier=oai:www.zora.uzh.ch:1, year=2007, publisher=BioMed Central)>,
 <Publication(publication_id=2, title=The economics of primary prevention of cardiovascular disease - a systematic review of economic evaluations, oai_identifier=oai:www.zora.uzh.ch:2, year=2007, publisher=BioMed Central)>,
 <Publication(publication_id=3, title=Comparison of two chromogenic media and evaluation of two molecular based identification systems for Enterobacter sakazakii detection, oai_identifier=oai:www.zora.uzh.ch:3, year=2006, publisher=BioMed Central)>,
 <Publication(publication_id=4, title=Serotypes, intimin variants and other virulence factors of eae positive Escherichia coli strains isolated from healthy cattle in Switzerland. Identification of a new intimin variant gene (eae-η2), oai_identifier=oai:www.zora.uzh.ch:4, year=2005, publisher=BioMed Central)>,
 <P

In [None]:
publication_abstracts = [publication.payload['title'] + publication.payload['description'] for publication in publications]
print(len(publication_abstracts))

In [6]:
from db.qdrantdb_connector import client as qdrantdb_client

INFO:db.qdrantdb_connector:Connecting to Qdrant at localhost:2003
INFO:httpx:HTTP Request: GET http://localhost:2003/collections "HTTP/1.1 200 OK"
INFO:db.qdrantdb_connector:Connection to Qdrant successful! Collections: [CollectionDescription(name='publications-sdg-scout')]


In [10]:
result = qdrantdb_client.scroll(
    collection_name="publications-sdg-scout",
    with_payload=['id'],
    with_vectors=False,
    limit=200000,
)

INFO:httpx:HTTP Request: POST http://localhost:2003/collections/publications-sdg-scout/points/scroll "HTTP/1.1 200 OK"


In [11]:
publications = result[0]

In [12]:
publications

[Record(id=1, payload={'id': 'oai:www.zora.uzh.ch:1'}, vector=None, shard_key=None, order_value=None),
 Record(id=2, payload={'id': 'oai:www.zora.uzh.ch:2'}, vector=None, shard_key=None, order_value=None),
 Record(id=3, payload={'id': 'oai:www.zora.uzh.ch:3'}, vector=None, shard_key=None, order_value=None),
 Record(id=4, payload={'id': 'oai:www.zora.uzh.ch:4'}, vector=None, shard_key=None, order_value=None),
 Record(id=5, payload={'id': 'oai:www.zora.uzh.ch:5'}, vector=None, shard_key=None, order_value=None),
 Record(id=6, payload={'id': 'oai:www.zora.uzh.ch:6'}, vector=None, shard_key=None, order_value=None),
 Record(id=7, payload={'id': 'oai:www.zora.uzh.ch:7'}, vector=None, shard_key=None, order_value=None),
 Record(id=8, payload={'id': 'oai:www.zora.uzh.ch:8'}, vector=None, shard_key=None, order_value=None),
 Record(id=9, payload={'id': 'oai:www.zora.uzh.ch:9'}, vector=None, shard_key=None, order_value=None),
 Record(id=10, payload={'id': 'oai:www.zora.uzh.ch:10'}, vector=None, sha

In [13]:
publication_ids = [publication.payload['id'] for publication in publications]

In [14]:
len(publication_ids)

107251

In [21]:
publication_ids

['oai:www.zora.uzh.ch:1',
 'oai:www.zora.uzh.ch:2',
 'oai:www.zora.uzh.ch:3',
 'oai:www.zora.uzh.ch:4',
 'oai:www.zora.uzh.ch:5',
 'oai:www.zora.uzh.ch:6',
 'oai:www.zora.uzh.ch:7',
 'oai:www.zora.uzh.ch:8',
 'oai:www.zora.uzh.ch:9',
 'oai:www.zora.uzh.ch:10',
 'oai:www.zora.uzh.ch:11',
 'oai:www.zora.uzh.ch:12',
 'oai:www.zora.uzh.ch:13',
 'oai:www.zora.uzh.ch:14',
 'oai:www.zora.uzh.ch:15',
 'oai:www.zora.uzh.ch:16',
 'oai:www.zora.uzh.ch:17',
 'oai:www.zora.uzh.ch:18',
 'oai:www.zora.uzh.ch:19',
 'oai:www.zora.uzh.ch:20',
 'oai:www.zora.uzh.ch:21',
 'oai:www.zora.uzh.ch:22',
 'oai:www.zora.uzh.ch:23',
 'oai:www.zora.uzh.ch:24',
 'oai:www.zora.uzh.ch:25',
 'oai:www.zora.uzh.ch:26',
 'oai:www.zora.uzh.ch:27',
 'oai:www.zora.uzh.ch:28',
 'oai:www.zora.uzh.ch:29',
 'oai:www.zora.uzh.ch:30',
 'oai:www.zora.uzh.ch:31',
 'oai:www.zora.uzh.ch:32',
 'oai:www.zora.uzh.ch:33',
 'oai:www.zora.uzh.ch:35',
 'oai:www.zora.uzh.ch:39',
 'oai:www.zora.uzh.ch:40',
 'oai:www.zora.uzh.ch:41',
 'oai:www.

In [19]:
publication_ids_mariadb = [publication.oai_identifier for publication in publications_mariadb]

In [20]:
publication_ids_mariadb

['oai:www.zora.uzh.ch:1',
 'oai:www.zora.uzh.ch:2',
 'oai:www.zora.uzh.ch:3',
 'oai:www.zora.uzh.ch:4',
 'oai:www.zora.uzh.ch:5',
 'oai:www.zora.uzh.ch:6',
 'oai:www.zora.uzh.ch:7',
 'oai:www.zora.uzh.ch:8',
 'oai:www.zora.uzh.ch:9',
 'oai:www.zora.uzh.ch:10',
 'oai:www.zora.uzh.ch:11',
 'oai:www.zora.uzh.ch:12',
 'oai:www.zora.uzh.ch:13',
 'oai:www.zora.uzh.ch:14',
 'oai:www.zora.uzh.ch:15',
 'oai:www.zora.uzh.ch:16',
 'oai:www.zora.uzh.ch:17',
 'oai:www.zora.uzh.ch:18',
 'oai:www.zora.uzh.ch:19',
 'oai:www.zora.uzh.ch:20',
 'oai:www.zora.uzh.ch:21',
 'oai:www.zora.uzh.ch:22',
 'oai:www.zora.uzh.ch:23',
 'oai:www.zora.uzh.ch:24',
 'oai:www.zora.uzh.ch:25',
 'oai:www.zora.uzh.ch:26',
 'oai:www.zora.uzh.ch:27',
 'oai:www.zora.uzh.ch:28',
 'oai:www.zora.uzh.ch:29',
 'oai:www.zora.uzh.ch:30',
 'oai:www.zora.uzh.ch:31',
 'oai:www.zora.uzh.ch:32',
 'oai:www.zora.uzh.ch:33',
 'oai:www.zora.uzh.ch:35',
 'oai:www.zora.uzh.ch:39',
 'oai:www.zora.uzh.ch:40',
 'oai:www.zora.uzh.ch:41',
 'oai:www.

In [22]:
set1 = set(publication_ids)
set2 = set(publication_ids_mariadb)

missing = list(sorted(set1 - set2))
added = list(sorted(set2 - set1))

In [27]:
print(f'missing: {len(missing)}')

missing: 53371


In [28]:
missing

['oai:www.zora.uzh.ch:100',
 'oai:www.zora.uzh.ch:1000',
 'oai:www.zora.uzh.ch:100000',
 'oai:www.zora.uzh.ch:100001',
 'oai:www.zora.uzh.ch:100057',
 'oai:www.zora.uzh.ch:100058',
 'oai:www.zora.uzh.ch:100060',
 'oai:www.zora.uzh.ch:100061',
 'oai:www.zora.uzh.ch:100063',
 'oai:www.zora.uzh.ch:100064',
 'oai:www.zora.uzh.ch:100065',
 'oai:www.zora.uzh.ch:100067',
 'oai:www.zora.uzh.ch:100070',
 'oai:www.zora.uzh.ch:100073',
 'oai:www.zora.uzh.ch:100078',
 'oai:www.zora.uzh.ch:100092',
 'oai:www.zora.uzh.ch:100095',
 'oai:www.zora.uzh.ch:100096',
 'oai:www.zora.uzh.ch:1001',
 'oai:www.zora.uzh.ch:100101',
 'oai:www.zora.uzh.ch:100110',
 'oai:www.zora.uzh.ch:100180',
 'oai:www.zora.uzh.ch:100182',
 'oai:www.zora.uzh.ch:100183',
 'oai:www.zora.uzh.ch:100184',
 'oai:www.zora.uzh.ch:100185',
 'oai:www.zora.uzh.ch:100186',
 'oai:www.zora.uzh.ch:100188',
 'oai:www.zora.uzh.ch:100189',
 'oai:www.zora.uzh.ch:100190',
 'oai:www.zora.uzh.ch:100191',
 'oai:www.zora.uzh.ch:100192',
 'oai:www.zora.

In [24]:
print('added:', added)

added: ['oai:www.zora.uzh.ch:100150', 'oai:www.zora.uzh.ch:101319', 'oai:www.zora.uzh.ch:10178', 'oai:www.zora.uzh.ch:102287', 'oai:www.zora.uzh.ch:103236', 'oai:www.zora.uzh.ch:103237', 'oai:www.zora.uzh.ch:103252', 'oai:www.zora.uzh.ch:104079', 'oai:www.zora.uzh.ch:104815', 'oai:www.zora.uzh.ch:105456', 'oai:www.zora.uzh.ch:109898', 'oai:www.zora.uzh.ch:110185', 'oai:www.zora.uzh.ch:110938', 'oai:www.zora.uzh.ch:110952', 'oai:www.zora.uzh.ch:111076', 'oai:www.zora.uzh.ch:113946', 'oai:www.zora.uzh.ch:113947', 'oai:www.zora.uzh.ch:115922', 'oai:www.zora.uzh.ch:121583', 'oai:www.zora.uzh.ch:122467', 'oai:www.zora.uzh.ch:122468', 'oai:www.zora.uzh.ch:127783', 'oai:www.zora.uzh.ch:127904', 'oai:www.zora.uzh.ch:128401', 'oai:www.zora.uzh.ch:128703', 'oai:www.zora.uzh.ch:128968', 'oai:www.zora.uzh.ch:129207', 'oai:www.zora.uzh.ch:129219', 'oai:www.zora.uzh.ch:131322', 'oai:www.zora.uzh.ch:132556', 'oai:www.zora.uzh.ch:134448', 'oai:www.zora.uzh.ch:135683', 'oai:www.zora.uzh.ch:135944', 'oa

In [29]:
print(f'added: {len(added)}')

added: 4675


In [30]:
added

['oai:www.zora.uzh.ch:100150',
 'oai:www.zora.uzh.ch:101319',
 'oai:www.zora.uzh.ch:10178',
 'oai:www.zora.uzh.ch:102287',
 'oai:www.zora.uzh.ch:103236',
 'oai:www.zora.uzh.ch:103237',
 'oai:www.zora.uzh.ch:103252',
 'oai:www.zora.uzh.ch:104079',
 'oai:www.zora.uzh.ch:104815',
 'oai:www.zora.uzh.ch:105456',
 'oai:www.zora.uzh.ch:109898',
 'oai:www.zora.uzh.ch:110185',
 'oai:www.zora.uzh.ch:110938',
 'oai:www.zora.uzh.ch:110952',
 'oai:www.zora.uzh.ch:111076',
 'oai:www.zora.uzh.ch:113946',
 'oai:www.zora.uzh.ch:113947',
 'oai:www.zora.uzh.ch:115922',
 'oai:www.zora.uzh.ch:121583',
 'oai:www.zora.uzh.ch:122467',
 'oai:www.zora.uzh.ch:122468',
 'oai:www.zora.uzh.ch:127783',
 'oai:www.zora.uzh.ch:127904',
 'oai:www.zora.uzh.ch:128401',
 'oai:www.zora.uzh.ch:128703',
 'oai:www.zora.uzh.ch:128968',
 'oai:www.zora.uzh.ch:129207',
 'oai:www.zora.uzh.ch:129219',
 'oai:www.zora.uzh.ch:131322',
 'oai:www.zora.uzh.ch:132556',
 'oai:www.zora.uzh.ch:134448',
 'oai:www.zora.uzh.ch:135683',
 'oai:www