In [1]:
from abc import ABC, abstractmethod
import os
from dotenv import load_dotenv
from sqlalchemy.orm import scoped_session, Session
import sys
sys.path.append('..')
from src.repo.orm import OpenPool, PrayerRequestORM
import tqdm
# Load variables from a .env file (if any) before the environment is read.
load_dotenv()

# Database URL for the prayers Postgres instance; None when the variable is unset.
pg_uri = os.getenv('PRAYERS_PG_DATABASE_URL')

# Session factory used by every backfill driver in this notebook.
pool: scoped_session[Session] = OpenPool(pg_uri)

class Backfill(ABC):
    """Strategy interface for updating existing prayer-request rows.

    ``PrayerRequestBackfill`` calls ``filter`` once to select rows and then
    ``update_fields`` once per row, passing the open session as the second
    positional argument.
    """

    @abstractmethod
    def filter(self, session: Session)->list[PrayerRequestORM]:
        """Return the rows this backfill should process.

        Args:
            session: Open session to run the query with.
        """
        pass

    @abstractmethod
    def update_fields(self, ormRequest: PrayerRequestORM, session: "Session | None" = None):
        """Apply the backfill to a single row.

        Args:
            ormRequest: One of the rows returned by ``filter``.
            session: Session the driver is running with. It is part of the
                interface because the driver always passes it; the default
                keeps one-argument calls working.
        """
        pass

class NewFill(ABC):
    """Strategy interface for inserting new rows from an external data set.

    ``InsertFill`` calls ``filter`` once and then ``insert`` for every
    ``(index, row)`` pair produced by ``iterrows()`` on the returned object.
    """

    @abstractmethod
    def filter(self, session: Session)->list:
        """Load and return the items to insert.

        NOTE(review): the annotation says ``list``, but ``InsertFill`` calls
        ``.iterrows()`` on the result, so implementations must return an object
        that provides it (the ones in this notebook return pandas DataFrames).

        Args:
            session: Open session, available for lookups while preparing data.
        """
        pass

    @abstractmethod
    def insert(self, session: Session, item, i: "int | None" = None):
        """Add the rows derived from one item to the session.

        Args:
            session: Open session to add new ORM objects to.
            item: One row yielded by ``iterrows()``.
            i: Index label of ``item`` as yielded by ``iterrows()``. It is part
                of the interface because the driver always passes it; the
                default keeps two-argument calls working.
        """
        pass

def PrayerRequestBackfill(backfill: Backfill):
    """Run ``backfill`` over the prayer request table with a tqdm progress bar.

    The rows come from ``backfill.filter``; each one is handed to
    ``backfill.update_fields`` together with the open session. Nothing is
    committed until every row has been processed.

    Args:
        backfill: Strategy that selects the rows and updates each of them.
    """
    with pool() as db:
        pending = backfill.filter(db)
        progress = tqdm.tqdm(pending)
        for row in progress:
            backfill.update_fields(row, db)
        # Single commit for the whole run, then release the session.
        db.commit()
        db.close()

In [2]:
from src.models.models import BibleEmbeddings, ClassifierModels, EmbeddingResult, Embeddings
from src.repo.prayerRequests import PrayerRequestRepoImpl
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by PrayerTopicsBackfill.update_fields to encode request text.
gteBase = SentenceTransformer('thenlper/gte-base')

embedding_model = Embeddings()
# bible_model = BibleEmbeddings()
# NOTE(review): classifier_models is not referenced again in the visible cells
# (the sentiment cell builds its own `classifiers`) — confirm whether it is still needed.
classifier_models = ClassifierModels()
# Repository used for the topic rebuild; its third constructor argument is passed as None here.
repo = PrayerRequestRepoImpl(pool, embedding_model, None)

class PrayerTopicsBackfill(Backfill):
    """Rebuild the stored prayer topics of every prayer request."""

    def filter(self, session: Session)->list[PrayerRequestORM]:
        """Select all prayer requests, without any filtering."""
        every_request = session.query(PrayerRequestORM)
        return every_request.all()

    def update_fields(self, ormRequest: PrayerRequestORM, session: Session):
        """Encode the request text with gte-base and rebuild its topics.

        Args:
            ormRequest: Request whose topics are rebuilt.
            session: Session passed through to the repository call.
        """
        # Only the first EmbeddingResult argument is supplied; the second is None.
        embedding = EmbeddingResult(gteBase.encode(ormRequest.request), None)
        repo._rebuild_prayer_topics(session, ormRequest.id, embedding)

# Rebuild topics for every prayer request; the driver commits once after the loop finishes.
PrayerRequestBackfill(PrayerTopicsBackfill())
        

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 900/900 [03:19<00:0

In [3]:
from src.models.models import ClassifierModels

# Classifier bundle used by BackfillSentiment.update_fields.
# NOTE(review): the previous cell also constructs ClassifierModels() as `classifier_models`;
# this builds a second instance — confirm whether one can be reused.
classifiers = ClassifierModels()

class BackfillSentiment(Backfill):
    """Fill in sentiment and emotion labels for requests that have no sentiment yet."""

    def filter(self, session: Session)->list[PrayerRequestORM]:
        """Return the prayer requests whose ``sentiment_analysis`` is NULL."""
        # .is_(None) is the explicit IS NULL operator; it renders the same SQL
        # as `== None` without tripping linters on the comparison to None.
        return session.query(PrayerRequestORM).filter(
            PrayerRequestORM.sentiment_analysis.is_(None)).all()

    def update_fields(self, ormRequest: PrayerRequestORM, session: "Session | None" = None):
        """Classify the request text and store the resulting labels on the row.

        Args:
            ormRequest: Request to classify; it is modified in place.
            session: Accepted because ``PrayerRequestBackfill`` passes the open
                session as a second positional argument. It is not used here:
                the attribute changes are persisted by the driver's commit.
        """
        result = classifiers.classify(ormRequest.request)
        ormRequest.sentiment_analysis = result['sentiment']
        ormRequest.emotion_roberta = result['emotion']

backfiller = BackfillSentiment()
# The run itself is commented out; uncomment the next line to execute the sentiment backfill.
# PrayerRequestBackfill(backfiller)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 778/778 [03:53<00:00,  3.33it/s]


In [7]:
import pandas as pd
import numpy as np
from src.repo.orm import BibleTopicORM, TopicORM, BibleTopicsORM
from sentence_transformers import SentenceTransformer

# Embedding model used by TagsFill and BibleTagsFill below.
# NOTE(review): an earlier cell already loads the same 'thenlper/gte-base' model
# into `gteBase`; this line loads it again.
gteBase = SentenceTransformer('thenlper/gte-base')

def InsertFill(backfill: NewFill):
    """Insert every item produced by ``backfill`` and commit once at the end.

    ``backfill.filter`` must return an object with ``iterrows()`` and ``len()``
    (the implementations in this notebook return pandas DataFrames). Each row is
    passed to ``backfill.insert`` together with its index label.

    Args:
        backfill: Strategy that loads the items and inserts each of them.
    """
    with pool() as session:
        results = backfill.filter(session)
        # iterrows() is a generator without a length, so tqdm is given the row
        # count explicitly; otherwise it can only show a running counter instead
        # of a percentage and an ETA.
        # `i` is the index label of the row, not its position; the two coincide
        # only for a default RangeIndex.
        for i, item in tqdm.tqdm(results.iterrows(), total=len(results)):
            backfill.insert(session, item, i)
        session.commit()
        session.close()

class TagsFill(NewFill):
    """Insert one TopicORM row, with its gte-base embedding, per row of topics.csv."""

    def __init__(self):
        self.i = 0

    def filter(self, session: Session)->list:
        """Read the topics CSV and pre-compute the embeddings of its 'topic' column.

        The embeddings are kept on ``self.embeddings`` for ``insert`` to use.
        The loaded DataFrame is returned.
        """
        topics_frame = pd.read_csv('../src/scraped_data/topics.csv')
        topic_names = topics_frame['topic']
        self.embeddings = gteBase.encode(topic_names, show_progress_bar=True)
        return topics_frame

    def insert(self, session: Session, item: pd.Series, i: int):
        """Add a TopicORM for ``item``, using the embedding stored at index ``i``."""
        session.add(
            TopicORM(
                name=item['topic'],
                gte_base_embedding=self.embeddings[i],
            )
        )

class BibleTagsFill(NewFill):
    """Insert NIV verses as BibleTopicORM rows and link each one to its topics."""

    def filter(self, session: Session)->list:
        """Load NIV_tags.csv, normalise its columns and prepare lookup data.

        Side effects: sets ``self.topic_ids`` (topic name -> id, read from the
        TopicORM table) and ``self.embeddings`` (gte-base encodings of the
        'verse_text' column). Returns the prepared DataFrame.
        """
        niv_topics = pd.read_csv('../src/scraped_data/NIV_tags.csv')
        # NOTE(review): eval() executes whatever expression a CSV cell contains.
        # That is acceptable only while the file is trusted; if the cells are
        # plain list literals, ast.literal_eval would be the safe equivalent.
        niv_topics['verse_numbers'] = niv_topics['verse_numbers'].apply(lambda x: eval(x))
        niv_topics['verse_start'] = niv_topics['verse_numbers'].apply(lambda x: x[0])
        niv_topics['verse_end'] = niv_topics['verse_numbers'].apply(lambda x: x[1] if len(x) > 1 else None)
        # Missing verse ends come out as NaN in the column; store them as None instead.
        niv_topics["verse_end"] = niv_topics["verse_end"].replace({np.nan: None})
        niv_topics['tags'] = niv_topics['tags'].apply(lambda x: eval(x))

        self.topic_ids = {topic.name: topic.id for topic in session.query(TopicORM).all()}
        self.embeddings = gteBase.encode(niv_topics['verse_text'], show_progress_bar=True)
        return niv_topics

    def insert(self, session: Session, item: pd.Series, i:int):
        """Add one verse and its topic links to the session without committing.

        Args:
            session: Open session; the caller is responsible for committing.
            item: Row of the DataFrame returned by ``filter``.
            i: Index of the row, used to pick its embedding.

        Raises:
            KeyError: If one of the row's tags has no entry in ``self.topic_ids``.
        """
        bibleTopic = BibleTopicORM(
            book = item['book'],
            chapter = item['chapter'],
            verse_start = item['verse_start'],
            verse_end = item['verse_end'],
            content = item['verse_text'],
            gte_base_embedding = self.embeddings[i]
        )
        session.add(bibleTopic)
        # flush() emits the INSERT so bibleTopic.id is assigned, but leaves the
        # transaction open. Committing here instead would persist the verse
        # before its topic links exist and cost one commit per row.
        session.flush()
        for tag in item['tags']:
            session.add(
                BibleTopicsORM(
                    topic_id = self.topic_ids[tag],
                    bible_topic_id = bibleTopic.id
                )
            )
        

In [3]:
# Insert the topic rows. This has to run before the BibleTagsFill cell, whose
# filter() builds its name -> id map from the TopicORM table.
topicBackfiller = TagsFill()
InsertFill(topicBackfiller)

Batches: 100%|██████████| 5/5 [00:01<00:00,  2.91it/s]
159it [00:00, 3526.97it/s]


In [8]:
# Insert the NIV verses and their topic links; relies on the topic rows already being in the database.
bibleTagsBackfill = BibleTagsFill()
InsertFill(bibleTagsBackfill)

Batches: 100%|██████████| 45/45 [01:55<00:00,  2.56s/it]
1419it [00:23, 60.32it/s]
