## Load data into memory

In [1]:
# Notebook setup: make the project package importable, load environment
# configuration, and open the database pool used by the later cells.
from dotenv import load_dotenv
import sys
# Add the parent directory to the module search path; the `src.*` imports
# below presumably rely on this when the notebook runs from a subfolder.
sys.path.append('..')
from src.repo.orm import OpenPool, PrayerRequestORM
from src.dto.prayerRequests import PrayerRequest
# Load variables from a .env file (python-dotenv) before the environment is
# read further down.
load_dotenv()
import os
import pandas as pd
# `.get` yields None when the variable is unset, in which case OpenPool
# receives None.
pg_uri = os.environ.get('PRAYERS_PG_DATABASE_URL')
# NOTE(review): OpenPool is project code; a later cell calls `pool()` as a
# context manager that yields a session -- confirm against src.repo.orm.
pool = OpenPool(pg_uri)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datetime import datetime, timezone, UTC
from sklearn.preprocessing import StandardScaler

class FeatureBuilder:
    def __init__(self):
        pass

    def build_preprocess(self, data: list[dict])->pd.DataFrame:
        today = datetime.now(UTC)
        for obj in data:
            obj['dayLength'] = (today - obj['created_at']).days
        return pd.DataFrame(data)

    def build_postprocess(self, data: np.ndarray):
        pass


class Preprocessor:
    """Standardize the numeric feature columns and split a frame into arrays.

    The frame is expected to carry an ``embedding`` column plus every column
    named in ``self.columns``.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # Numeric columns that are passed through the scaler.
        self.columns = ["dayLength"]

    def fit_transform(self, data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
        """Fit the scaler on ``data`` and return the transformed arrays.

        Args:
            data: Frame holding ``embedding`` and the columns in
                ``self.columns``.

        Returns:
            The same tuple as :meth:`transform`.
        """
        self.fit(data)
        return self.transform(data)

    def fit(self, df: pd.DataFrame):
        """Fit the scaler on the configured numeric columns of ``df``."""
        self.scaler.fit(df[self.columns])

    def transform(self, df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
        """Return the embeddings and the scaled numeric features of ``df``.

        Args:
            df: Frame holding ``embedding`` and the columns in
                ``self.columns``. It is not modified.

        Returns:
            ``(embeddings, scaled)`` where ``embeddings`` is the ``embedding``
            column as an array and ``scaled`` is the scaler output for
            ``self.columns``. With a single configured column ``scaled`` is
            flattened to one value per row; with several columns it stays
            two-dimensional (rows x columns).
        """
        # Return the scaler's output rather than the raw column, otherwise
        # fitting the scaler has no effect on the features.
        scaled = np.asarray(self.scaler.transform(df[self.columns]))
        if scaled.ndim == 2 and scaled.shape[1] == 1:
            # One feature column: hand back a flat vector, one value per row.
            scaled = scaled[:, 0]
        return df["embedding"].to_numpy(), scaled


In [3]:
# Fetch every stored prayer request and flatten the ORM rows into plain dicts.
all_requests = []
with pool() as session:
    prayer_requests = session.query(PrayerRequestORM).all()
    # Row attributes are read while the session is still open.
    all_requests.extend(
        {
            "prayer_request": row.request,
            "embedding": row.gte_base_embedding,
            "link_id": row.link_id,
            "created_at": row.created_at,
            "contact_id": row.contact_id,
        }
        for row in prayer_requests
    )

# Tabular view of the same records; the trailing expression previews two rows.
df = pd.DataFrame(all_requests)
df.head(2)

Unnamed: 0,prayer_request,embedding,link_id,created_at,contact_id
0,Mom is thinking she wants to move to Tennessee...,"[-0.009760048, -0.005248226, 0.0009992049, -0....",182.0,2024-05-20 23:19:10.768122+00:00,29
1,Spanish ministry has been twice a month gather...,"[-0.014935532, -0.032861583, 0.012076679, 0.00...",175.0,2024-04-15 23:14:53.639172+00:00,28


In [12]:

# Build the features for every request, then bucket them by contact.
feature_builder = FeatureBuilder()
preprocessor = Preprocessor()
preprocess_features = feature_builder.build_preprocess(all_requests)
preprocessed = preprocessor.fit_transform(preprocess_features)

# contact_id -> list of (embedding, day-length feature) tuples, kept in the
# same order as all_requests.
contact_groups = {}
embeddings, day_lengths = preprocessed
for record, embedding, day_length in zip(all_requests, embeddings, day_lengths):
    contact_groups.setdefault(record['contact_id'], []).append((embedding, day_length))

In [16]:
from itertools import combinations


# For each contact, list every subset of its entries that leaves exactly one
# entry out (subset size is len(group) - 1).
# NOTE(review): the variable name suggests pairs (size 2), which only matches
# groups of three entries -- confirm the intended subset size.
paired_components = [
    list(combinations(members, len(members) - 1))
    for members in contact_groups.values()
]