## Load data into memory

In [130]:
# Bootstrap cell: make the project package importable, load environment
# variables, and open the database pool used by the cells below.
from dotenv import load_dotenv
import sys
# Add the parent directory to the import path before importing from `src`
# (presumably the notebook lives one level below the repo root -- confirm).
sys.path.append('..')
from src.repo.orm import OpenPool, PrayerRequestORM
from src.dto.prayerRequests import PrayerRequest
# Must run before the os.environ lookup below so variables defined in a
# .env file are visible to it.
load_dotenv()
import os
import pandas as pd
# Database connection string; this is None when the variable is not set.
# NOTE(review): presumably a Postgres URL, judging by the variable name.
pg_uri = os.environ.get('PRAYERS_PG_DATABASE_URL')
# `pool` is later called as `with pool() as session:` to obtain a session.
pool = OpenPool(pg_uri)

In [131]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import numpy as np
from datetime import datetime, UTC
from sklearn.preprocessing import StandardScaler

class FeatureBuilder:
    def __init__(self):
        pass

    def build_preprocess(self, data: list[dict])->pd.DataFrame:
        today = datetime.now(UTC)
        for obj in data:
            obj['dayLength'] = (today - obj['created_at']).days
        return pd.DataFrame(data)

    def build_postprocess(self, paired_components: np.ndarray, use_day_feature = True, use_similarity_feature = True)->np.ndarray:
        processed = []
        for contact_group in paired_components:
            for pairs in contact_group:
                left_pair = pairs[0]
                right_pair = pairs[1]
                left_vector: np.ndarray = left_pair[0]
                right_vector: np.ndarray = right_pair[0]
                left_days = left_pair[1]
                right_days = right_pair[1]
                y = False
                if left_pair[2] == right_pair[2] and not np.isnan(left_pair[2]):
                    y = True
                similarity = cosine(left_vector, right_vector)
                averaged_vector = (left_vector + right_vector) / 2.0

                extra_features = [y]
                if use_day_feature:
                    extra_features.insert(0, right_days)
                    extra_features.insert(0, left_days)
                if use_similarity_feature:
                    extra_features.insert(0, similarity)
                new_feature_set = np.concatenate([averaged_vector, extra_features])
                processed.append(new_feature_set)

        return np.array(processed)
                

class Preprocessor:
    """Standardise the numeric request columns and split a frame into arrays."""

    def __init__(self):
        self.scaler = StandardScaler()
        # Columns that are passed through the scaler.
        self.columns = ["dayLength"]

    def fit_transform(self, data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Fit the scaler on ``data`` and return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)

    def fit(self, df: pd.DataFrame) -> None:
        """Fit the scaler on the configured columns of ``df``."""
        self.scaler.fit(df[self.columns])

    def transform(self, df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Split ``df`` into ``(embeddings, scaled day lengths, link ids)``.

        Args:
            df: Frame with ``embedding``, ``dayLength`` and ``link_id`` columns.

        Returns:
            Three arrays with one entry per row of ``df``, in row order. The
            day lengths are the scaler's output, not the raw column.
        """
        # The scaler returns a new array rather than modifying `df`, so its
        # result has to be captured; previously it was computed and dropped.
        scaled = self.scaler.transform(df[self.columns])
        day_length = scaled[:, self.columns.index("dayLength")]
        return df["embedding"].to_numpy(), day_length, df["link_id"].to_numpy()


In [132]:
# Fetch every stored prayer request and keep only the fields this notebook
# needs, as plain dicts.
with pool() as session:
    prayer_requests = session.query(PrayerRequestORM).all()
    all_requests = [
        {
            "prayer_request": row.request,
            "embedding": row.gte_base_embedding,
            "link_id": row.link_id,
            "created_at": row.created_at,
            "contact_id": row.contact_id,
        }
        for row in prayer_requests
    ]

# Tabular view for a quick visual check of the first rows.
df = pd.DataFrame(all_requests)
df.head(2)

Unnamed: 0,prayer_request,embedding,link_id,created_at,contact_id
0,Mom is thinking she wants to move to Tennessee...,"[-0.009760048, -0.005248226, 0.0009992049, -0....",182.0,2024-05-20 23:19:10.768122+00:00,29
1,Spanish ministry has been twice a month gather...,"[-0.014935532, -0.032861583, 0.012076679, 0.00...",175.0,2024-04-15 23:14:53.639172+00:00,28


In [133]:

# Build per-request features, then bucket them by the contact that owns
# each request.
feature_builder = FeatureBuilder()
preprocessor = Preprocessor()
preprocess_features = feature_builder.build_preprocess(all_requests)
preprocessed = preprocessor.fit_transform(preprocess_features)

# `preprocessed` is (embeddings, day lengths, link ids); the rows line up
# with `all_requests`, so the four sequences are walked in lockstep.
contact_groups = {}
for record, embedding, day_length, link_id in zip(
    all_requests, preprocessed[0], preprocessed[1], preprocessed[2]
):
    members = contact_groups.setdefault(record['contact_id'], [])
    members.append((embedding, day_length, link_id))

In [134]:
from itertools import combinations


# For every contact, enumerate all unordered pairs of that contact's requests.
paired_components = [
    list(combinations(members, 2)) for members in contact_groups.values()
]

In [147]:
# One feature row per request pair; the last column of each row is the label.
final_feature_set = feature_builder.build_postprocess(
    paired_components,
    use_day_feature=True,
    use_similarity_feature=True,
)
# Number of pairs produced.
len(final_feature_set)

25665

In [148]:
from sklearn.model_selection import train_test_split

# The last column of each row is the pair label; everything before it is
# the feature vector.
X = final_feature_set[:, :-1]
y = final_feature_set[:, -1]

# Stratify so both splits keep the same label ratio, and seed the split
# (same seed as the model below) so the reported scores can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [149]:
from lightgbm import LGBMClassifier

# Fit a gradient-boosted classifier on the training pairs and report the
# score on both splits.
model = LGBMClassifier(learning_rate=0.01, random_state=42)
model.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
print(train_accuracy)
# Displayed as the cell result: score on the held-out pairs.
model.score(X_test, y_test)

0.9908435612702123


0.9881161114358075

In [150]:
# The scores look suspicious, and so do the feature importances ("weights")
# shown below. This seems like a classic case of overfitting.
# NOTE(review): the train and test scores above are almost identical
# (0.9908 vs 0.9881), which is not the usual overfitting signature. It may be
# worth ruling out label imbalance in `y`, or pairs built from the same
# request landing on both sides of the split -- TODO confirm.
# Would like to experiment in production.
model.feature_importances_

array([  1,   1,   2,   2,   3,   6,   5,   0,   6,  15,   0,   3,   2,
         4,   1,   0,   1,   1,   4,   0,   4,   2,   0,   0,   0,   4,
         1,   0,   2,   4,   0,   4,   0,   3,  48,   3,  19,   0,  32,
         0,   2,   2,   4,   0,   1,   0,   3,   1,   0,   1,   2,   0,
         1,   0,   6,   0,   2,   3,   1,   2,   9,   1,  24,   0,   9,
         7,   0,   9,  12,   1,   0,   3,   5,   2,   1,   1,   8,   1,
         0,   0,  36,   1,   3,   0,   3,   2,   4,   3,   1,   4,  13,
         1,   0,   0,   7,  19,   2,   6,   3,   3,   2,   1,   0,   7,
         0,   1,   5,   5,   0,   0,   3,   0,   0,   5,   0,   4,   1,
         0,   3,   0,   1,   3,   3,   8,   4,   6,   5,   1,   2,   0,
         1,   3,   2,  16,   6,   3,   0,   1,   1,   1,   1,   3,   1,
         0,   4,   2,   3,   2,   1,   0,   8,   0,   0,   0,   3,   2,
         0,   2,   3,   1,   0,   1,   5,   1,   6,   0,   0,   0,   1,
         1,   0,  15,  10,   9,   2,   6,  15,   2,   2,   2,  1