In [1]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [2]:
import re
import logging
import pandas as pd
import numpy as np
from datetime import datetime
from general_functions.conncet_s3 import S3Connection
from general_functions.sanitize_accout_name import sanitize_account_name
from general_functions.return_account_ids import return_account_ids
from general_functions.constants import return_api_url
from general_functions.call_api_with_account_id import call_api_with_accountId, send_to_innkeepr_api_paginated

In [3]:
# Lower bound (ISO date string) passed as the `$gte` value in the model query.
date = "2025-08-01"

# Parquet cache of all fetched models; relative to the current working directory.
save_path = "DataChecks/models/all_models.parquet"

In [4]:
# Fetch the models of every account, look up the dates embedded in the first S3
# key found under each model path, and accumulate everything in the parquet cache
# at `save_path`.
s3 = S3Connection()
account_ids = return_account_ids()
url = return_api_url()

# Start from the cached table when it exists. Only a missing file is treated as
# "no cache yet"; any other read error is surfaced instead of silently discarded.
try:
    all_models = pd.read_parquet(save_path)
except FileNotFoundError:
    all_models = pd.DataFrame()

cache_key_columns = ["path", "audience", "created"]
date_columns = ["dir_date", "file_date"]

for account in account_ids:
    print(f"Account: {account['name']}")
    account_name_sanitized = sanitize_account_name(account["name"])

    models = call_api_with_accountId(
        endpoint_url=f"{url}/models/query",
        accountID=account["id"],
        # NOTE(review): key corrected from "ceated"; confirm the API filters on the
        # same "created" field that is read from the response below.
        content={"created": {"$gte": date}},
        logger=logging,
    )
    models = pd.json_normalize(models)
    if models.empty:
        continue

    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered response.
    models = models[models["created"] >= "2025-01-01"].copy()
    print(f"modles min: {models['created'].min()}, max: {models['created'].max()}")
    models["account"] = account["name"]
    models["dir_date"] = None
    models["file_date"] = None

    # Keys (path, audience, created) that already have both dates in the cache.
    # A key counts as cached as soon as one cached row for it is complete, because
    # rows skipped in a run are appended with empty dates. The column check covers
    # the first run, when the cache frame has no columns at all.
    if set(cache_key_columns + date_columns).issubset(all_models.columns):
        complete_rows = all_models.dropna(subset=date_columns)
        cached_keys = set(
            zip(complete_rows["path"], complete_rows["audience"], complete_rows["created"])
        )
    else:
        cached_keys = set()

    # Paths whose dates were filled in during this run; every row sharing a path
    # is updated at once, so one S3 listing per path is enough.
    resolved_paths = set()
    for path, audience, created in zip(models["path"], models["audience"], models["created"]):
        if path in resolved_paths:
            continue
        if (path, audience, created) in cached_keys:
            print(f"path exists: {path}")
            continue
        try:
            files = s3.list_files(f"innkeepr-targeting-{account_name_sanitized}", path)
        except Exception as e:
            # Best effort: report the listing failure and move on to the next model.
            print(e)
            continue
        if len(files) == 0:
            continue
        print(f"Files: {files}")

        # Named `path_dates` so the module-level `date` (the query bound above) is
        # not overwritten for the following accounts.
        path_dates = re.findall(r"\d{4}-\d{2}-\d{2}", files[0])
        if not path_dates:
            # No date in the key: nothing to record for this path.
            continue
        same_path = models["path"] == path
        models.loc[same_path, "dir_date"] = path_dates[0]
        if len(path_dates) == 2:
            models.loc[same_path, "file_date"] = path_dates[1]
        resolved_paths.add(path)

    # Append the account's models once, in their final state, and only when this
    # run resolved at least one path (instead of once per processed row).
    if resolved_paths:
        all_models = pd.concat([all_models, models], ignore_index=True)

    # Checkpoint after every account so an interrupted run keeps its progress.
    all_models.to_parquet(save_path)
    

In [5]:
# Show the model paths next to the dates extracted for them.
all_models.loc[:, ["path", "dir_date", "file_date"]]

In [6]:
# Reload the table from the parquet cache and preview its first rows.
all_models = pd.read_parquet(path=save_path)
all_models.head(5)

In [7]:
# Reduce the table to one row per (audience, path) and label each model's type.
# Sorting first fixes which duplicate survives: the first row in
# (path, dir_date, file_date) order, with missing dates sorted last.
all_models = all_models.sort_values(by=["path", "dir_date", "file_date"]).reset_index(drop=True)
print(all_models.shape)

# .copy() gives an independent frame, so the column assignments here and in later
# cells do not write into a slice of `all_models`.
all_models_filtered = all_models.drop_duplicates(subset=["audience", "path"]).copy()

# Paths containing "-aud-" are labelled "causal", everything else "conversion".
# regex=False matches the text literally; na=False keeps a missing path from
# producing an NA mask entry (such rows fall into "conversion").
is_causal = (
    all_models_filtered["path"].astype("string").str.contains("-aud-", regex=False, na=False)
)
all_models_filtered["type"] = np.where(is_causal, "causal", "conversion")
all_models_filtered

In [11]:
# Label every model with the quarter in which it was created.
# Each entry is (label, first day, first day of the NEXT quarter). The upper bound
# is exclusive: with an inclusive "YYYY-MM-DD" bound, a `created` value carrying a
# time component (e.g. "2025-03-31T10:00:00") would fall outside its quarter.
quarter_bounds = [
    ("Q1/25", "2025-01-01", "2025-04-01"),
    ("Q2/25", "2025-04-01", "2025-07-01"),
    ("Q3/25", "2025-07-01", "2025-10-01"),
    ("Q4/25", "2025-10-01", "2026-01-01"),
    ("Q1/26", "2026-01-01", "2026-04-01"),
]

created = all_models_filtered["created"]
# Models created outside all listed quarters keep Quartal = None.
all_models_filtered["Quartal"] = None
for label, start, end_exclusive in quarter_bounds:
    in_quarter = (created >= start) & (created < end_exclusive)
    all_models_filtered.loc[in_quarter, "Quartal"] = label

In [13]:
# Count the models of each type per quarter, as a long table with one row per
# (Quartal, type) combination.
type_counts_per_quarter = all_models_filtered.groupby(by=["Quartal"])["type"].value_counts(
    dropna=False
)
model_types_by_quartal = type_counts_per_quarter.reset_index()
model_types_by_quartal

In [14]:
# Reshape the long count table to one row per quarter with one column per model
# type, then compute the share of causal models.
# This cell expects the long (Quartal, type, count) table as input and replaces it
# with the wide one, so re-run the counting cell before re-running this one.
model_types_by_quartal = (
    pd.pivot(model_types_by_quartal, index="Quartal", columns="type", values="count")
    # Guarantee both type columns exist and treat a type that is absent in a
    # quarter as 0 models; otherwise the share for that quarter would be NaN.
    .reindex(columns=["causal", "conversion"])
    .fillna(0)
)
model_types_by_quartal["causal percentage"] = model_types_by_quartal["causal"] / (
    model_types_by_quartal["causal"] + model_types_by_quartal["conversion"]
)
model_types_by_quartal

In [25]:
# Count the models of each type per quarter and account (long table).
# Built in this cell so that displaying it does not depend on a later cell having
# been run first.
models_by_account = (
    all_models_filtered.groupby(by=["Quartal", "account"])["type"]
    .value_counts(dropna=False)
    .reset_index()
)
models_by_account

In [None]:
# Count the models of each type per quarter and account, reshape to one row per
# (Quartal, account) with one column per type, and compute the causal share.
models_by_account = (
    all_models_filtered.groupby(by=["Quartal", "account"])["type"]
    .value_counts(dropna=False)
    .reset_index()
)
model_types_by_quartal_and_account = (
    pd.pivot(models_by_account, index=["Quartal", "account"], columns="type", values="count")
    # Guarantee both type columns exist and treat a type an account did not use in
    # a quarter as 0 models; otherwise the share for that row would be NaN.
    .reindex(columns=["causal", "conversion"])
    .fillna(0)
)
model_types_by_quartal_and_account["causal percentage"] = (
    model_types_by_quartal_and_account["causal"]
    / (
        model_types_by_quartal_and_account["causal"]
        + model_types_by_quartal_and_account["conversion"]
    )
)
model_types_by_quartal_and_account = model_types_by_quartal_and_account.reset_index()
model_types_by_quartal_and_account