In [54]:
from simple_salesforce import Salesforce
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

In [67]:
seasontickets = pd.read_csv('seasontickets.csv')
seasontickets = seasontickets[seasontickets['Status'] == 'Active']
seasontickets['User Id'] = seasontickets['User Id'].astype(int)
seasontickets = seasontickets.drop_duplicates(subset=['User Id'])

tickets = pd.read_csv('games.csv')
tickets = tickets.dropna(subset=['User Id']) # without cashier tickets
tickets['User Id'] = tickets['User Id'].astype(int)
tickets = tickets[tickets['Status'] == 'Active']

In [68]:
print('seasontickets shape:', seasontickets.shape)
print('tickets shape:', tickets.shape)

seasontickets shape: (3421, 54)
tickets shape: (4111, 54)


In [69]:
# without_members = tickets[~tickets['User Id'].isin(seasontickets['User Id'])]
# print('Tickets bought by non-members:', without_members.shape[0])
# without_members

# === NEW: add membership indicator instead of filtering ===
member_ids = set(seasontickets['User Id'])
tickets['Has membership card'] = tickets['User Id'].isin(member_ids)  # True/False

# (Optional) if you still want to see how many non-members bought:
print('Tickets bought by non-members:', (~tickets['Has membership card']).sum())

Tickets bought by non-members: 2305


In [70]:
# --- helpers (unchanged) ---
def concat_unique(s):
    s = s.dropna().astype(str).str.strip()
    if s.empty:
        return np.nan
    return ', '.join(pd.unique(s))

def first_nonnull(s):
    s = s.dropna()
    return s.iloc[0] if not s.empty else np.nan

def most_common_or_join(s):
    s = s.dropna().astype(str).str.strip()
    if s.empty:
        return np.nan
    m = s.mode()
    return m.iloc[0] if len(m) == 1 else ', '.join(pd.unique(s))

# --- tidy up & casting ---
df = tickets.copy()  # <-- use ALL tickets now, not just without_members

df['Fan / Company'] = df['Fan / Company'].astype(str).str.strip()

df['Price'] = (
    df['Price']
      .astype(str)
      .str.replace(r'[^0-9.\-]', '', regex=True)
      .replace({'': np.nan, '-': np.nan})
      .astype(float)
)

df['Birth date'] = pd.to_datetime(df['Birth date'], errors='coerce')

# --- 1) one line per (person, product) ---
per_product = (
    df.groupby(['Fan / Company', 'User Id', 'Product'], dropna=False)
      .agg(
          tickets_bought = ('Product', 'size'),
          total_price    = ('Price', 'sum'),
          ticket_price_types = ('Ticket price types', concat_unique),
          voucher_numbers    = ('Voucher number', concat_unique),
          voucher_names      = ('Voucher name', concat_unique),
          phone              = ('Phone', most_common_or_join),
          email              = ('Email', most_common_or_join),
          birth_date         = ('Birth date', first_nonnull),
          has_membership_card= ('Has membership card', 'max')  # True if user has it
      )
      .reset_index()
)

# --- 2) one line per person ---
person_summary = (
    df.groupby(['Fan / Company', 'User Id'], dropna=False)
      .agg(
          phone              = ('Phone', most_common_or_join),
          email              = ('Email', most_common_or_join),
          total_tickets      = ('Product', 'size'),
          product_list       = ('Product', concat_unique),
          distinct_products  = ('Product', 'nunique'),
          total_price        = ('Price', 'sum'),
          ticket_price_types = ('Ticket price types', concat_unique),
          voucher_numbers    = ('Voucher number', concat_unique),
          voucher_names      = ('Voucher name', concat_unique),
          birth_date         = ('Birth date', first_nonnull),
          has_membership_card= ('Has membership card', 'max')  # carries the True/False
      )
      .reset_index()
)

# --- age calc (unchanged) ---
today = pd.Timestamp.today().normalize()
bd = person_summary['birth_date']
has_had_bday = (
    (bd.dt.month < today.month) |
    ((bd.dt.month == today.month) & (bd.dt.day <= today.day))
)
age = (today.year - bd.dt.year) - (~has_had_bday).astype(int)
person_summary['age'] = age.where(bd.notna())

# order columns (added has_membership_card)
person_summary = person_summary[
    ['Fan / Company', 'User Id', 'has_membership_card', 'phone', 'email',
     'total_tickets', 'distinct_products', 'product_list',
     'total_price', 'ticket_price_types', 'voucher_numbers', 'voucher_names', 'age']
]

person_summary


Unnamed: 0,Fan / Company,User Id,has_membership_card,phone,email,total_tickets,distinct_products,product_list,total_price,ticket_price_types,voucher_numbers,voucher_names,age
0,Adam Berrous,1420654,False,0552864391,adamberrous@gmail.com,1,1,גביע ווינר שלב הבתים #1: מכבי ראשון לציון 🏠,50.0,Child,,,12.0
1,Alejandra Rosenberg,1435692,False,0515690960,alerosenberg81@hotmail.com,4,1,גביע ווינר שלב הבתים #2: מכבי עירוני רמת גן 🏠,440.0,Adult,,,44.0
2,Anthony Lamb,1435129,False,,,2,1,גביע ווינר שלב הבתים #1: מכבי ראשון לציון 🏠,0.0,Complementary,,,
3,Asaf Ataria,1424949,False,0503744476,asaf.ataria@gmail.com,1,1,חצי גמר גביע ווינר: מכבי תל אביב,75.0,Adult,,,17.0
4,Austin Wiley,1349202,False,,Austin.wiley50@gmail.com,1,1,חצי גמר גביע ווינר: מכבי תל אביב,0.0,Complementary,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,תמרה פקר,1332668,True,,,1,1,גביע ווינר שלב הבתים #2: מכבי עירוני רמת גן 🏠,0.0,Adult,יאללה יפה נוף,יאללה בתי ספר,12.0
2048,תמרי ברודר,1342303,True,,,1,1,חצי גמר גביע ווינר: מכבי תל אביב,75.0,Adult,,,14.0
2049,‪Hadar Glinert‬‏,13538,True,0526880605,hadar.glinert@gmail.com,1,1,חצי גמר גביע ווינר: מכבי תל אביב,75.0,Adult,,,23.0
2050,‫בניה כספי‬‎,1421903,False,0544487979,cbnaya@gmail.com,1,1,גביע ווינר שלב הבתים #1: מכבי ראשון לציון 🏠,0.0,Adult,יאללה גוננים,יאללה בתי ספר,43.0


In [71]:
def fetch_marketing_allowed_from_salesforce_smart(
    user_ids,
    sf_username=None, sf_password=None, sf_security_token=None,
    dotenv_path=None,
    key_field_api_name="hjbc_id__c",   # correct API name
    batch_size=150,                    # keep URL under limits
):
    """
    Fast path: batched SOQL using WHERE hjbc_id__c IN (...), SELECT hjbc_id__c.
    Fallback: per-ID queries if FLS hides hjbc_id__c.
    Returns: ['hjbc_id__c','Contact Name','Account Name','Marketing Allowed','SF_Contact_Id']
    """
    load_dotenv(dotenv_path=dotenv_path)
    sf = Salesforce(
        username=sf_username or os.getenv("SF_USERNAME"),
        password=sf_password or os.getenv("SF_PASSWORD"),
        security_token=sf_security_token or os.getenv("SF_SECURITY_TOKEN"),
    )

    ids = pd.Series(user_ids).dropna().astype(str).str.strip().unique().tolist()
    if not ids:
        return pd.DataFrame(columns=["hjbc_id__c","Contact Name","Account Name","Marketing Allowed","SF_Contact_Id"])

    def q(v: str) -> str:  # SOQL-escape
        return "'" + v.replace("'", "''") + "'"

    # ---------- FAST PATH: batched IN (...) and SELECT the key ----------
    batched_results = []
    fls_hidden = False

    for i in range(0, len(ids), batch_size):
        chunk = ids[i:i+batch_size]
        soql = f"""
            SELECT Id, Name, Account.Name, marketing_allowed__c, {key_field_api_name}
            FROM Contact
            WHERE {key_field_api_name} IN ({",".join(q(x) for x in chunk)})
        """
        res = sf.query_all(soql)
        recs = res.get("records", [])
        for r in recs:
            key_val = r.get(key_field_api_name)  # may be None if FLS hides
            batched_results.append({
                "hjbc_id__c": key_val,
                "SF_Contact_Id": r.get("Id"),
                "Contact Name": r.get("Name"),
                "Account Name": (r.get("Account") or {}).get("Name"),
                "Marketing Allowed": r.get("marketing_allowed__c"),
            })

    if batched_results:
        tmp = pd.DataFrame(batched_results)
        # If most keys came back None, assume FLS is hiding the field -> fallback
        null_rate = tmp["hjbc_id__c"].isna().mean()
        fls_hidden = null_rate > 0.5

        if not fls_hidden:
            # Collapse and also add rows for IDs with no match
            tmp["hjbc_id__c"] = tmp["hjbc_id__c"].astype(str).str.strip()
            collapsed = (
                tmp.groupby("hjbc_id__c", dropna=False)
                   .agg({
                       "Marketing Allowed": lambda s: bool(pd.Series(s).fillna(False).any()) if s.notna().any() else pd.NA,
                       "Contact Name":     lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA,
                       "Account Name":     lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA,
                       "SF_Contact_Id":    lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA,
                   })
                   .reset_index()
            )
            # ensure all requested IDs exist in the result
            missing = pd.Index(ids).difference(collapsed["hjbc_id__c"])
            if len(missing):
                collapsed = pd.concat([
                    collapsed,
                    pd.DataFrame({
                        "hjbc_id__c": list(missing),
                        "Marketing Allowed": pd.NA,
                        "Contact Name": pd.NA,
                        "Account Name": pd.NA,
                        "SF_Contact_Id": pd.NA,
                    })
                ], ignore_index=True)
            return collapsed

    # ---------- FALLBACK: per-ID queries, carry the key (FLS-safe) ----------
    results = []
    for uid in ids:
        soql = f"""
            SELECT Id, Name, Account.Name, marketing_allowed__c
            FROM Contact
            WHERE {key_field_api_name} = {q(uid)}
        """
        res = sf.query_all(soql)
        recs = res.get("records", [])
        if recs:
            for r in recs:
                results.append({
                    "hjbc_id__c": uid,  # carry queried key
                    "SF_Contact_Id": r.get("Id"),
                    "Contact Name": r.get("Name"),
                    "Account Name": (r.get("Account") or {}).get("Name"),
                    "Marketing Allowed": r.get("marketing_allowed__c"),
                })
        else:
            results.append({
                "hjbc_id__c": uid,
                "SF_Contact_Id": pd.NA,
                "Contact Name": pd.NA,
                "Account Name": pd.NA,
                "Marketing Allowed": pd.NA,
            })

    out = pd.DataFrame(results)
    out = (out.groupby("hjbc_id__c", dropna=False)
             .agg({
                 "Marketing Allowed": lambda s: bool(pd.Series(s).fillna(False).any()) if s.notna().any() else pd.NA,
                 "Contact Name":     lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA,
                 "Account Name":     lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA,
                 "SF_Contact_Id":    lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA,
             })
             .reset_index())
    return out

In [72]:
person_summary["User Id"] = person_summary["User Id"].astype(str).str.strip()

sf_flags = fetch_marketing_allowed_from_salesforce_smart(
    person_summary["User Id"].unique()
)

person_summary = (
    person_summary
      .merge(sf_flags, how="left", left_on="User Id", right_on="hjbc_id__c")
      .drop(columns=["hjbc_id__c"])   # tidy
)

person_summary = person_summary.drop(columns=['SF_Contact_Id', 'Account Name', 'Contact Name'])

In [73]:
person_summary

Unnamed: 0,Fan / Company,User Id,has_membership_card,phone,email,total_tickets,distinct_products,product_list,total_price,ticket_price_types,voucher_numbers,voucher_names,age,Marketing Allowed
0,Adam Berrous,1420654,False,0552864391,adamberrous@gmail.com,1,1,גביע ווינר שלב הבתים #1: מכבי ראשון לציון 🏠,50.0,Child,,,12.0,True
1,Alejandra Rosenberg,1435692,False,0515690960,alerosenberg81@hotmail.com,4,1,גביע ווינר שלב הבתים #2: מכבי עירוני רמת גן 🏠,440.0,Adult,,,44.0,True
2,Anthony Lamb,1435129,False,,,2,1,גביע ווינר שלב הבתים #1: מכבי ראשון לציון 🏠,0.0,Complementary,,,,False
3,Asaf Ataria,1424949,False,0503744476,asaf.ataria@gmail.com,1,1,חצי גמר גביע ווינר: מכבי תל אביב,75.0,Adult,,,17.0,True
4,Austin Wiley,1349202,False,,Austin.wiley50@gmail.com,1,1,חצי גמר גביע ווינר: מכבי תל אביב,0.0,Complementary,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,תמרה פקר,1332668,True,,,1,1,גביע ווינר שלב הבתים #2: מכבי עירוני רמת גן 🏠,0.0,Adult,יאללה יפה נוף,יאללה בתי ספר,12.0,False
2048,תמרי ברודר,1342303,True,,,1,1,חצי גמר גביע ווינר: מכבי תל אביב,75.0,Adult,,,14.0,False
2049,‪Hadar Glinert‬‏,13538,True,0526880605,hadar.glinert@gmail.com,1,1,חצי גמר גביע ווינר: מכבי תל אביב,75.0,Adult,,,23.0,True
2050,‫בניה כספי‬‎,1421903,False,0544487979,cbnaya@gmail.com,1,1,גביע ווינר שלב הבתים #1: מכבי ראשון לציון 🏠,0.0,Adult,יאללה גוננים,יאללה בתי ספר,43.0,False


In [74]:
person_summary['Marketing Allowed'].value_counts()

Marketing Allowed
False    1324
True      728
Name: count, dtype: int64

In [76]:
# save (filename updated since it now includes members too, optional)
person_summary.to_excel('winnercup_tickets_with_membership_flag.xlsx', index=False)
# per_product.to_excel('winnercup_tickets_per_product_with_membership_flag.xlsx', index=False)