In [120]:
import pandas as pd
import numpy as np
import os

In [121]:
game_files = [
    'AuthorizedAttendanceReportCsv_Round 2_ Hapoel Tel Aviv 🚗.csv',
    'AuthorizedAttendanceReportCsv_Round 4_ Hapoel Holon 🏠.csv',
    'AuthorizedAttendanceReportCsv_Round 8_ Hapoel Afula 🏠.csv',
    'AuthorizedAttendanceReportCsv_🏠 מחזור 9_ הפועל ״בנק יהב״ ירושלים -הפועל גליל עליון.csv',
    'AuthorizedAttendanceReportCsv_🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת"א.csv',
    'AuthorizedAttendanceReportCsv_ליגת ווינר סל מחזור 17_ גלבוע גליל 🏠.csv',
    'AuthorizedAttendanceReportCsv_ליגת ווינר סל מחזור 18_ הרצליה 🏠 .csv',
    'AuthorizedAttendanceReportCsv_ליגת ווינר סל מחזור 20_ הפועל חיפה 🏠.csv',
    'AuthorizedAttendanceReportCsv_מחזור 22_ אליצור עירוני נתניה 🏠.csv',
    'AuthorizedAttendanceReportCsv_מחזור 24_ הפועל באר שבע - דימונה 🏠.csv',
    'AuthorizedAttendanceReportCsv_מחזור 26_ עירוני נס ציונה 🏠.csv',
    'AuthorizedAttendanceReportCsv_רבע גמר 1_ מכבי  עירוני רמת גן 🏠.csv',
    'AuthorizedAttendanceReportCsv_רבע גמר 3_ מכבי  עירוני רמת גן 🏠.csv',
    'AuthorizedAttendanceReportCsv_חצי גמר משחק 2_ הפועל תל אביב 🏠.csv',
    'AuthorizedAttendanceReportCsv_גמר ליגת ווינר משחק 2_ מכבי תל אביב 🏠.csv'
]

summary_list = []
raw_data_list = [] 

for file in game_files:
    game_df = pd.read_csv(file)
    game_df['Full Name'] = game_df['First name'] + " " + game_df['Last name']

    # Add Event Name
    event_name = file.split('Csv_')[1].replace('.csv', '')
    game_df['Event Name'] = event_name

    # Store unfiltered raw data
    raw_data_list.append(game_df.copy())

    # Filter only single ticket purchases
    game_df = game_df[(game_df['STRefID'].isna()) & (game_df['Type'] == 'Ticket')]

    # Use Voucher or CloseLink
    game_df['Name'] = np.where(
        game_df['Voucher name'].notna(),
        game_df['Voucher name'],
        game_df['CloseLink reservation name']
    )

    game_df['Type'] = np.where(
        game_df['Voucher name'].notna(),
        'Voucher',
        'CloseLink'
    )

    # Drop NA group names
    game_df = game_df.dropna(subset=['Name'])

    # Group by
    grouped = game_df.groupby(['Name']).size().reset_index(name='Count')
    grouped['Event Name'] = event_name

    grouped = grouped.merge(
        game_df[['Type', 'Name']].drop_duplicates(subset='Name'),
        on='Name',
        how='left'
    )

    summary_list.append(grouped)


working_games = pd.concat(summary_list, ignore_index=True)

# Define unwanted substrings
unwanted_substrings = ['חוץ', 'אורחת', 'רמת גן קישור לאוהדים', 'הפועל באר שבע דימונה', 'ניסיון קהילה']

# Filter out rows where Name contains any of the unwanted substrings
pattern = '|'.join(unwanted_substrings)
working_games = working_games[~working_games['Name'].str.contains(pattern, na=False)]


working_games = working_games[['Event Name', 'Name', 'Type', 'Count']].sort_values(by=['Event Name', 'Name'])
working_games.to_excel('working_games.xlsx')

all_raw_data = pd.concat(raw_data_list, ignore_index=True)

working_games


Columns (16,21,31,35,38,40,44,45,64,65,70,71,75) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,Event Name,Name,Type,Count
0,Round 2_ Hapoel Tel Aviv 🚗,-50%t,Voucher,7
1,Round 2_ Hapoel Tel Aviv 🚗,-50t,Voucher,4
2,Round 2_ Hapoel Tel Aviv 🚗,00,Voucher,2
3,Round 2_ Hapoel Tel Aviv 🚗,lovehaphat,Voucher,9
4,Round 2_ Hapoel Tel Aviv 🚗,טסט קופון רב פעמי 1,Voucher,2
...,...,...,...,...
38,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",HAPOELFAM2212,Voucher,1
39,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",הפועל לב ירושלים,CloseLink,4
40,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",טסט קופון רב פעמי 1,Voucher,2
41,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",מאפס קרדיט,Voucher,1


### Same table with names and id of the people who used it:

In [122]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import numpy as np
import re

def normalize_name(name):
    if pd.isna(name):
        return name
    name = str(name)
    name = name.replace("'", "")  # remove apostrophes
    name = name.replace('"', "")  # remove double quotes
    name = name.replace('אשה', 'אישה')  # fix missing י
    name = name.replace('בית ספר כרמים', 'בית הספר כרמים')  # fix missing י
    name = re.sub(r'\bה\s+', '', name)  # remove 'ה' if it's at beginning of a word
    name = re.sub(r'\s+', ' ', name)  # normalize multiple spaces
    name = name.strip()  # remove leading/trailing spaces
    return name


summary_list = []
raw_data_list = [] 

for file in game_files:
    game_df = pd.read_csv(file)

    if 'Transaction date' in game_df.columns:
        game_df['Transaction date'] = pd.to_datetime(game_df['Transaction date'], errors='coerce').dt.date
    else:
        game_df['Transaction date'] = pd.NaT  # Fill with NaT if not present

    game_df['Full Name'] = game_df['First name'] + " " + game_df['Last name']

    # Add Event Name
    event_name = file.split('Csv_')[1].replace('.csv', '')
    game_df['Event Name'] = event_name

    # Store unfiltered raw data
    raw_data_list.append(game_df.copy())

    # Filter only single ticket purchases
    game_df = game_df[(game_df['STRefID'].isna()) & (game_df['Type'] == 'Ticket')]

    # Use Voucher or CloseLink
    game_df['Name'] = np.where(
        game_df['Voucher name'].notna(),
        game_df['Voucher name'],
        game_df['CloseLink reservation name']
    )

    game_df['Type'] = np.where(
        game_df['Voucher name'].notna(),
        'Voucher',
        'CloseLink'
    )

    # Drop NA group names
    game_df = game_df.dropna(subset=['Name'])

    # Instead of grouping: just select columns
    selected = game_df[['Event Name', 'Name', 'Full Name', 'assign using  ID number', 'Age', 'Type', 'Transaction date']].copy()

    selected['Count'] = 1  # each row = 1 ticket/person

    summary_list.append(selected)

working_games_with_ids = pd.concat(summary_list, ignore_index=True)

# Drop bad rows
working_games_with_ids = working_games_with_ids.dropna(subset=['assign using  ID number'])
working_games_with_ids = working_games_with_ids[
    working_games_with_ids['assign using  ID number'].astype(str).str.len() > 2
]
working_games_with_ids = working_games_with_ids.drop_duplicates()

# Normalize names
working_games_with_ids['Name_normalized'] = working_games_with_ids['Name'].apply(normalize_name)

# Fuzzy match similar names
unique_names = working_games_with_ids['Name_normalized'].unique()
name_mapping = {}

for name in unique_names:
    if name not in name_mapping:
        matches = process.extract(name, unique_names, scorer=fuzz.token_sort_ratio)
        for match_name, score in matches:
            if score > 90:  # adjust threshold if needed
                name_mapping[match_name] = name

# Apply mapping
working_games_with_ids['Name_normalized'] = working_games_with_ids['Name_normalized'].map(name_mapping).fillna(working_games_with_ids['Name_normalized'])

# Replace Name column
working_games_with_ids['Name'] = working_games_with_ids['Name_normalized']
working_games_with_ids = working_games_with_ids.drop(columns=['Name_normalized'])

# Define unwanted substrings
unwanted_substrings = ['חוץ', 'אורחת', 'רמת גן קישור לאוהדים', 'הפועל באר שבע דימונה', 'ניסיון קהילה']

# Filter out rows where Name contains any of the unwanted substrings
pattern = '|'.join(unwanted_substrings)
working_games_with_ids = working_games_with_ids[~working_games_with_ids['Name'].str.contains(pattern, na=False)]

# Final sorting
working_games_with_ids = working_games_with_ids.sort_values(by=['Event Name', 'Name'])

# Export to Excel
working_games_with_ids.to_excel('working_games_with_ids.xlsx', index=False)

working_games_with_ids


Columns (16,21,31,35,38,40,44,45,64,65,70,71,75) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,Event Name,Name,Full Name,assign using ID number,Age,Type,Transaction date,Count
8,Round 2_ Hapoel Tel Aviv 🚗,-50%t,יקי קוזאהינוף,027124510,51.0,Voucher,2024-10-13,1
11,Round 2_ Hapoel Tel Aviv 🚗,-50%t,גיל בש,GILBASH@GMAIL.COM,,Voucher,2024-10-13,1
13,Round 2_ Hapoel Tel Aviv 🚗,-50%t,אבי סמואלס,28327,24.0,Voucher,2024-10-13,1
4,Round 2_ Hapoel Tel Aviv 🚗,-50t,יאיר מרינוב,11206,24.0,Voucher,2024-10-09,1
2,Round 2_ Hapoel Tel Aviv 🚗,00,ELAD KASIR,217991017,15.0,Voucher,2024-10-09,1
...,...,...,...,...,...,...,...,...
930,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,עילי ליכטנשטיין,326612280,20.0,CloseLink,2024-12-16,1
931,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,איתי תורג׳מן,324187095,20.0,CloseLink,2024-12-16,1
932,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,יונתן אליעז,216009829,19.0,CloseLink,2024-12-16,1
949,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,נטלי אנגלמן,306930702,45.0,CloseLink,2024-12-19,1


In [123]:
working_games_with_ids['Name'].unique()

array(['-50%t', '-50t', '00', 'lovehaphat', 'טסט קופון רב פעמי 1',
       'Complementary game tickets', 'אישה לאישה', 'בית הלוחם',
       'בית הספר בית הכרם', 'בית ספר תבל', 'הפועל ירושלים כדורעף',
       'כרטיס ב-50% הנחה', 'מועצה אזורית עין גדי', 'מיכאל.', 'מקיף גילה',
       '-100T', 'Credit', 'Resale', 'כרטיסי חבר גלריה', '-250', '100%T',
       'זיכוי', 'חצי גמר 2025', 'יאללה הדסה', 'כרטיס חבר B',
       'בית הספר כרמים', 'גדוד 53', 'הפועל אורן מודיעין', 'מכון סאמיט',
       'מרכז קהילתי גוננים', 'עמותת עדי', 'שמחה לילד', 'תיכון הימלפרב',
       'early0203', 'hjfamilyfriends', 'גימנסיה ירושלים', 'קרוס ריבר בנק',
       'תיכון מבשרת', '-10%T', 'NCSY', 'מכינת יונתן',
       'עובדי הדסה - קישור כללי', 'עמותת שבט הנובה', '-30%',
       'בית ילדים רמת רחל', 'בית ספר גאולים', 'כפר שאול',
       'כרטיסי ארגונים', 'מחלקת הנוער - הר אדר', 'מינהל קהילתי הר חומה',
       'הפועל לב ירושלים', 'כרטיסים להגרלה', '-40T', '-80T', '40 ALL',
       'קבוצת נוער רמת השרון', 'שוברי פסח 2025', 'שערי צדק

### All row data:

In [124]:
columns_to_keep = [
    'Event Id', 'assign using  ID number', 'First name', 'Last name', 'School', 'Gender', 'Age', 'Email', 'Id', 'Type', 'Event Name',
    'Additional phone number', 'Product', 'STRefID', 'OwnerSTUserId', 'OwnerSTName', 'Subtype Ticket number', 'Barcode', 'Card number',
    'User Id', 'Street', 'House number City', 'Country', 'Zip code', 'Stand', 'Area', 'Row', 'Number', 'Price area', 'Entrance code',
    'Entrance text', 'Transaction identifier', 'Transaction date', 'Delivery type', 'Price type', 'Price paid', 'Phone No.',
    'Transaction owner first name', 'Transaction owner last name', 'Transaction owner email', 'CloseLink reservation name',
    'CloseLink code', 'Voucher name', 'Voucher batch name', 'Discount price', 'Role', 'User group', 'Payment method', 'Ticket note',
    'Season tickets', 'Custom field 1 UserIdentityVerified', 'Attendance', 'Attendance date'
]

# Keep only columns that actually exist in the final DataFrame
existing_columns = [col for col in columns_to_keep if col in all_raw_data.columns]
all_raw_data = all_raw_data[existing_columns]

all_raw_data.to_excel('all_raw_data.xlsx')

all_raw_data

Unnamed: 0,Event Id,assign using ID number,First name,Last name,School,Gender,Age,Email,Id,Type,...,Voucher name,Voucher batch name,Discount price,Role,User group,Payment method,Ticket note,Season tickets,Attendance,Attendance date
0,1694,,,,,Unknown,,,3045986,Ticket,...,,,No discount,Administrator,,Cash,,,No,No data
1,1694,43552660,עינת,מידן-דוד,,Female,42.0,einatth@gmail.com,3046265,Ticket,...,,,No discount,Fan,,Other,,,No,No data
2,1694,236546305,נועם,אהרוני,,Unknown,1.0,,3050463,Ticket,...,,,No discount,Administrator,,Pelecard_Credit Card,,,No,No data
3,1694,43467760,עדו,מידן דוד,,Male,43.0,idomedan@gmail.com,3053065,Ticket,...,,,No discount,Administrator,"פרימיום, חיילים בסדיר, מילואים, Hapoel Test gr...",Cash,,,No,No data
4,1694,215015306,עילי,שליו,,Unknown,20.0,,3053295,Ticket,...,,,No discount,Fan,,Pelecard_Credit Card,,,Yes,2024-10-13 20:05:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90263,4172,339027559,זיו,קציר,,Unknown,11.0,,899550,SeasonTicket,...,,,No discount,Administrator,,External Payment Subscriptions,,,No,No data
90264,4172,222664401,מיטל,הערצקה,,Unknown,10.0,,899556,SeasonTicket,...,,,No discount,Administrator,,Pelecard_Credit Card,,,No,No data
90265,4172,332578384,איילה,הערצקה,,Female,15.0,isaacherzka8@gmail.com,899557,SeasonTicket,...,,,No discount,Administrator,,Pelecard_Credit Card,,,No,No data
90266,4172,25304320,ערן,ויס,,Male,51.0,eranwe36@gmail.com,899563,SeasonTicket,...,,,No discount,Administrator,,Pelecard_Credit Card,,,No,No data


## Community data for Irit:

In [137]:
users = pd.read_csv('UserCreatedReport_2025-06-18 13_18.csv')
users['Full Name'] = users['firstname'] + " " + users['lastname']
users = users[['createdon', 'Full Name', 'identifier']]
users['createdon'] = pd.to_datetime(users['createdon'], errors='coerce').dt.date

users

Unnamed: 0,createdon,Full Name,identifier
0,2024-03-28,אביתר שמחוביץ,339130957
1,2024-03-28,גיא סלוניקי,201463015
2,2024-03-28,Uriya Aharoni,039052600
3,2024-03-28,דבורה זקין,321153488
4,2024-03-28,גל נוברט,208287953
...,...,...,...
23194,2025-06-14,איתי קליין,213143431
23195,2025-06-14,נרקיס יפת,313748360
23196,2025-06-15,רזיאל זלקר,323074500
23197,2025-06-15,גיא שחורי,216993212


In [138]:
merged = pd.merge(
    working_games_with_ids,
    users,
    left_on='assign using  ID number',
    right_on='identifier',
    how='left'
)

merged

Unnamed: 0,Event Name,Name,Full Name_x,assign using ID number,Age,Type,Transaction date,Count,createdon,Full Name_y,identifier
0,Round 2_ Hapoel Tel Aviv 🚗,-50%t,יקי קוזאהינוף,027124510,51.0,Voucher,2024-10-13,1,2024-10-13,יקי קוזאהינוף,027124510
1,Round 2_ Hapoel Tel Aviv 🚗,-50%t,גיל בש,GILBASH@GMAIL.COM,,Voucher,2024-10-13,1,2024-10-13,גיל בש,GILBASH@GMAIL.COM
2,Round 2_ Hapoel Tel Aviv 🚗,-50%t,אבי סמואלס,28327,24.0,Voucher,2024-10-13,1,2024-03-28,אבי סמואלס,28327
3,Round 2_ Hapoel Tel Aviv 🚗,-50t,יאיר מרינוב,11206,24.0,Voucher,2024-10-09,1,2024-03-28,יאיר מרינוב,11206
4,Round 2_ Hapoel Tel Aviv 🚗,00,ELAD KASIR,217991017,15.0,Voucher,2024-10-09,1,2024-09-30,ELAD KASIR,217991017
...,...,...,...,...,...,...,...,...,...,...,...
3915,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,עילי ליכטנשטיין,326612280,20.0,CloseLink,2024-12-16,1,2024-12-16,עילי ליכטנשטיין,326612280
3916,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,איתי תורג׳מן,324187095,20.0,CloseLink,2024-12-16,1,2024-12-16,איתי תורג׳מן,324187095
3917,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,יונתן אליעז,216009829,19.0,CloseLink,2024-12-16,1,2024-12-16,יונתן אליעז,216009829
3918,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,נטלי אנגלמן,306930702,45.0,CloseLink,2024-12-19,1,2024-12-18,נטלי אנגלמן,306930702


In [139]:
test = merged.copy()
test = test[test['Type'] == 'CloseLink']
print('Shape:' ,test.shape)

Shape: (2969, 11)


In [140]:
merged = merged[merged['Type'] == 'CloseLink']

# Convert to proper datetime (not just .dt.date)
merged['Transaction date'] = pd.to_datetime(merged['Transaction date'], errors='coerce')
merged['createdon'] = pd.to_datetime(merged['createdon'], errors='coerce')

# Now calculate the difference in days
merged['days_diff'] = (merged['Transaction date'] - merged['createdon']).dt.days

merged['Age'] = merged['Age'].fillna(0).astype(int)
merged['days_diff'] = merged['days_diff'].fillna(0).astype(int)

merged['days_diff'].value_counts().sort_index()
merged = merged.drop_duplicates(subset=['assign using  ID number'])

merged



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,Event Name,Name,Full Name_x,assign using ID number,Age,Type,Transaction date,Count,createdon,Full Name_y,identifier,days_diff
18,Round 8_ Hapoel Afula 🏠,אישה לאישה,איילת בן יעקב,301607347,36,CloseLink,2024-11-15,1,2024-11-15,איילת בן יעקב,301607347,0
19,Round 8_ Hapoel Afula 🏠,אישה לאישה,נועם בן יעקב,344013602,7,CloseLink,2024-11-15,1,2024-11-15,נועם בן יעקב,344013602,0
20,Round 8_ Hapoel Afula 🏠,אישה לאישה,נעמי ג'אנה,012438388,59,CloseLink,2024-11-15,1,2024-11-15,נעמי ג'אנה,012438388,0
21,Round 8_ Hapoel Afula 🏠,אישה לאישה,עמית שובל,328117718,20,CloseLink,2024-11-15,1,2024-11-15,עמית שובל,328117718,0
22,Round 8_ Hapoel Afula 🏠,אישה לאישה,יצחק שובל,038675211,56,CloseLink,2024-11-15,1,2024-11-15,יצחק שובל,038675211,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3915,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,עילי ליכטנשטיין,326612280,20,CloseLink,2024-12-16,1,2024-12-16,עילי ליכטנשטיין,326612280,0
3916,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,איתי תורג׳מן,324187095,20,CloseLink,2024-12-16,1,2024-12-16,איתי תורג׳מן,324187095,0
3917,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,יונתן אליעז,216009829,19,CloseLink,2024-12-16,1,2024-12-16,יונתן אליעז,216009829,0
3918,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,נטלי אנגלמן,306930702,45,CloseLink,2024-12-19,1,2024-12-18,נטלי אנגלמן,306930702,1


In [141]:
df_to_irit = merged.copy()
df_to_irit = df_to_irit[['Event Name', 'Name', 'Full Name_x', 'assign using  ID number', 'Age', 'Type', 'Transaction date', 'Count', 'createdon', 'days_diff']]
df_to_irit

Unnamed: 0,Event Name,Name,Full Name_x,assign using ID number,Age,Type,Transaction date,Count,createdon,days_diff
18,Round 8_ Hapoel Afula 🏠,אישה לאישה,איילת בן יעקב,301607347,36,CloseLink,2024-11-15,1,2024-11-15,0
19,Round 8_ Hapoel Afula 🏠,אישה לאישה,נועם בן יעקב,344013602,7,CloseLink,2024-11-15,1,2024-11-15,0
20,Round 8_ Hapoel Afula 🏠,אישה לאישה,נעמי ג'אנה,012438388,59,CloseLink,2024-11-15,1,2024-11-15,0
21,Round 8_ Hapoel Afula 🏠,אישה לאישה,עמית שובל,328117718,20,CloseLink,2024-11-15,1,2024-11-15,0
22,Round 8_ Hapoel Afula 🏠,אישה לאישה,יצחק שובל,038675211,56,CloseLink,2024-11-15,1,2024-11-15,0
...,...,...,...,...,...,...,...,...,...,...
3915,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,עילי ליכטנשטיין,326612280,20,CloseLink,2024-12-16,1,2024-12-16,0
3916,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,איתי תורג׳מן,324187095,20,CloseLink,2024-12-16,1,2024-12-16,0
3917,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,יונתן אליעז,216009829,19,CloseLink,2024-12-16,1,2024-12-16,0
3918,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,נטלי אנגלמן,306930702,45,CloseLink,2024-12-19,1,2024-12-18,1


In [142]:
filtered = merged[(merged['days_diff'] >= 0) & (merged['days_diff'] <= 50)]
filtered = filtered.drop_duplicates(subset=['assign using  ID number'])
filtered

Unnamed: 0,Event Name,Name,Full Name_x,assign using ID number,Age,Type,Transaction date,Count,createdon,Full Name_y,identifier,days_diff
18,Round 8_ Hapoel Afula 🏠,אישה לאישה,איילת בן יעקב,301607347,36,CloseLink,2024-11-15,1,2024-11-15,איילת בן יעקב,301607347,0
19,Round 8_ Hapoel Afula 🏠,אישה לאישה,נועם בן יעקב,344013602,7,CloseLink,2024-11-15,1,2024-11-15,נועם בן יעקב,344013602,0
20,Round 8_ Hapoel Afula 🏠,אישה לאישה,נעמי ג'אנה,012438388,59,CloseLink,2024-11-15,1,2024-11-15,נעמי ג'אנה,012438388,0
21,Round 8_ Hapoel Afula 🏠,אישה לאישה,עמית שובל,328117718,20,CloseLink,2024-11-15,1,2024-11-15,עמית שובל,328117718,0
22,Round 8_ Hapoel Afula 🏠,אישה לאישה,יצחק שובל,038675211,56,CloseLink,2024-11-15,1,2024-11-15,יצחק שובל,038675211,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3915,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,עילי ליכטנשטיין,326612280,20,CloseLink,2024-12-16,1,2024-12-16,עילי ליכטנשטיין,326612280,0
3916,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,איתי תורג׳מן,324187095,20,CloseLink,2024-12-16,1,2024-12-16,איתי תורג׳מן,324187095,0
3917,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,יונתן אליעז,216009829,19,CloseLink,2024-12-16,1,2024-12-16,יונתן אליעז,216009829,0
3918,"🏠 מחזור11_ הפועל ״בנק יהב״ ירושלים -מכבי ת""א",צוות דובדבן,נטלי אנגלמן,306930702,45,CloseLink,2024-12-19,1,2024-12-18,נטלי אנגלמן,306930702,1


In [143]:
filtered['days_diff'].value_counts().sort_index()

days_diff
0     1527
1       93
2       33
3       20
4       11
5       12
6        7
7       18
8        2
10       4
11       5
12       3
13       2
14       3
15       1
17       4
19       1
20       2
21       1
22       1
23       1
24      10
25       4
26       2
28       1
29       1
30       1
31       1
33       2
40       4
41       2
42       1
43       1
45       2
46       2
47       1
48       5
49       2
Name: count, dtype: int64

## Plot by days_diff:

In [None]:
import plotly.graph_objects as go
from datetime import datetime, timedelta
import pandas as pd

# Define your game dates
game_dates = [
    "2024-10-13", "2024-10-26", "2024-11-16", "2024-12-07", "2024-12-22",
    "2025-02-16", "2025-03-02", "2025-03-17", "2025-03-30", "2025-04-11",
    "2025-04-22", "2025-05-02", "2025-05-10", "2025-05-28", "2025-06-12"
]
game_dates = [datetime.strptime(d, "%Y-%m-%d") for d in game_dates]

# Ensure datetime types in merged
merged['createdon'] = pd.to_datetime(merged['createdon'], errors='coerce')
merged['Transaction date'] = pd.to_datetime(merged['Transaction date'], errors='coerce')

results = []

for i, game_date in enumerate(game_dates):
    if i == 0:
        # First game: just use 20 days
        start_window = game_date - timedelta(days=20)
        days_between_games = 20
    else:
        previous_game = game_dates[i - 1]
        days_diff = (game_date - previous_game).days
        days_between_games = min(20, days_diff)
        start_window = game_date - timedelta(days=days_between_games)

    game_attendees = merged[merged['Transaction date'] == game_date]

    attendees_window = game_attendees[
        (game_attendees['createdon'] >= start_window) & (game_attendees['createdon'] <= game_date)
    ]

    user_count = attendees_window['assign using  ID number'].nunique()

    results.append({
        'Game Date': game_date.date().isoformat(),
        'Users in Window': user_count,
        'Days Used for Window': days_between_games
    })

# Create DataFrame
df_plot = pd.DataFrame(results)
df_plot = df_plot[df_plot['Users in Window'] > 0]
df_plot['Game Date'] = pd.to_datetime(df_plot['Game Date']).dt.strftime('%Y-%m-%d')


In [None]:
# Plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=df_plot['Game Date'],
    y=df_plot['Users in Window'],
    marker_color='pink',
    text=df_plot['Users in Window'],
    textposition='outside',
    showlegend=False
))

fig.update_layout(
    title='New Users Created Between Previous Game and This One (Max 20 Days)',
    xaxis_title='Game Date',
    yaxis=dict(
        title='Number of New Users',
        range=[0, df_plot['Users in Window'].max() * 1.2]
    ),
    xaxis=dict(
        tickmode='array',
        tickvals=df_plot['Game Date'],
        ticktext=df_plot['Game Date'],
        tickangle=-45,
        type='category'
    ),
    plot_bgcolor='white'
)

fig.show()


## Pie chart of ages:
### 0-5
### 6-12
### 13-18
### 19+

In [151]:
import plotly.express as px

# Step 1: Categorize ages
def categorize_age(age):
    if pd.isna(age):
        return 'Unknown'
    try:
        age = int(age)
        if age <= 5:
            return '0–5'
        elif age <= 12:
            return '6–12'
        elif age <= 18:
            return '13–18'
        else:
            return '19+'
    except:
        return 'Unknown'


In [152]:
# Step 2: Apply to filtered table (the general table of interest)
merged['Age Group'] = merged['Age'].apply(categorize_age)

# Step 3: Count per age group
age_counts = merged['Age Group'].value_counts().reset_index()
age_counts.columns = ['Age Group', 'Count']
age_counts = age_counts.sort_values('Age Group')

# Step 4: Plot with Plotly
fig = px.pie(
    age_counts,
    names='Age Group',
    values='Count',
    title='Overall Age Distribution',
    color_discrete_sequence=px.colors.sequential.RdPu
)

fig.update_traces(textposition='inside', textinfo='percent+label')

fig.update_layout(
    title='Overall Age Distribution',
    width=500,  # 👈 reduce width
    height=400,  # 👈 optional: set height
    legend=dict(
        orientation="v",  # vertical legend
        x=1,  # place it just outside the pie
        y=0.5,
        xanchor='left'
    )
)

fig.show()


## New table for Irit with only closeLinks:

In [None]:
df_to_irit.to_excel('df_to_irit.xlsx', index=False)