In [1]:
import pandas as pd
import re
from itertools import combinations

In [4]:
df = pd.read_csv('FriendsReportCsv_2025-06-25.csv')
users = pd.read_csv('UserCreatedReport_2025-06-25 08_35.csv')

  users = pd.read_csv('UserCreatedReport_2025-06-25 08_35.csv')


In [5]:
df['Full Name'] = df['First name'] + " " + df['Last name']
df = df[['assign using  ID number', 'Full Name', 'FriendsList']]
df

Unnamed: 0,assign using ID number,Full Name,FriendsList
0,15394992,גוסטבו שנייברג,207877705 אור שנייברג
1,200466076,גיא יוחי הרפז,229318381 מיקה יוחי הרפז
2,217743160,שניר גרונוולד,208068189 איל שרמן
3,032531709,דן ורנר,344639463 אביתר ורנר
4,60464765,חיים טובול,333764967 נועם טובול
...,...,...,...
4159,28790319,איסי שלומוף,"216516914 עומר שלומוף , 000676379 Yigal Shlomoff"
4160,24283525,יעקב יאיר,59814376 משה אהרוני
4161,22943781,עומר שלף,"38397329 רעות סמדר , 214599706 ליה שלף , 20772..."
4162,043127976,מיכל קרני,"219147626 שלו קרני , 033039603 איתמר קרני"


## Create pairs of full name and friendslist with ids:

In [6]:
pair_df = []

for _, row in df.iterrows():
    # extracting the values so it will be easier to use
    person_id = row['assign using  ID number']
    person_name = row['Full Name']
    friends_raw = row['FriendsList']

    friends = [] 
    # extract the id and name pairs
    # [ ] = match any one character inside the brackets
    # \w = matches any word character (letters, digits, or underscores)
    # @ = matches @
    # . = matches a dot
    # \+ = matches a plus sign
    # \- = matches a - sign
    # + = one or more occurrences of the preceding character or group
    # ( ) = captures this group as the first match
    
    # \s = matches any whitespace character
    
    # [ ] = match any one character inside the brackets
    # -א-ת = matches any Hebrew character
    # A-Za-z = matches any English alphabet character

    friends = re.findall(r'([\w@.+\-]+)\s+([-א-תA-Za-z\s]+)', friends_raw)

    # append it to the table
    for friend_id, friend_name in friends:
        pair_df.append({
        'Person ID' : person_id,
        'Person Name' : person_name,
        'Friend ID' : friend_id,
        'Friend Name' : friend_name.strip()   # strip - cleanup spaces
        })

    # append it to the table
    for (person1_id, person1_name), (friend1_id, friend1_name) in combinations(friends, 2):
        pair_df.append({
            'Person ID': person1_id,
            'Person Name': person1_name.strip(), # strip - cleanup spaces
            'Friend ID': friend1_id,
            'Friend Name': friend1_name.strip() # strip - cleanup spaces
        })

pair_df = pd.DataFrame(pair_df)

pair_df.drop_duplicates()

pair_df

Unnamed: 0,Person ID,Person Name,Friend ID,Friend Name
0,15394992,גוסטבו שנייברג,207877705,אור שנייברג
1,200466076,גיא יוחי הרפז,229318381,מיקה יוחי הרפז
2,217743160,שניר גרונוולד,208068189,איל שרמן
3,032531709,דן ורנר,344639463,אביתר ורנר
4,60464765,חיים טובול,333764967,נועם טובול
...,...,...,...,...
19963,214599706,ליה שלף,207727983,תום שלף
19964,043127976,מיכל קרני,219147626,שלו קרני
19965,043127976,מיכל קרני,033039603,איתמר קרני
19966,219147626,שלו קרני,033039603,איתמר קרני


## Check if all went well and the data is correct:

In [7]:
check = df[~df['Full Name'].isin(pair_df['Person Name'])]
check

Unnamed: 0,assign using ID number,Full Name,FriendsList


## Add the age column to users based on the birthdate column:

In [8]:
from datetime import date

# convert the string to datetime
users['bithdate'] = pd.to_datetime(users['birthdate'], errors='coerce')

# extract the year from the bithdate column
users['Year'] = users['bithdate'].dt.year

# getting todays year
todays_year = date.today() 
todays_year = todays_year.year

# calculation of the age
users['Age'] = todays_year - users['Year']

## Adding the age column for both Person Name and Friend Name:

In [9]:
# ensure identifier in users is unique
users = users.drop_duplicates(subset=['identifier'])

# add Age column for Person ID
pair_with_age = pair_df.merge(users[['identifier', 'Age']], 
                     left_on='Person ID', 
                     right_on='identifier', 
                     how='left')

# rename 
pair_with_age.rename(columns={'Age': 'Person Age'}, inplace=True)

# add Age column for Friend ID
pair_with_age = pair_with_age.merge(users[['identifier', 'Age']], 
                  left_on='Friend ID', 
                  right_on='identifier', 
                  how='left')

# rename
pair_with_age.rename(columns={'Age': 'Friend Age'}, inplace=True)

# drop duplicates
pair_with_age.drop(columns=['identifier_x', 'identifier_y'], inplace=True)

pair_with_age = pair_with_age[['Person ID', 'Person Name', 'Person Age', 'Friend ID', 'Friend Name', 'Friend Age']]

pair_with_age

Unnamed: 0,Person ID,Person Name,Person Age,Friend ID,Friend Name,Friend Age
0,15394992,גוסטבו שנייברג,,207877705,אור שנייברג,
1,200466076,גיא יוחי הרפז,37.0,229318381,מיקה יוחי הרפז,5.0
2,217743160,שניר גרונוולד,,208068189,איל שרמן,
3,032531709,דן ורנר,39.0,344639463,אביתר ורנר,
4,60464765,חיים טובול,43.0,333764967,נועם טובול,
...,...,...,...,...,...,...
19963,214599706,ליה שלף,,207727983,תום שלף,27.0
19964,043127976,מיכל קרני,44.0,219147626,שלו קרני,
19965,043127976,מיכל קרני,44.0,033039603,איתמר קרני,
19966,219147626,שלו קרני,,033039603,איתמר קרני,


## Drop rows based on age conditions:

In [10]:
# drop the rows when ages are greater than 70 or are equals to each other
pair_cond = pair_with_age[
    ~((pair_with_age['Person Age'] == pair_with_age['Friend Age']) |
    ((pair_with_age['Person Age'] >= 70) | (pair_with_age['Friend Age'] >= 70)))
]

# minimum parent age
threshold = 24 

# drop rows where the age difference is less than the threshold - they cannot be a parent-child pair
pair_cond = pair_cond[~(abs(pair_cond['Person Age'] - pair_cond['Friend Age'] < threshold))]


# drop rows when when both ages are greater than 18
pair_cond = pair_cond[~((pair_cond['Person Age'] > 18) & (pair_cond['Friend Age'] > 18))]

# drop occurance of nan values in ages
pair_cond = pair_cond[~((pair_cond['Person Age'].isna()) | (pair_cond['Friend Age'].isna()))]

pair_cond

Unnamed: 0,Person ID,Person Name,Person Age,Friend ID,Friend Name,Friend Age
1,200466076,גיא יוחי הרפז,37.0,229318381,מיקה יוחי הרפז,5.0
26,043545409,תומר אדר,43.0,340053180,בן מוסיקי,10.0
76,034706317,אליעד פיבקו,40.0,226587384,אבישי פיבקו מתנס בקעה,7.0
80,200290393,עומר רזין,37.0,226988061,ארבל רזין,5.0
93,033295304,ללי קושניר,53.0,219842168,גבריאלה קושניר,14.0
...,...,...,...,...,...,...
19812,032459943,עודד ברויאר,50.0,217631167,אלעד ברויאר,17.0
19813,032459943,עודד ברויאר,50.0,219843042,אריאל ברויאר,14.0
19886,040194078,שלומי יהושע,45.0,218681229,יהונתן יהושע,16.0
19925,034770040,איתי שמיר,39.0,341656585,איילה שמיר,9.0


## Drop rows based on last name:

In [11]:
# add Age column for Person ID
final = pair_cond.merge(users[['identifier', 'lastname']], 
                     left_on='Person ID', 
                     right_on='identifier', 
                     how='left')

# rename 
final.rename(columns={'lastname': 'Person Last Name'}, inplace=True)

# add Age column for Friend ID
final = final.merge(users[['identifier', 'lastname']], 
                  left_on='Friend ID', 
                  right_on='identifier', 
                  how='left')

# rename
final.rename(columns={'lastname': 'Friend Last Name'}, inplace=True)

# drop rows when the last name is not equal
final = final[~(final['Person Last Name'] != final['Friend Last Name'])]

parent_child = final[['Person ID', 'Person Name', 'Person Age', 'Friend ID', 'Friend Name', 'Friend Age']]

# drop rows when the name contains only one word
# str.strip() - removes leading and trailing whitespace from the names
# str.count(r'\s') - counts the number of spaces in the name. If the count is 0 - the name contains only one word
parent_child = parent_child[
    (parent_child['Person Name'].str.strip().str.count(r'\s') > 0) &  
    (parent_child['Friend Name'].str.strip().str.count(r'\s') > 0)
]

parent_child

Unnamed: 0,Person ID,Person Name,Person Age,Friend ID,Friend Name,Friend Age
0,200466076,גיא יוחי הרפז,37.0,229318381,מיקה יוחי הרפז,5.0
2,034706317,אליעד פיבקו,40.0,226587384,אבישי פיבקו מתנס בקעה,7.0
3,200290393,עומר רזין,37.0,226988061,ארבל רזין,5.0
4,033295304,ללי קושניר,53.0,219842168,גבריאלה קושניר,14.0
5,027134501,איה בן פורת,51.0,222652455,גלי בן פורת,12.0
...,...,...,...,...,...,...
994,040013435,שלמה סעדון,45.0,040578023,מיטל סעדון,1.0
995,040698748,ניר זריפי,45.0,221678881,עמית זריפי,11.0
998,040194078,שלומי יהושע,45.0,218681229,יהונתן יהושע,16.0
999,034770040,איתי שמיר,39.0,341656585,איילה שמיר,9.0


In [12]:
parent_child.to_excel('parent_child_candidates.xlsx')