In [6]:
import pandas as pd
import json

In [7]:
df_w1 = pd.read_csv("Raw/W1_AllAppsWide_2024-11-13-4.csv")
df_w1 = df_w1.query('`session.code` == "3m87qmko" | `session.code` == "wt9ndgb1"')

df_w2 = pd.read_csv("Raw/W2_all_apps_wide_2024-12-10-2.csv")
df_w2 = df_w2.query('`session.code` == "2n8orvug"')
df_w2["participant.label"][244] = "nan" # fixing the missing label

df_w3 = pd.read_csv("Raw/W3_all_apps_wide_2025-01-29.csv")
df_w3 = df_w3.query('`session.code` == "7uy8unkt"')
df_w3["participant.label"][46] = "nan" # fixing the missing label

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_w2["participant.label"][244] = "nan" # fixing the missing label
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to updat

In [8]:
def prep_function_w1(df):
    # Drop rows where 'network_app.1.player.participantcode' is NaN
    df = df.dropna(subset=['network_app.1.player.participantcode'])
    # Lowercase the 'network_app.1.player.participantcode' column
    df['network_app.1.player.participantcode'] = df['network_app.1.player.participantcode'].str.lower()


    # Identify duplicates based on 'network_app.1.player.participantcode'
    duplicates = df[df.duplicated(subset='network_app.1.player.participantcode', keep=False)]
    # Filter duplicates where 'participant._index_in_pages' > 15
    filtered_duplicates = duplicates[duplicates['participant._index_in_pages'] > 15]
    # Remove the duplicates from the original dataframe (only those where 'participantcode' is duplicated)
    df = df[~df['network_app.1.player.participantcode'].isin(duplicates['network_app.1.player.participantcode'])]
    # Concatenate the filtered duplicates back into the dataframe
    df = pd.concat([df, filtered_duplicates], axis=0, ignore_index=True)


    # Lowercase person columns ('network_app.1.player.person_1' to 'network_app.1.player.person_30')
    person_columns = [f'network_app.1.player.person_{i}' for i in range(1, 31)]
    df[person_columns] = df[person_columns].apply(lambda x: x.str.lower())
    # Replace missing values with 'x' in all person columns
    df[person_columns] = df[person_columns].fillna("x")

    for _, row in df.iterrows():
        # Other node and edge processing...
        # Check if participantcode and person_1 match
        if row["network_app.1.player.participantcode"] == row["network_app.1.player.person_1"]:
            df.loc[_, "network_app.1.player.linksrechts_self"] = row["network_app.1.player.linksrechts_1"]

    df['participant.label'] = df['network_app.1.player.participantcode']

    return df


In [9]:
def prep_function_w2(df):
    df = df.dropna(subset=['participant.label'])
    
    for i in range(1, 31):
        df[f'network_app.1.player.person_{i}'] = df[f'network_app.1.player.person_{i}'].str.lower()

    df['network_app.1.player.person_3'] = df['network_app.1.player.person_3'].str.replace("ny4", "ny3", case=False, regex=False) #added because of commentary

    # Replace missing values with "x" in columns: 'network_app.1.player.person_1', 'network_app.1.player.person_2' and 28 other columns
    df = df.fillna({'network_app.1.player.person_1': "x", 'network_app.1.player.person_2': "x", 'network_app.1.player.person_3': "x", 'network_app.1.player.person_4': "x", 'network_app.1.player.person_5': "x", 'network_app.1.player.person_6': "x", 'network_app.1.player.person_7': "x", 'network_app.1.player.person_8': "x", 'network_app.1.player.person_9': "x", 'network_app.1.player.person_10': "x", 'network_app.1.player.person_11': "x", 'network_app.1.player.person_12': "x", 'network_app.1.player.person_13': "x", 'network_app.1.player.person_14': "x", 'network_app.1.player.person_15': "x", 'network_app.1.player.person_16': "x", 'network_app.1.player.person_17': "x", 'network_app.1.player.person_18': "x", 'network_app.1.player.person_19': "x", 'network_app.1.player.person_20': "x", 'network_app.1.player.person_21': "x", 'network_app.1.player.person_22': "x", 'network_app.1.player.person_23': "x", 'network_app.1.player.person_24': "x", 'network_app.1.player.person_25': "x", 'network_app.1.player.person_26': "x", 'network_app.1.player.person_27': "x", 'network_app.1.player.person_28': "x", 'network_app.1.player.person_29': "x", 'network_app.1.player.person_30': "x"})

    return df

In [10]:
def prep_function_w3(df):
    df = df.dropna(subset=['participant.label'])
    
    for i in range(1, 31):
        df[f'network_app.1.player.person_{i}'] = df[f'network_app.1.player.person_{i}'].str.lower()

    df['network_app.1.player.person_3'] = df['network_app.1.player.person_3'].str.replace("ny4", "ny3", case=False, regex=False) #added because of commentary

    # Replace missing values with "x" in columns: 'network_app.1.player.person_1', 'network_app.1.player.person_2' and 28 other columns
    df = df.fillna({'network_app.1.player.person_1': "x", 'network_app.1.player.person_2': "x", 'network_app.1.player.person_3': "x", 'network_app.1.player.person_4': "x", 'network_app.1.player.person_5': "x", 'network_app.1.player.person_6': "x", 'network_app.1.player.person_7': "x", 'network_app.1.player.person_8': "x", 'network_app.1.player.person_9': "x", 'network_app.1.player.person_10': "x", 'network_app.1.player.person_11': "x", 'network_app.1.player.person_12': "x", 'network_app.1.player.person_13': "x", 'network_app.1.player.person_14': "x", 'network_app.1.player.person_15': "x", 'network_app.1.player.person_16': "x", 'network_app.1.player.person_17': "x", 'network_app.1.player.person_18': "x", 'network_app.1.player.person_19': "x", 'network_app.1.player.person_20': "x", 'network_app.1.player.person_21': "x", 'network_app.1.player.person_22': "x", 'network_app.1.player.person_23': "x", 'network_app.1.player.person_24': "x", 'network_app.1.player.person_25': "x", 'network_app.1.player.person_26': "x", 'network_app.1.player.person_27': "x", 'network_app.1.player.person_28': "x", 'network_app.1.player.person_29': "x", 'network_app.1.player.person_30': "x"})

    return df

In [11]:
def shiny_dataframe(df):
    """
    Preprocesses the input DataFrame by performing column drops and replacing missing values.

    Args:
        df (pd.DataFrame): The input DataFrame to preprocess.

    Returns:
        pd.DataFrame: The preprocessed DataFrame.
    """
    # Columns to drop
    drop_columns = [
        'participant._is_bot', 'participant._index_in_pages', 'conjoint_app.1.player.participant_label', 'conjoint_app.1.player.id_in_group',
        'participant._current_app_name', 'participant._current_page_name', 'conjoint_app.1.player.payoff', 'demographic_app.1.player.id_in_group',
        'participant.visited', 'participant.mturk_worker_id', 'participant.mturk_assignment_id',
        'participant.payoff', 'session.label', 'session.mturk_HITId', 'session.mturk_HITGroupId',
        'session.comment', 'session.is_demo', 'session.config.real_world_currency_per_point',
        'session.config.participation_fee', 'conjoint_app.1.player.language',
        'conjoint_app.1.group.id_in_subsession', 'conjoint_app.1.subsession.round_number',
        'demographic_app.1.player.role', 'demographic_app.1.player.payoff',
        'network_app.1.player.role', 'network_app.1.player.payoff',
        'demographic_app.1.group.id_in_subsession', 'demographic_app.1.subsession.round_number',
        'conjoint_app.1.player.role', 'session.config.name',
        'network_app.1.group.id_in_subsession', 'network_app.1.subsession.round_number',
        'political_app.1.player.role', 'political_app.1.player.id_in_group',
        'political_app.1.subsession.round_number', 'political_app.1.group.id_in_subsession',
        'end_app.1.player.role', 'end_app.1.player.payoff', 'end_app.1.player.group_assignment',
        'end_app.1.player.time_endpage', 'end_app.1.group.id_in_subsession',
        'end_app.1.subsession.round_number', 'political_app.1.player.payoff'
    ]

    # Columns with missing values to replace
    fill_values_zero = {
        'demographic_app.1.player.rent': 0,
        'demographic_app.1.player.income': 0,
        'participant.label': 0,
        'political_app.1.player.sunday_poll': 0,
        'political_app.1.player.sunday_party_vote': 0,
        'political_app.1.player.sunday_not_eligible': 0,
        'political_app.1.player.noteligible_sunday_party_vote': 0,
        'demographic_app.1.player.social_networks_1': 0,
        'demographic_app.1.player.social_networks_2': 0,
        'demographic_app.1.player.social_networks_3': 0,
        'demographic_app.1.player.social_networks_4': 0,
        'demographic_app.1.player.social_networks_5': 0,
        'demographic_app.1.player.social_networks_6': 0,
        'demographic_app.1.player.social_networks_7': 0,
        'demographic_app.1.player.social_networks_8': 0,
        'demographic_app.1.player.social_networks_9': 0,
        'demographic_app.1.player.social_networks_10': 0,
        'demographic_app.1.player.participation_demonstration_1': 0,
        'demographic_app.1.player.petition_signatory_1': 0,
        'conjoint_app.1.player.participant_label': 0,
        'network_app.1.player.participantcode': 0,
        'political_app.1.player.reason_no_vote': 0
    }

    fill_values_string_zero = {
        'demographic_app.1.player.social_networks_11': "0",
        'demographic_app.1.player.study_program_other': "0"
    }

    fill_values_negative = {
        'political_app.1.player.scalo_cdu': -999,
        'political_app.1.player.scalo_csu': -999,
        'political_app.1.player.scalo_spd': -999,
        'political_app.1.player.scalo_gruene': -999,
        'political_app.1.player.scalo_fdp': -999,
        'political_app.1.player.scalo_afd': -999,
        'political_app.1.player.scalo_linke': -999,
        'political_app.1.player.scalo_bsw': -999,
        'demographic_app.1.player.edu_mother': -999,
        'demographic_app.1.player.study_program': -999
    }

    # Drop specified columns
    df = df.drop(columns=[col for col in drop_columns if col in df.columns])

    # Replace missing values with 0
    df = df.fillna({col: val for col, val in fill_values_zero.items() if col in df.columns})

    # Replace missing values with "0"
    df = df.fillna({col: val for col, val in fill_values_string_zero.items() if col in df.columns})

    # Replace missing values with -999
    df = df.fillna({col: val for col, val in fill_values_negative.items() if col in df.columns})

    # Inconsistency in no answer variable
    df = df.replace(-888, -999)

    # List of prefixes to keep
    prefixes = ['conjoint', 
                'demographic', 
                'political', 
                'vignette', 
                'participant.label', 
                'network_app.1.player.participantcode',
                'network_app.1.player.linksrechts_self',
                'end_app.1.player.catdog',
                'end_app.1.player.rnumber']

    # Filter columns by prefixes
    df = df.loc[:, df.columns.str.startswith(tuple(prefixes))]

    return df

In [12]:
df_w1 = prep_function_w1(df_w1)
df_w2 = prep_function_w2(df_w2)
df_w3 = prep_function_w3(df_w3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['network_app.1.player.participantcode'] = df['network_app.1.player.participantcode'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'network_app.1.player.person_{i}'] = df[f'network_app.1.player.person_{i}'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['network_a

In [13]:
df_filtered_w1 = shiny_dataframe(df_w1)
df_filtered_w2 = shiny_dataframe(df_w2)
df_filtered_w3 = shiny_dataframe(df_w3)

# Export dataframes as CSV
df_filtered_w1.to_csv('Cooked/df_w1_prepared.csv', index=False)
df_filtered_w2.to_csv('Cooked/df_w2_prepared.csv', index=False)
df_filtered_w3.to_csv('Cooked/df_w3_prepared.csv', index=False)

  df = df.fillna({col: val for col, val in fill_values_zero.items() if col in df.columns})


***
## Network Dictionaries
`dict_nodes = {participant_label : {participant_attributes : values, network_app.1.player.person_1...30 : {Friends : True, Politics : True}}}`

Note: We loose a lot of observation because of wrong entries for participantcodes of aquaintances 

In [14]:
# Get all unique participants across all DataFrames
all_participants = set(df_w1['participant.label']) | set(df_w2['participant.label']) | set(df_w3['participant.label'])

# Get participants present in all three DataFrames (intersection)
complete = set(df_w1['participant.label']) & set(df_w2['participant.label']) & set(df_w3['participant.label'])

# Find the participants that are in at least one DataFrame but NOT in all three
difference = all_participants - complete

# Filter the original DataFrames to get only rows that contain these "difference" participants
df_w1_diff = df_w1[df_w1['participant.label'].isin(difference)]
df_w2_diff = df_w2[df_w2['participant.label'].isin(difference)]
df_w3_diff = df_w3[df_w3['participant.label'].isin(difference)]

# Filter the DataFrames to keep only the matching rows
df_w1_filtered = df_w1[df_w1['participant.label'].isin(complete)]
df_w2_filtered = df_w2[df_w2['participant.label'].isin(complete)]
df_w3_filtered = df_w3[df_w3['participant.label'].isin(complete)]

# Combine them into a single DataFrame for easier viewing
df_diff = pd.concat([df_w1_diff, df_w2_diff, df_w3_diff])


In [23]:
df_w1_diff

Unnamed: 0,participant.id_in_session,participant.code,participant.label,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,...,end_app.1.player.role,end_app.1.player.payoff,end_app.1.player.group_assignment,end_app.1.player.rnumber,end_app.1.player.rnumbercheck,end_app.1.player.feedback,end_app.1.player.time_firstendpage,end_app.1.player.time_endpage,end_app.1.group.id_in_subsession,end_app.1.subsession.round_number
0,2,oqjbypyi,wer,0,17,17,end_app,End,2024-11-12 22:30:23.985745,1,...,,0.0,1,970176,970176.0,,2024-11-12T22:31:58.745Z,-999,1,1
1,3,zyksik0c,ttz,0,11,17,political_app,ScaloParty,2024-11-12 22:32:06.922914,1,...,,0.0,2,708967,,,-999,-999,1,1
13,17,nzm2o4gr,4x3,0,17,17,end_app,End,2024-11-13 09:23:17.934250,1,...,,0.0,0,526679,526679.0,👍🏼,2024-11-13T09:35:22.130Z,-999,1,1
15,19,74it0yf1,c9f,0,17,17,end_app,End,2024-11-13 09:23:18.746927,1,...,,0.0,0,201820,201820.0,Werden die anonymen Ergebnisse der Umfrage ver...,2024-11-13T09:33:36.817Z,-999,1,1
23,29,3h1zyy8o,wfr,0,17,17,end_app,End,2024-11-13 09:23:23.098248,1,...,,0.0,0,569647,569647.0,,2024-11-13T09:38:01.174Z,-999,1,1
37,47,pgudpupc,fqt,0,17,17,end_app,End,2024-11-13 09:23:29.383273,1,...,,0.0,0,483538,483538.0,,2024-11-13T09:33:03.115Z,-999,1,1
38,48,ij7i31iw,hof,0,17,17,end_app,End,2024-11-13 09:23:29.935478,1,...,,0.0,1,749322,749322.0,,2024-11-13T09:46:29.460Z,-999,1,1
43,54,p8dnx8zf,hvs,0,17,17,end_app,End,2024-11-13 09:23:32.160951,1,...,,0.0,1,640600,640600.0,,2024-11-13T09:35:58.855Z,-999,1,1
55,69,ounzrlo2,hmu,0,17,17,end_app,End,2024-11-13 09:23:41.706775,1,...,,0.0,2,884440,884440.0,Wäre super wenn man zurück auf die voherige Fr...,2024-11-13T09:35:19.405Z,-999,1,1
63,79,2y5r7v5a,rbb,0,17,17,end_app,End,2024-11-13 09:23:43.046211,1,...,,0.0,0,502392,502392.0,,2024-11-13T09:37:21.488Z,-999,1,1


An interesting idea would be two create new variables from multiple columns in the survey. 
- Popularity of political people 
- Socialmedia apps 
- 

In [10]:
df_w3_filtered

Unnamed: 0,participant.id_in_session,participant.code,participant.label,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,...,end_app.1.player.payoff,end_app.1.player.group_assignment,end_app.1.player.rnumber,end_app.1.player.rnumbercheck,end_app.1.player.feedback,end_app.1.player.catdog,end_app.1.player.time_firstendpage,end_app.1.player.time_endpage,end_app.1.group.id_in_subsession,end_app.1.subsession.round_number
0,1,491x3me2,xtk,0,22,22,end_app,End,2025-01-28 14:07:06.224129,1,...,0.0,1,743237,743237.0,,2.0,2025-01-28T14:25:56.707Z,-999,1,1
1,2,tran0vrr,pgx,0,22,22,end_app,End,2025-01-28 14:08:45.869348,1,...,0.0,0,589771,589771.0,,2.0,2025-01-28T14:19:42.147Z,-999,1,1
2,3,th3kdh78,amu,0,22,22,end_app,End,2025-01-28 14:10:11.983747,1,...,0.0,1,759942,759942.0,,2.0,2025-01-28T14:15:19.185Z,-999,1,1
3,4,c4fci2i2,qx3,0,22,22,end_app,End,2025-01-28 14:10:24.331086,1,...,0.0,2,439273,439273.0,,2.0,2025-01-28T14:29:25.046Z,-999,1,1
4,5,kvxfy3uf,3eh,0,22,22,end_app,End,2025-01-28 14:11:35.438489,1,...,0.0,1,310606,310606.0,,1.0,2025-01-28T14:34:28.265Z,-999,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,165,umihyppx,mbf,0,22,22,end_app,End,2025-01-28 14:48:53.206209,1,...,0.0,1,698531,698531.0,,,2025-01-28T14:57:36.300Z,-999,1,1
165,166,w4iitsmr,bey,0,22,22,end_app,End,2025-01-28 15:20:02.834917,1,...,0.0,2,487844,487844.0,,2.0,2025-01-28T15:34:20.155Z,-999,1,1
166,167,wy3ddqlt,3hc,0,22,22,end_app,End,2025-01-28 15:23:36.706942,1,...,0.0,1,242428,242428.0,,2.0,2025-01-28T15:30:58.606Z,-999,1,1
167,168,t4fwhzn3,e4u,0,22,22,end_app,End,2025-01-28 16:39:08.328337,1,...,0.0,2,133647,133647.0,,2.0,2025-01-28T16:49:26.525Z,-999,1,1


In [24]:
def replace_missing_with_default(value, default_value=-999):
    if pd.isna(value) or value in ["", "NA", "-999", None, -888]:
        return default_value
    return value

def data_extraction(df, unique_id, wave):
    # Initialize nodes with empty dictionaries
    dict_nodes = {label: {} for label in df[unique_id]}
    dict_edges = {label: {} for label in df[unique_id]}
    dropped = 0
    list_dropped = []
    edge_info = {
        "friend": 0,
        "value": 0,
        "politics": 0,
        "study": 0,
        "council": 0,
        "leftright": 0,
        "sentiment": 0,
        "aquaintance": 0
    }

    # Iterate through rows to populate node and edge attributes
    for _, row in df.iterrows():
        # Add node-specific attributes
        label = row[unique_id]
        if label in dict_nodes:
            dict_nodes[label]["leftrightself"] = replace_missing_with_default(row["network_app.1.player.linksrechts_self"])
            dict_nodes[label]["gender"] = replace_missing_with_default(row["demographic_app.1.player.gender"])
            dict_nodes[label]["income"] = replace_missing_with_default(row["demographic_app.1.player.income"])
            dict_nodes[label]["rent"] = replace_missing_with_default(row["demographic_app.1.player.rent"])
            dict_nodes[label]["grade"] = replace_missing_with_default(row["demographic_app.1.player.grade"])
            dict_nodes[label]["partyvote"] = replace_missing_with_default(row['political_app.1.player.sunday_party_vote'])
            dict_nodes[label]["age"] = replace_missing_with_default(row['demographic_app.1.player.age'])
            #dict_nodes[label]["ID"] = replace_missing_with_default(row['participant.label'])
            dict_keys = [
                ("catdog", "end_app.1.player.catdog"),
                ("ocu_father", "demographic_app.1.player.ocu_father"),
                ("ocu_mother", "demographic_app.1.player.ocu_mother"),
                ("edu_father", "demographic_app.1.player.edu_father"),
                ("edu_mother", "demographic_app.1.player.edu_mother"),
                ("semester_of_study", "demographic_app.1.player.semester_of_study"),
                ("fresherscamp_student", "demographic_app.1.player.fresherscamp_student"),
                ("freshersweek_student", "demographic_app.1.player.freshersweek_student"),
                ("study_program", "demographic_app.1.player.study_program"),
                ("politics_question_one", "political_app.1.player.politics_question_one"),
                ("politics_question_two", "political_app.1.player.politics_question_two"),
                ("politics_question_three", "political_app.1.player.politics_question_three"),
                ("politics_question_four", "political_app.1.player.politics_question_four"),
                ("politics_question_five", "political_app.1.player.politics_question_five"),
                ("politics_question_six", "political_app.1.player.politics_question_six"),
                ("politics_question_seven", "political_app.1.player.politics_question_seven"),           
            ]
            for key, column in dict_keys:
                try:
                    dict_nodes[label][key] = replace_missing_with_default(row[column])
                except KeyError:
                    pass  # Skip only the missing keys   
            # add more attributes here

        # Process connections and add edge-specific attributes
        for i in range(1, 30):
            target_person = row[f"network_app.1.player.person_{i}"]
            if target_person != "x" and target_person != label:
                if target_person in dict_nodes:
                    dict_edges[label][target_person] = {
                        "aquaintance": True,
                        "friend": replace_missing_with_default(row[f"network_app.1.player.friend_{i}"]) == 1,
                        "value": replace_missing_with_default(row[f"network_app.1.player.value_{i}"]) == 1,
                        "politics": replace_missing_with_default(row[f"network_app.1.player.politics_{i}"]) == 1,
                        "study": replace_missing_with_default(row[f"network_app.1.player.study_{i}"]) == 1,
                        "council": replace_missing_with_default(row[f"network_app.1.player.council_{i}"]) == 1,
                        "leftright": replace_missing_with_default(row[f"network_app.1.player.linksrechts_{i}"])
                    }
                    try:
                        dict_edges[label][target_person]["sentiment"] = replace_missing_with_default(row[f"network_app.1.player.sentiment_{i}"])
                    except KeyError:
                        pass

                    # Update edge information counts
                    #edge_info["aquaintance"] += dict_edges[label][target_person]["aquaintance"]
                    edge_info["friend"] += dict_edges[label][target_person]["friend"]
                    edge_info["value"] += dict_edges[label][target_person]["value"]
                    edge_info["politics"] += dict_edges[label][target_person]["politics"]
                    edge_info["study"] += dict_edges[label][target_person]["study"]
                    edge_info["council"] += dict_edges[label][target_person]["council"]
                    edge_info["leftright"] += 1  # Assuming each edge has a leftright value
                    if "sentiment" in dict_edges[label][target_person]:
                        edge_info["sentiment"] += 1
                else:
                    # Log dropped connections
                    dropped += 1
                    list_dropped.append(target_person)


    # Save nodes and edges to JSON files
    with open(f'NA/nodes_{wave}.json', 'w') as json_file:
        json.dump(dict_nodes, json_file, indent=4)

    with open(f'NA/edges_{wave}.json', 'w') as json_file:
        json.dump(dict_edges, json_file, indent=4)

    # Print edge information summary
    print(f"Edge information for wave {wave}:")
    for key, value in edge_info.items():
        print(f"{key}: {value}")

    sum=0
    for key,value in dict_edges.items():
        #print(key)
        sum += len(value)
    print("Amount of Edges dropped:", len(list_dropped))

    return dict_nodes, dict_edges, list_dropped


In [25]:
dict_nodes_w1, dict_edges_w1, list_dropped_w1 = data_extraction(df_w1_filtered, 'network_app.1.player.participantcode', "W1")
dict_nodes_w2, dict_edges_w2, list_dropped_w2 = data_extraction(df_w2_filtered, "participant.label", "W2")
dict_nodes_w3, dict_edges_w3, list_dropped_w3 = data_extraction(df_w3_filtered, "participant.label", "W3")

Edge information for wave W1:
friend: 338
value: 253
politics: 224
study: 254
council: 324
leftright: 590
sentiment: 0
aquaintance: 0
Amount of Edges dropped: 83
Edge information for wave W2:
friend: 361
value: 234
politics: 207
study: 184
council: 316
leftright: 581
sentiment: 581
aquaintance: 0
Amount of Edges dropped: 92
Edge information for wave W3:
friend: 352
value: 262
politics: 216
study: 193
council: 336
leftright: 593
sentiment: 593
aquaintance: 0
Amount of Edges dropped: 57


In [89]:
df_w3["end_app.1.player.catdog"]

0      2.0
1      2.0
2      2.0
3      2.0
4      1.0
      ... 
165    2.0
166    2.0
167    2.0
168    2.0
169    4.0
Name: end_app.1.player.catdog, Length: 170, dtype: float64