In [43]:
import pandas as pd
import json

In [44]:
df_w1 = pd.read_csv("Raw/W1_AllAppsWide_2024-11-13-4.csv")
df_w1 = df_w1.query('`session.code` == "3m87qmko" | `session.code` == "wt9ndgb1"')

In [45]:
df_w2 = pd.read_csv("Raw/W2_all_apps_wide_2024-12-10-2.csv")
df_w2 = df_w2.query('`session.code` == "2n8orvug"')
df_w2["participant.label"][244] = "nan" # fixing the missing label

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_w2["participant.label"][244] = "nan" # fixing the missing label


- drop columns that are not needed 
- drop missing values for unique-ID column 
- check for duplicates in unique-ID and decide what to do with them 
    - keep them based on finishing survey, so last page visited
- prepare networks and fill na with x
- drop columns based on prefixes, or keep columns based on prefixes of apps

In [46]:
def prep_function_w1(df):
    # Drop rows where 'network_app.1.player.participantcode' is NaN
    df = df.dropna(subset=['network_app.1.player.participantcode'])
    # Lowercase the 'network_app.1.player.participantcode' column
    df['network_app.1.player.participantcode'] = df['network_app.1.player.participantcode'].str.lower()


    # Identify duplicates based on 'network_app.1.player.participantcode'
    duplicates = df[df.duplicated(subset='network_app.1.player.participantcode', keep=False)]
    # Filter duplicates where 'participant._index_in_pages' > 15
    filtered_duplicates = duplicates[duplicates['participant._index_in_pages'] > 15]
    # Remove the duplicates from the original dataframe (only those where 'participantcode' is duplicated)
    df = df[~df['network_app.1.player.participantcode'].isin(duplicates['network_app.1.player.participantcode'])]
    # Concatenate the filtered duplicates back into the dataframe
    df = pd.concat([df, filtered_duplicates], axis=0, ignore_index=True)


    # Lowercase person columns ('network_app.1.player.person_1' to 'network_app.1.player.person_30')
    person_columns = [f'network_app.1.player.person_{i}' for i in range(1, 31)]
    df[person_columns] = df[person_columns].apply(lambda x: x.str.lower())
    # Replace missing values with 'x' in all person columns
    df[person_columns] = df[person_columns].fillna("x")

    return df


In [47]:
def prep_function_w2(df):
    df = df.dropna(subset=['participant.label'])
    
    for i in range(1, 31):
        df[f'network_app.1.player.person_{i}'] = df[f'network_app.1.player.person_{i}'].str.lower()

    # Replace missing values with "x" in columns: 'network_app.1.player.person_1', 'network_app.1.player.person_2' and 28 other columns
    df = df.fillna({'network_app.1.player.person_1': "x", 'network_app.1.player.person_2': "x", 'network_app.1.player.person_3': "x", 'network_app.1.player.person_4': "x", 'network_app.1.player.person_5': "x", 'network_app.1.player.person_6': "x", 'network_app.1.player.person_7': "x", 'network_app.1.player.person_8': "x", 'network_app.1.player.person_9': "x", 'network_app.1.player.person_10': "x", 'network_app.1.player.person_11': "x", 'network_app.1.player.person_12': "x", 'network_app.1.player.person_13': "x", 'network_app.1.player.person_14': "x", 'network_app.1.player.person_15': "x", 'network_app.1.player.person_16': "x", 'network_app.1.player.person_17': "x", 'network_app.1.player.person_18': "x", 'network_app.1.player.person_19': "x", 'network_app.1.player.person_20': "x", 'network_app.1.player.person_21': "x", 'network_app.1.player.person_22': "x", 'network_app.1.player.person_23': "x", 'network_app.1.player.person_24': "x", 'network_app.1.player.person_25': "x", 'network_app.1.player.person_26': "x", 'network_app.1.player.person_27': "x", 'network_app.1.player.person_28': "x", 'network_app.1.player.person_29': "x", 'network_app.1.player.person_30': "x"})

    return df

In [48]:
df_w1 = prep_function_w1(df_w1)
df_w2 = prep_function_w2(df_w2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['network_app.1.player.participantcode'] = df['network_app.1.player.participantcode'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'network_app.1.player.person_{i}'] = df[f'network_app.1.player.person_{i}'].str.lower()


In [49]:
# List of prefixes to keep
prefixes = ['conjoint', 'demographic', 'political', 'vignette', 'participant.label', 'network_app.1.player.participantcode']

# Filter columns by prefixes
df_filtered_w2 = df_w2.loc[:, df_w2.columns.str.startswith(tuple(prefixes))]
df_filtered_w1 = df_w1.loc[:, df_w1.columns.str.startswith(tuple(prefixes))]

In [50]:
# Export dataframes as CSV
df_filtered_w1.to_csv('df_w1_prepared.csv', index=False)
df_filtered_w2.to_csv('df_w2_prepared.csv', index=False)

In [51]:
a = set(df_w1['network_app.1.player.participantcode']) 
b = set(df_w2['participant.label'])

c = a.intersection(b)

In [52]:
print(len(c),len(a),len(b))

163 180 180


Teilnehmer: w1:180, w2:180

Schnittmenge Wellen: 163 (-17)

***
## Dictionaries
`dict_nodes = {participant_label : {participant_attributes : values, network_app.1.player.person_1...30 : {Friends : True, Politics : True}}}`

Note: We loose a lot of observation because of wrong entries for participantcodes of aquaintances 

In [71]:
import pandas as pd
import json

def replace_missing_with_default(value, default_value=-999):
    """
    Replace missing values (None, NaN, or empty) with a default value.
    """
    if pd.isna(value) or value == "" or value is None:
        return default_value
    return value

def data_extraction(df, unique_id, wave):
    # Initialize nodes with empty dictionaries
    dict_nodes = {label: {} for label in df[unique_id]}
    dict_edges = {label: {} for label in df[unique_id]}
    dropped = 0
    list_dropped = []
    edge_info = {
        "friend": 0,
        "value": 0,
        "politics": 0,
        "study": 0,
        "council": 0,
        "leftright": 0,
        "sentiment": 0
    }

    # Iterate through rows to populate node and edge attributes
    for _, row in df.iterrows():
        # Add node-specific attributes
        label = row[unique_id]
        if label in dict_nodes:
            dict_nodes[label]["leftrightself"] = replace_missing_with_default(row["network_app.1.player.linksrechts_self"])
            dict_nodes[label]["gender"] = replace_missing_with_default(row["demographic_app.1.player.gender"])
            dict_nodes[label]["income"] = replace_missing_with_default(row["demographic_app.1.player.income"])
            dict_nodes[label]["rent"] = replace_missing_with_default(row["demographic_app.1.player.rent"])
            dict_nodes[label]["grade"] = replace_missing_with_default(row["demographic_app.1.player.grade"])
            dict_nodes[label]["partyvote"] = replace_missing_with_default(row['political_app.1.player.sunday_party_vote'])
            dict_nodes[label]["age"] = replace_missing_with_default(row['demographic_app.1.player.age'])
            dict_nodes[label]["ocu_father"] = replace_missing_with_default(row['demographic_app.1.player.ocu_father'])
            dict_nodes[label]["ocu_mother"] = replace_missing_with_default(row['demographic_app.1.player.ocu_mother'])
            dict_nodes[label]["edu_father"] = replace_missing_with_default(row['demographic_app.1.player.edu_father'])
            dict_nodes[label]["edz_mother"] = replace_missing_with_default(row['demographic_app.1.player.edu_mother'])        
            # add more attributes here

        # Process connections and add edge-specific attributes
        for i in range(1, 30):
            target_person = row[f"network_app.1.player.person_{i}"]
            if target_person != "x" and target_person != label:
                if target_person in dict_nodes:
                    dict_edges[label][target_person] = {
                        "friend": replace_missing_with_default(row[f"network_app.1.player.friend_{i}"]) == 1,
                        "value": replace_missing_with_default(row[f"network_app.1.player.value_{i}"]) == 1,
                        "politics": replace_missing_with_default(row[f"network_app.1.player.politics_{i}"]) == 1,
                        "study": replace_missing_with_default(row[f"network_app.1.player.study_{i}"]) == 1,
                        "council": replace_missing_with_default(row[f"network_app.1.player.council_{i}"]) == 1,
                        "leftright": replace_missing_with_default(row[f"network_app.1.player.linksrechts_{i}"])
                    }
                    try:
                        dict_edges[label][target_person]["sentiment"] = replace_missing_with_default(row[f"network_app.1.player.sentiment_{i}"])
                    except KeyError:
                        pass

                    # Update edge information counts
                    edge_info["friend"] += dict_edges[label][target_person]["friend"]
                    edge_info["value"] += dict_edges[label][target_person]["value"]
                    edge_info["politics"] += dict_edges[label][target_person]["politics"]
                    edge_info["study"] += dict_edges[label][target_person]["study"]
                    edge_info["council"] += dict_edges[label][target_person]["council"]
                    edge_info["leftright"] += 1  # Assuming each edge has a leftright value
                    if "sentiment" in dict_edges[label][target_person]:
                        edge_info["sentiment"] += 1
                else:
                    # Log dropped connections
                    dropped += 1
                    list_dropped.append(target_person)

    # Save nodes and edges to JSON files
    with open(f'NA/nodes_{wave}.json', 'w') as json_file:
        json.dump(dict_nodes, json_file, indent=4)

    with open(f'NA/edges_{wave}.json', 'w') as json_file:
        json.dump(dict_edges, json_file, indent=4)

    # Print edge information summary
    print(f"Edge information for wave {wave}:")
    for key, value in edge_info.items():
        print(f"{key}: {value}")

    return dict_nodes, dict_edges, list_dropped


In [72]:
dict_nodes_w2, dict_edges_w2, list_dropped_w2 = data_extraction(df_w2, "participant.label", "W2")
dict_nodes_w1, dict_edges_w1, list_dropped_w1 = data_extraction(df_w1, 'network_app.1.player.participantcode', "W1")

Edge information for wave W2:
friend: 436
value: 282
politics: 232
study: 226
council: 368
leftright: 719
sentiment: 719
Edge information for wave W1:
friend: 375
value: 277
politics: 242
study: 282
council: 367
leftright: 677
sentiment: 0


In [74]:
dict_nodes_w2

{'ddh': {'leftrightself': 3,
  'gender': 2.0,
  'income': 2100.0,
  'rent': 750.0,
  'grade': 2.0,
  'partyvote': 3.0,
  'age': 19.0,
  'ocu_father': 1.0,
  'ocu_mother': 1.0,
  'edu_father': 2.0,
  'edz_mother': 2.0},
 'evh': {'leftrightself': 2,
  'gender': 2.0,
  'income': 1300.0,
  'rent': 450.0,
  'grade': 4.0,
  'partyvote': 5.0,
  'age': 22.0,
  'ocu_father': -999.0,
  'ocu_mother': 1.0,
  'edu_father': -999.0,
  'edz_mother': 2.0},
 'bs3': {'leftrightself': 3,
  'gender': 1.0,
  'income': 400.0,
  'rent': 410.0,
  'grade': 3.0,
  'partyvote': 3.0,
  'age': 19.0,
  'ocu_father': 1.0,
  'ocu_mother': 1.0,
  'edu_father': 1.0,
  'edz_mother': 1.0},
 'k4w': {'leftrightself': 7,
  'gender': 1.0,
  'income': 850.0,
  'rent': 0.0,
  'grade': 4.0,
  'partyvote': 2.0,
  'age': 26.0,
  'ocu_father': 1.0,
  'ocu_mother': 1.0,
  'edu_father': 3.0,
  'edz_mother': 2.0},
 '39b': {'leftrightself': 4,
  'gender': 1.0,
  'income': 400.0,
  'rent': 0.0,
  'grade': -888.0,
  'partyvote': 3.0,
  '

In [55]:
sum=0
for key,value in dict_edges_w1.items():
    #print(key)
    sum += len(value)
print(sum,len(list_dropped_w1))

675 87


In [56]:
sum=0
for key,value in dict_edges_w2.items():
    #print(key)
    sum += len(value)
print(sum,len(list_dropped_w2))

718 55


w1: 675+87=762

w2: 718+55=773

