In [1]:
import pandas as pd
import numpy as np
import random as rand
import tqdm

# Preprocessing

## Group & Individual attributes

In [117]:
# Load group attributes.
# Forward slashes keep the path portable across operating systems and avoid the
# invalid '\m' escape sequence that a backslash would introduce in a normal string.
group_attributes = pd.read_csv('Nashville meetup data/meta-groups.csv')

# Only consider groups in the 10 most common categories
# (value_counts() sorts by descending frequency, so the first 10 labels are the most frequent).
considered = group_attributes['category_name'].value_counts().index[:10].tolist()
group_attributes = group_attributes[group_attributes['category_name'].isin(considered)]

In [118]:
# Load memberships (member -> group edges).
# Forward slashes keep the paths portable and avoid the invalid '\m' escape sequence.
member_to_group = pd.read_csv('Nashville meetup data/member-to-group-edges.csv')

# Load individual member attributes
attributes = pd.read_csv('Nashville meetup data/meta-members.csv')

In [119]:
# Attach the category of each membership's group. Memberships whose group is not
# among the considered groups get no category from the left join and are dropped.
group_categories = group_attributes[['group_id', 'category_name']]
member_group_category = (
    member_to_group
    .merge(group_categories, how='left', on='group_id')
    .dropna(subset=['category_name'])
)

# One-hot encode the category names next to the remaining membership columns
category_dummies = pd.get_dummies(member_group_category['category_name'])
member_group_category_oh = member_group_category.drop(columns='category_name').join(category_dummies)

# Collapse to one row per member: the max over a member's rows flags every category
# they belong to. The result is then left-joined onto the member attributes.
per_member_max = member_group_category_oh.groupby('member_id').max()
final_attributes = attributes.merge(per_member_max, how='left', left_on='member_id', right_on='member_id')

# Members without a membership in any considered group are NaN after the left merge; mark them False
final_attributes[considered] = final_attributes[considered].fillna(False)

In [120]:
# Sample 30 group ids; the edge list below only uses events hosted by these groups.
# A fixed random_state makes the sample (and everything derived from it) reproducible on re-run.
events = group_attributes['group_id'].sample(30, random_state=42).tolist()

## Edgelist

In [121]:
# Load the RSVP records; the event_id, group_id and member_id columns are used below
attendance = pd.read_csv(filepath_or_buffer='Nashville meetup data/rsvps.csv')

In [122]:
edges = []
names = set()

# Set gives constant-time membership checks for the sampled group ids
sampled_groups = set(events)

# For all different events
for name, group in tqdm.tqdm(attendance.groupby('event_id')):

    # Only consider events hosted by one of the sampled groups
    if group['group_id'].iloc[0] not in sampled_groups:
        continue

    # Convert the attendee column once per event instead of once per attendee
    members = group['member_id'].tolist()
    names.update(members)

    # Append every unordered pair of distinct attendees as a sorted [low, high] pair.
    # Starting at i + 1 skips the attendee itself; the inequality check still guards
    # against the same member id appearing twice within one event.
    for i, n in enumerate(members):
        for x in members[i + 1:]:
            if x != n:
                edges.append(sorted((n, x)))

100%|██████████| 19031/19031 [00:01<00:00, 16535.91it/s]


In [123]:
# Turn the list of [member, member] pairs into a two-column edge table
edges = pd.DataFrame(data=edges, columns=['member1', 'member2'])

# Graph

In [124]:
import networkx as nx

# Undirected co-attendance graph: one node per member, one edge per co-attending pair.
# nx.Graph keeps a single edge per node pair, so pairs repeated in the edge table collapse.
G = nx.Graph()

# Zipping the two columns avoids building a Series per row as iterrows() does;
# total= lets tqdm render a real progress bar instead of a bare counter.
G.add_edges_from(tqdm.tqdm(zip(edges['member1'], edges['member2']), total=len(edges)))

121760it [00:04, 29747.56it/s]


# Methodology

## Prototyping

In [125]:
def prototype():
    """Draw a random node of the global graph ``G`` together with its reachable neighbourhood.

    Returns
    -------
    tuple
        ``(node, subgraph)`` where ``node`` is the randomly chosen prototype and
        ``subgraph`` is the view of ``G`` induced by the prototype plus every node
        reachable from it.
    """
    seed_node = rand.choice(list(G))

    # descendants() excludes the source node itself, so add it back explicitly
    reachable = nx.descendants(G, seed_node)
    reachable.add(seed_node)

    return seed_node, G.subgraph(reachable)

In [126]:
# Smoke test: draw one random prototype and the subgraph of nodes reachable from it
prototype()

(185291120, <networkx.classes.graph.Graph at 0x2bb319e69d0>)

## Quality Measure

In [127]:
# Lookup table of member attributes keyed by member_id (bound as the default `lu` of Q below)
lu = final_attributes.set_index(keys='member_id')

In [130]:
def _count_target(members, target, lu):
    """Sum the ``target`` column of ``lu`` over those ``members`` that appear in ``lu``'s index."""
    ids = pd.Series(list(members))
    known = ids[ids.isin(lu.index)].tolist()
    return lu.loc[known, target].sum()


def Q(S, G, target, lu = lu):
    """Quality of subgroup ``S`` against population ``G`` for a boolean ``target`` attribute.

    Computes ``abs(sqrt(|S| / |G|) * (p_S - p_G))``, a weighted-relative-accuracy style
    score whose coverage term is square-rooted. ``p_S`` and ``p_G`` are the number of
    members with a truthy ``target`` divided by the full size of ``S`` and ``G``.

    Members that do not appear in ``lu`` contribute nothing to the target counts but
    still count towards the sizes. This was already the case for ``G``; ``S`` is now
    treated the same way instead of raising ``KeyError`` on an unknown id.

    Parameters
    ----------
    S : sequence
        Member ids forming the subgroup.
    G : sequence
        Member ids forming the reference population.
    target : label
        Column of ``lu`` holding the target indicator.
    lu : pandas.DataFrame
        Attribute table indexed by member id. The default is the module-level ``lu``
        as it existed when this function was defined.

    Returns
    -------
    float
        The absolute score; ``0.0`` when ``S`` is empty (zero coverage).

    Raises
    ------
    ZeroDivisionError
        If ``G`` is empty.
    """
    S_size = len(S)
    G_size = len(G)
    cover = S_size / G_size

    # An empty subgroup covers nothing; return 0 rather than dividing by zero below
    if S_size == 0:
        return 0.0

    n_target_S = _count_target(S, target, lu)
    n_target_G = _count_target(G, target, lu)

    WRAcc = (cover ** 0.5) * ((n_target_S / S_size) - (n_target_G / G_size))

    return abs(WRAcc)

    

## Testing

In [131]:
N_PROTOTYPES = 20            # number of random prototypes to evaluate
MIN_SIZE, MAX_SIZE = 5, 54   # smallest and largest subgroup size tried per prototype
TARGET = 'Tech'              # attribute column scored by Q

# The reference population is the whole graph; it does not change between iterations
RG = list(G.nodes)

for _ in range(N_PROTOTYPES):
    p, s = prototype()

    # Resistance distance from the prototype to every node of its subgraph, nearest first
    distances = nx.resistance_distance(s, p)
    distances = dict(sorted(distances.items(), key=lambda item: item[1]))
    ranked = list(distances)

    # Never request more members than the subgraph has, so the reported size is always
    # the true subgroup size (also when the subgraph has fewer than MIN_SIZE nodes).
    largest = min(MAX_SIZE, len(ranked))
    sizes = range(min(MIN_SIZE, largest), largest + 1)

    # Score the `size` nearest nodes for every candidate size
    WRAccs = [Q(ranked[:size], RG, TARGET) for size in sizes]
    best = WRAccs.index(max(WRAccs))
    print(f'Max WRAcc found for prototype {p} was {max(WRAccs)} with size {sizes[best]}')

Max WRAcc found for prototype 111592472 was 0.05643663649634879 with size 51
Max WRAcc found for prototype 195335106 was 0.07868212133867035 with size 54
Max WRAcc found for prototype 12125165 was 0.07868212133867035 with size 54
Max WRAcc found for prototype 238768583 was 0.05609086418646803 with size 54
Max WRAcc found for prototype 201497860 was 0.05889863471824174 with size 54
Max WRAcc found for prototype 11151756 was 0.05609086418646803 with size 54
Max WRAcc found for prototype 227037978 was 0.07868212133867035 with size 54
Max WRAcc found for prototype 186165686 was 0.07868212133867035 with size 54
Max WRAcc found for prototype 201154548 was 0.07868212133867035 with size 54
Max WRAcc found for prototype 223909273 was 0.061706405250015466 with size 54
Max WRAcc found for prototype 48232882 was 0.07868212133867035 with size 54
Max WRAcc found for prototype 11696972 was 0.05902772761625433 with size 54
Max WRAcc found for prototype 97825272 was 0.05609086418646803 with size 54
Max

In [97]:
# Display the member lookup table used by Q.
# NOTE(review): this cell's execution count (In [97]) is lower than that of the cells above it,
# so it was run out of order - re-run the notebook top to bottom before relying on this output.
lu

Unnamed: 0_level_0,name,hometown,city,state,lat,lon,group_id,weight,Career & Business,Community & Environment,Games,Health & Wellbeing,Music,New Age & Spirituality,Outdoors & Adventure,Socializing,Sports & Recreation,Tech
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2069,Wesley Duffee-Braun,Brentwood,Brentwood,TN,36.00,-86.79,19277993.0,4.0,False,False,False,False,False,False,False,False,False,True
8386,Tim,Nashville,Nashville,TN,36.07,-86.78,19654655.0,8.0,False,False,False,False,True,False,True,False,False,True
9205,Brenda,Brentwood,Brentwood,TN,36.00,-86.79,1585196.0,20.0,False,False,False,False,False,False,True,False,False,False
17903,Steve,,Nashville,TN,36.13,-86.80,7130232.0,1.0,False,False,False,False,False,False,False,False,False,True
20418,Andrea Reynolds,"Huntington, WV",Nashville,TN,36.17,-86.72,,,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239513469,Madison Ray,,Nashville,TN,36.09,-86.82,19997487.0,1.0,False,False,False,False,False,False,False,False,False,True
239515413,Jay Hatchett,,La Vergne,TN,36.00,-86.57,19822479.0,1.0,True,False,False,False,False,False,False,False,False,False
239519977,Lukasz Bielawski,,Nashville,TN,36.17,-86.78,6335372.0,1.0,False,False,False,False,False,False,False,False,True,False
239520184,Terri Taylor,,Nashville,TN,36.14,-86.74,1585196.0,1.0,False,False,False,False,False,False,True,False,False,False
