In [421]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle

Data Overview
- [Number] corresponds to file names '0','107','348','414','686','698','1684','1912','3437','3980'
1. facebook_combined.txt -> Gives all of the connected node pairs
2. [Number].featnames -> Gives the name and anonymized value of each feature that appears among that file's set of nodes
3. [Number].feat -> Gives one row per node: the node number followed by a '1' or '0' for each row of the corresponding [Number].featnames file, indicating whether the node has that feature

In [2]:
# Builds the list of [Number] prefixes used by the .featnames / .feat file names

files = '0 107 348 414 686 698 1684 1912 3437 3980'.split()

In [3]:
# 1. Prints the connected node pairs
# Below shows that thre are 4,039 nodes in the data and 88,234 edges

fb = pd.read_csv('Datasets/facebook/facebook_combined.txt', delimiter = ' ', names = ['Node 1', 'Node 2'])
%store fb
fb

Stored 'fb' (DataFrame)


Unnamed: 0,Node 1,Node 2
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
88229,4026,4030
88230,4027,4031
88231,4027,4032
88232,4027,4038


In [4]:
# 2. Loads and displays 0.featnames -> the feature names of file 0
# Each line has the form: <feature #> <featuretype>;<'anonymized feature'> <feature value #>

featnames_columns = ['Feature Names']
feat_names_0 = pd.read_csv(
    'Datasets/facebook/0.featnames',
    names = featnames_columns,
)
feat_names_0

Unnamed: 0,Feature Names
0,0 birthday;anonymized feature 0
1,1 birthday;anonymized feature 1
2,2 birthday;anonymized feature 2
3,3 birthday;anonymized feature 3
4,4 birthday;anonymized feature 4
...,...
219,219 work;start_date;anonymized feature 170
220,220 work;start_date;anonymized feature 171
221,221 work;start_date;anonymized feature 203
222,222 work;start_date;anonymized feature 204


In [5]:
# 3. Loads and displays 0.feat -> one row per node of file 0.
# Each row is a single space-separated string: the node number followed by a '1' or '0'
# for each row of 0.featnames, indicating whether the node has that feature.

# The frame must be bound to the same name that is displayed on the last line,
# otherwise this cell raises NameError when run on a fresh kernel.
node_feat_0 = pd.read_csv('Datasets/facebook/0.feat', names = ['Node Features'])
node_feat_0

Unnamed: 0,Node Features
0,1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,3 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
3,4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
...,...
342,343 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
343,344 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
344,345 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
345,346 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 ...


In [6]:
# Collects the distinct feature types of file 0 and prints them with their count.
# The feature type is the second space-separated token of each line with its
# trailing ';anonymized' part removed.

unique_feat_types = {
    entry.split(' ')[1].rsplit(';', 1)[0]
    for entry in feat_names_0['Feature Names']
}

summary_template = '{0} Unique Features:\n{1}'
print(summary_template.format(len(unique_feat_types), unique_feat_types))

21 Unique Features:
{'location;id', 'work;end_date', 'hometown;id', 'work;position;id', 'first_name', 'education;year;id', 'education;with;id', 'education;type', 'locale', 'work;location;id', 'languages;id', 'gender', 'education;school;id', 'work;employer;id', 'work;start_date', 'last_name', 'education;degree;id', 'birthday', 'work;with;id', 'education;concentration;id', 'education;classes;id'}


In [417]:
# Matches the specific node attributes with the specific node

# Accumulates results across calls: node number -> {attribute name: [attribute values]}
node_attr = {}

def match_node_attributes(featnames_path, nodefeat_path):
    """Record the attributes of every node listed in one feature file.

    Results are written into the module-level ``node_attr`` dictionary, keyed by
    node number. A node that already has an entry (for example from an earlier
    call with a different file) has that entry replaced.

    Parameters
    ----------
    featnames_path : str
        Path to a ``.featnames`` file. Each line is expected to look like
        ``<feature #> <feature type>;anonymized feature <value #>``.
    nodefeat_path : str
        Path to the matching ``.feat`` file. Each line is a node number followed
        by whitespace-separated 1/0 flags, one flag per line of the
        ``.featnames`` file.

    Returns
    -------
    dict
        The module-level ``node_attr`` dictionary, after it has been updated.
    """
    # Position i of this list describes flag i of every row in the .feat file:
    # (attribute name, attribute value).
    # dtype=str keeps every line as text so it can always be split.
    feat_names = pd.read_csv(featnames_path, names = ['Feature Names'], dtype = str)
    attr_list = []
    for line in feat_names['Feature Names']:
        tokens = line.split()
        # tokens[1] is '<feature type>;anonymized'; drop the trailing ';anonymized'
        attr_list.append((tokens[1].rsplit(';', 1)[0], int(tokens[3])))

    # Without dtype=str a row holding only a node number would be parsed as an
    # integer, which has no .split().
    node_feats = pd.read_csv(nodefeat_path, names = ['Node Features'], dtype = str)
    for line in node_feats['Node Features']:
        tokens = line.split()
        node_val = int(tokens[0])

        # Collects every value of an attribute in a list, because a node can have
        # several features that share the same attribute name
        attr_dict = {}
        for val_index, value in enumerate(tokens[1:]):
            if int(value) == 1:
                attr_name, attr_value = attr_list[val_index]
                attr_dict.setdefault(attr_name, []).append(attr_value)

        # Updates node_attr dictionary with key->node number and value->dict of attributes and their values
        node_attr[node_val] = attr_dict

    return node_attr

In [418]:
# Runs the node/attribute matching once per [Number] prefix, using that prefix's
# .featnames and .feat files, then prints the combined node -> attributes mapping

for file_number in files:
    path_prefix = 'Datasets/facebook/' + file_number
    match_node_attributes(path_prefix + '.featnames', path_prefix + '.feat')
print(node_attr)

{1: {'gender': [77], 'locale': [127]}, 2: {'education;school;id': [35], 'education;type': [53, 55], 'education;year;id': [57], 'gender': [78], 'languages;id': [92, 98], 'last_name': [114], 'locale': [126], 'location;id': [135]}, 3: {'birthday': [7], 'education;concentration;id': [14], 'education;school;id': [34, 50], 'education;type': [53, 55], 'education;year;id': [59, 65], 'gender': [78], 'languages;id': [92], 'locale': [127], 'location;id': [137], 'work;end_date': [168, 170], 'work;location;id': [137], 'work;start_date': [164, 202]}, 4: {'education;school;id': [50], 'education;type': [53, 55], 'education;with;id': [56], 'gender': [78], 'locale': [127]}, 5: {'education;school;id': [49, 50], 'education;type': [53, 54], 'education;year;id': [65], 'gender': [78], 'locale': [127]}, 6: {'birthday': [1], 'education;type': [53, 55], 'education;year;id': [62], 'gender': [78], 'last_name': [111], 'locale': [127], 'work;end_date': [157], 'work;start_date': [157]}, 7: {'education;concentration;

In [419]:
# Creates a graph using the data file of connected nodes and files with labeled node attributes

G = nx.from_pandas_edgelist(fb, 'Node 1', 'Node 2', create_using = nx.Graph())
nx.set_node_attributes(G, node_attr)
%store G 
%store node_attr

Stored 'G' (Graph)
Stored 'node_attr' (dict)


In [422]:
# Save the notebook data
# The three objects are written back-to-back to one file, so they must be read
# back with three successive pickle.load calls in the same order: fb, G, node_attr.

# The with-block closes the file when the dumps finish, which flushes the
# buffered bytes to disk.
with open('./Pickle Files/Data_Wrangling_Data.pickle', 'wb') as file:
    pickle.dump(fb, file)
    pickle.dump(G, file)
    pickle.dump(node_attr, file)