## MST 698S - Data Science Tools and Techniques

## NetworkX - Build Network from Twitter Data

Import relevant libraries:

In [0]:
import json
from copy import deepcopy

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

Define global parameters:

In [0]:
#Location of the JSON file holding the follower lists (relative to the notebook)
twitter_followers_path = 'twitter_followers.json'

Define relevant functions:

In [0]:
#Get set of user_ids
def get_unique_ids(followers_dict_list):
    """Collect every distinct screen name mentioned in the follower data.

    Parameters
    ----------
    followers_dict_list : list of dict
        Each dict maps a screen name to a list of screen names. The data
        loaded by this notebook uses exactly one key per dict; empty dicts
        are skipped and any additional keys are processed as well.

    Returns
    -------
    list
        Every screen name that occurs either as a key or inside one of the
        lists, without duplicates. The order is that of a Python set and is
        therefore arbitrary.
    """
    unique_screen_names = set()
    for follower in followers_dict_list:
        for screen_name, followers_list in follower.items():
            unique_screen_names.add(screen_name)
            # Adding to a set in place avoids rebuilding an ever-growing list
            # for every entry.
            unique_screen_names.update(followers_list)
    return list(unique_screen_names)

#Get unique edges
def clean_edge_list(edge_list):
    """Reduce a list of directed pairs to unique undirected edges.

    Self-loops (a, a) are dropped, and a pair is dropped when the same pair
    or its reverse has already been kept. The first orientation encountered
    is the one that survives, and the input order is preserved.

    Parameters
    ----------
    edge_list : iterable of 2-item sequences
        The (a, b) pairs to clean. Both items must be hashable.

    Returns
    -------
    list of tuple
        A new list with the surviving (a, b) tuples.
    """
    kept_pairs = set()   # constant-time lookup instead of scanning a list
    unique_edges = []
    for (a, b) in edge_list:
        if a == b:
            continue
        if (a, b) in kept_pairs or (b, a) in kept_pairs:
            continue
        kept_pairs.add((a, b))
        unique_edges.append((a, b))
    return unique_edges

#Get edge list
def get_edges(followers_dict_list):
    """Build the undirected edge list implied by the follower data.

    Every (key, list item) combination becomes a candidate pair; the pairs
    are de-duplicated and then passed through clean_edge_list, which removes
    self-loops and reversed duplicates.

    Parameters
    ----------
    followers_dict_list : list of dict
        Each dict maps a screen name to a list of screen names. Empty dicts
        contribute no edges.

    Returns
    -------
    list of tuple
        The cleaned (screen_name, friend) pairs. Because the candidates go
        through a set, the order of the result (and which orientation of a
        mutual pair is kept) is arbitrary.
    """
    candidate_pairs = set()
    for relationship in followers_dict_list:
        # The screen name is read once per entry rather than once per friend.
        for screen_name, friends in relationship.items():
            candidate_pairs.update((screen_name, friend) for friend in friends)
    return clean_edge_list(candidate_pairs)


#Build association matrix
def build_association_matrix(followers_dict_list):
    """Build a 0/1 matrix describing which names appear in which lists.

    Rows and columns both follow the order returned by
    get_unique_ids(followers_dict_list). Entry [i, j] is 1 when the j-th
    name appears in the list stored under the i-th name, and 0 otherwise.
    Names that never occur as a key get an all-zero row.

    Parameters
    ----------
    followers_dict_list : list of dict
        Each dict maps a screen name to a list of screen names. If a screen
        name occurs as a key more than once, the last occurrence wins.

    Returns
    -------
    numpy.matrix
        Square matrix with one row and one column per unique name.
    """
    unique_names = get_unique_ids(followers_dict_list)

    # Index every list once, as a set, so each row lookup is constant time
    # instead of rescanning the whole input for every name.
    followers_by_name = {}
    for follower_entry in followers_dict_list:
        for screen_name, followers in follower_entry.items():
            followers_by_name[screen_name] = set(followers)

    no_followers = frozenset()
    output_matrix = []
    for row_name in unique_names:
        # Fall back to an empty set so a name without its own entry does not
        # inherit the followers of the previously processed name.
        row_followers = followers_by_name.get(row_name, no_followers)
        output_matrix.append(
            [1 if column_name in row_followers else 0 for column_name in unique_names]
        )

    # np.matrix is kept so the return type stays what callers already receive.
    association_matrix = np.matrix(output_matrix)
    return association_matrix

#build graph
def build_ego_graph(unique_screen_names, edge_list_out):
    """Create an undirected NetworkX graph from node names and edge pairs.

    Parameters
    ----------
    unique_screen_names : iterable
        Names to add as nodes.
    edge_list_out : iterable of 2-tuples
        Pairs of names to connect.

    Returns
    -------
    networkx.Graph
        Graph containing all given nodes and edges.
    """
    graph = nx.Graph()
    # Nodes go in first so names without any edge are still part of the graph.
    graph.add_nodes_from(unique_screen_names)
    graph.add_edges_from(edge_list_out)
    return graph

Let's load our Twitter followers list and print the first few items:

In [4]:
#Read the follower data; the encoding is given explicitly so the result does
#not depend on the platform's default locale encoding
with open(twitter_followers_path, 'r', encoding='utf-8') as twitter_file:
    followers_list = json.load(twitter_file)

#Show only the first few entries to keep the output short
print(followers_list[:10])

[{'Komarova220899': ['Saelky1', 'namedmeklass', 'esia96', 'Arsenteva_katy']}, {'kmUloUTTY30jlMP': ['Komarova220899', 'vitalikcalm', '_brkv_', 'bokkarev1', 'Malyshe2002', '__ELIZAVETA__', 'moskalevaal01', 'hellofriends92', 'marina_alek', 'l0224m', 'shidagis04033', 'Alina_Kata_', 'Polina_Ivchenko', 'imlerachka', 'ninka_ushastyy', 'dashadasha25', 'Koshka23082012', '_Tomilina_']}, {'svLja3KwvMDcrGi': []}, {'sshhfq': ['87FvuoW2GbufOPU', 'k_chernenko', 'moskalevaal01', 'sofika_999', 'alexandr_25_17', '2QfpBoA43t1tzcL', '_Karimova_15', 'v_malenda', 'mokretsova_38', 'davidovaeliz02', 'vlasevskaya2016', 'TSerega_138RUS', '666Sofya', 'anastasssss_', 'shidagis04033', 'bezenkova_777']}, {'v71xk': []}, {'v_malenda': ['_wicked19_', 'sshhfq7', 'ti7s0d7v1tOh3K4', '9XbxzD6XfV73duz', 'alexandr_25_17', 'crybabycry050', 'rusaliiinaaa', '_vorotilova_', 'sshhfq', 'Petstore_ru']}, {'Km9873185647': ['_vorotilova_', 'mokretsova_38']}, {'neoeroakauraaka': ['dkkcp', 'yokopvawjbwm', 'sefurematuri', 'iKPE8GV55', '

Now let's clean up the followers list, extract the useful information, and build our graph object (note that building the graph object may take a while...):

In [0]:
#Every distinct screen name becomes a node
unique_screen_names = get_unique_ids(followers_list)

#Every cleaned (screen name, friend) pair becomes an edge
edge_list_out = get_edges(followers_list)

#0/1 matrix over all unique screen names
association_matrix = build_association_matrix(followers_list)

#Undirected graph assembled from the nodes and edges above
G = build_ego_graph(unique_screen_names, edge_list_out)

Let's examine the association matrix (first 10 rows and columns):

In [0]:
#Top-left 10 x 10 corner of the association matrix
print(association_matrix[:10, :10])

Now, let's prune the graph to remove screen names with only one connection to aid in visualization:

In [0]:
#Calculate degree centralities (kept available for inspection)
degree_centralities = nx.degree_centrality(G)

#Get nodes with at most one connection. The degree is read directly from the
#graph: the smallest degree centrality only means "one connection" when the
#graph has no isolated nodes, so it is not a reliable threshold.
one_degree_nodes = [node for node, degree in dict(G.degree()).items() if degree <= 1]

#Create an independent copy of the graph so G itself is left untouched
H = deepcopy(G)

#Remove the sparsely connected nodes from the copy
print('Total nodes before pruning: {0}'.format(str(len(G.nodes()))))

H.remove_nodes_from(one_degree_nodes)

print('Total nodes after pruning: {0}'.format(str(len(H.nodes()))))

Now let's draw the pruned graph:

In [0]:
#Draw on an explicit axes so the figure can carry a title
fig, ax = plt.subplots()
nx.draw(H, ax=ax)
ax.set_title('Pruned follower network')
plt.show()

Now let's use the eigenvector centralities to find the most influential nodes:

In [0]:
#Calculate centralities
eigenvector_centralities = nx.eigenvector_centrality(H)

#Print node with maximum eigenvector centrality. Using (value, key) pairs makes
#max() rank by centrality first, with the node name breaking any ties.
inverse = [(value, key) for key, value in eigenvector_centralities.items()]
print(max(inverse)[1])