# Instagram Network Analysis

In [1]:
import networkx as nx
import pandas as pd

### Number of nodes and edges 

In [2]:
G = nx.read_adjlist('adjList.txt', nodetype=str,create_using=nx.DiGraph())
print(G.number_of_nodes(),G.number_of_edges())

20001 24978


In [3]:
print(nx.average_clustering(G))

0.009702593121140572


In [4]:
print(len(nx.dominating_set(G)))

15865


In [5]:
follower_count = [(i,k) for i,k in (G.in_degree)]
follower_sorted = sorted(follower_count, key=lambda x: x[1], reverse=True)

### Followers count

In [6]:
#print(follower_sorted[:25])
df = pd.DataFrame(follower_sorted[:10], columns=['Account','Follower count'])
df

Unnamed: 0,Account,Follower count
0,whitehouse,61
1,jeanne_andreaa,51
2,nytimestravel,51
3,sonymusicsoundtracks,50
4,susanabbott_art,50
5,fretin98,37
6,the_sstan,27
7,electronicartsintermix,26
8,donsonn,25
9,chenggao1999,25


In [7]:
following_count = [(i,k) for i,k in (G.out_degree)]
following_sorted = sorted(following_count, key=lambda x: x[1], reverse=True)

### Following count 

In [8]:
#print(following_sorted[:25])
df = pd.DataFrame(following_sorted[:10], columns=['Account','Following count'])
df

Unnamed: 0,Account,Following count
0,fretin98,1864
1,as_cool_as_cucumber_amber,202
2,khriswu8,201
3,aaannie_wang,200
4,dawn_rz_,200
5,ivyyyl_,200
6,schuwii_z,200
7,yitingzhang,200
8,donsonn,200
9,chenggao1999,200


In [9]:
avg_neighbor_degrees = nx.average_neighbor_degree(G)
avg_neighb_deg_list = [(name,val) for name,val in avg_neighbor_degrees.items()]
avg_neigh_d_sorted = sorted(avg_neighb_deg_list, key=lambda x: x[1], reverse=True)

In [10]:
print(avg_neigh_d_sorted[:20])

[('user1', 932.0), ('totoschnell2', 310.6666666666667), ('taizhe_xu', 203.8), ('dyuanb', 201.9), ('_alexbriggs', 200.0), ('_davelun', 200.0), ('_lucyhe_', 200.0), ('alex.hu.9693', 200.0), ('alishapeng24601', 200.0), ('arnaud.biebuyck', 200.0), ('brendan_thomas', 200.0), ('deerdeerwq', 200.0), ('dsaha02', 200.0), ('g4oza_no_easy', 200.0), ('guaaa_z', 200.0), ('jiahao9329', 200.0), ('viviii0799', 200.0), ('chuanhweeeee', 200.0), ('coconini_z', 200.0), ('copythattaylor', 200.0)]


In [11]:
accounts_with_zero_following=[]
for name,val in (G.out_degree):
    if val==0:
        accounts_with_zero_following.append((name,G.in_degree(name)))
        
accounts_with_0_following_sorted = sorted(accounts_with_zero_following, key=lambda x:x[1], reverse=True)

### Accounts with 0 following and their follower count within the network

In [12]:
#print(accounts_with_0_following_sorted)
df = pd.DataFrame(accounts_with_0_following_sorted, columns=['Account','Follower count'])
df

Unnamed: 0,Account,Follower count
0,aleha_84,11
1,wecouldgrowup2gether,11
2,eldenring,11
3,lib__rah,11
4,elonofficiall,11
...,...,...
13848,davidlu8889,1
13849,elaine_jiang99,1
13850,heytakki_,1
13851,marcellohdz,1


In [13]:
page_ranks = nx.pagerank(G)
page_ranks_sorted = sorted(page_ranks.keys(), key= lambda x: page_ranks[x], reverse=True)

### Page rank 

In [14]:
#print(page_ranks_sorted[:20])
df = pd.DataFrame(page_ranks_sorted[:20], columns=['Account'])
df

Unnamed: 0,Account
0,whitehouse
1,jeanne_andreaa
2,nytimestravel
3,sonymusicsoundtracks
4,susanabbott_art
5,takumiotanii
6,donsonn
7,electronicartsintermix
8,parissfch
9,amyzzeng


In [15]:
hits = nx.hits(G)
hits_sorted = sorted(hits[0], key=lambda x:hits[0][x], reverse=True)

### Hits 

In [16]:
#print(hits_sorted[:20])
df = pd.DataFrame(hits_sorted[:20], columns=['Account'])
df

Unnamed: 0,Account
0,fretin98
1,hhhzzz19
2,skyfish_yu
3,irving_ma
4,stephanie_dayhot
5,eon.zjy
6,cristal_xww
7,franck_wang
8,yz_jiang9
9,itsdottied_w


In [17]:
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)

# These two take exponentially large time which could take hours to compute

## Graph with specific subset of nodes and edges between them

In [18]:
import essentialRoutines
adjFile = open("adjList.txt","r")
adjList = adjFile.readlines()
adjFile.close()
allNodes = essentialRoutines.adjList_to_dict(adjList)

In [19]:
my_neighbors = allNodes['fretin98']

In [20]:
G_small = nx.DiGraph()
for neighbor in my_neighbors:
    G_small.add_edge('fretin98',neighbor)
    
for node in allNodes.keys():
    if 'arjun_siva__' in allNodes[node]:
        my_neighbors.append(node)
        G_small.add_edge(node, 'fretin98')

In [21]:
for n1 in my_neighbors:
    for n2 in my_neighbors:
        try:
            if n1 in allNodes[n2]:
                G_small.add_edge(n2, n1)
        except:
            continue
            
print(G_small.number_of_nodes(), G_small.number_of_edges())

2 1


In [22]:
b_c = nx.betweenness_centrality(G_small)
b_c_sorted = sorted(b_c.items(), key=lambda x:b_c[x[0]], reverse=True)

### Betweenness centrality 

In [23]:
#print(b_c_sorted[:20])
df = pd.DataFrame(b_c_sorted[:20], columns=['Account','Value'])
df

Unnamed: 0,Account,Value
0,fretin98,0.0
1,zzzzzzshirley,0.0


In [24]:
c_c = nx.closeness_centrality(G_small)
c_c_sorted = sorted(c_c.items(), key=lambda x:c_c[x[0]], reverse=True)

### Closeness centrality 

In [25]:
#print(c_c_sorted[:20])
df = pd.DataFrame(c_c_sorted[:20], columns=['Account','Value'])
df

Unnamed: 0,Account,Value
0,zzzzzzshirley,1.0
1,fretin98,0.0


In [26]:
G_small_undir = G_small.to_undirected()

In [27]:
preds = nx.preferential_attachment(G_small_undir)
preds_sorted = sorted(preds, key=lambda x:x[2], reverse=True)

### Preferential attachment 

In [28]:
#print(preds_sorted[:20])
df = pd.DataFrame(preds_sorted[:20], columns=['Account 1','Account 2','Value'])
df

Unnamed: 0,Account 1,Account 2,Value


In [29]:
def find_unconnected_nodes(selected_node, node_list, G):
    
    # obtain all the nodes connected to the selected node
    connected_nodes = [n for _, n in G.edges(selected_node)]

    # a feasible node is one not in connected_nodes
    feasible_nodes = [feasible_n for feasible_n in node_list if feasible_n not in connected_nodes + [selected_node]]

    return feasible_nodes

In [30]:
unconnected_nodes = []
not_connected_list = find_unconnected_nodes('fretin98', G_small_undir.nodes(), G_small_undir)
for n in not_connected_list:
    unconnected_nodes.append(('fretin98',n))

In [31]:
j_coef = nx.jaccard_coefficient(G_small_undir, ebunch=unconnected_nodes)
j_coef_sorted = sorted(j_coef, key=lambda x:x[2], reverse=True)

### Jaccard Coefficient between a specific node and all nodes not connected to it 

In [32]:
#print(j_coef_sorted[:20])
df = pd.DataFrame(j_coef_sorted[:20], columns=['Account 1','Account 2','Value'])
df

Unnamed: 0,Account 1,Account 2,Value


### Exporting to gexf format for Gephi

In [33]:
nx.write_gexf(G, "Full_graph.gexf")

In [34]:
nx.write_gexf(G_small, "Subset.gexf")