In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime as dt
import time
from tqdm import tqdm
from collections import Counter 

## Loading The Data


In [2]:
def prepare_datasets():
    
    data_a2q = pd.read_csv('sx-stackoverflow-a2q.txt', delimiter = ' ', names=['Source','Target','Time'])
    data_c2a = pd.read_csv('sx-stackoverflow-c2a.txt', delimiter = ' ', names=['Source','Target','Time'])
    data_c2q = pd.read_csv('sx-stackoverflow-c2q.txt', delimiter = ' ', names=['Source','Target','Time'])
    
    data_a2q['Time']=pd.to_datetime(data_a2q['Time'], unit='s').dt.date
    data_c2a['Time']=pd.to_datetime(data_c2a['Time'], unit='s').dt.date
    data_c2q['Time']=pd.to_datetime(data_c2q['Time'], unit='s').dt.date
     
    data_a2q = data_a2q[data_a2q['Source'] != data_a2q['Target']]
    data_c2a = data_c2a[data_c2a['Source'] != data_c2a['Target']]
    data_c2q = data_c2q[data_c2q['Source'] != data_c2q['Target']]
    
    data_a2q['Year'] = pd.DatetimeIndex(data_a2q['Time']).year
    data_c2a['Year'] = pd.DatetimeIndex(data_c2a['Time']).year
    data_c2q['Year'] = pd.DatetimeIndex(data_c2q['Time']).year
    
    ipt1 = (data_a2q['Year'] >= 2013) & (data_a2q['Year'] <= 2016)
    data_a2q = data_a2q.loc[ipt1]
    ipt2 = (data_c2a['Year'] >= 2013) & (data_c2a['Year'] <= 2016)
    data_c2a = data_c2a.loc[ipt2]
    ipt3 = (data_c2q['Year'] >= 2013) & (data_c2q['Year'] <= 2016)
    data_c2q = data_c2q.loc[ipt3]
    
    return data_a2q, data_c2a, data_c2q    

In [3]:
data_a2q, data_c2a, data_c2q = prepare_datasets()

In [4]:
data_a2q.head()

Unnamed: 0,Source,Target,Time,Year
7563262,941240,584508,2013-01-01,2013
7563263,16007,1176091,2013-01-01,2013
7563265,1867379,1896848,2013-01-01,2013
7563266,1935971,1109988,2013-01-01,2013
7563267,1245254,1939771,2013-01-01,2013


In [5]:
time_interval = [dt.strptime('1-1-1970', "%d-%m-%Y").date(),dt.strptime('31-12-2022', "%d-%m-%Y").date()]

In [6]:
def create_graph(dataset, weight, time_interval):
    
    source = dataset['Source'].values
    target = dataset['Target'].values
    time = dataset['Time'].values
    
    start = time_interval[0]
    end = time_interval[1]
    G = nx.DiGraph()    
    
    for i in tqdm(range(len(source))):
        if start <= time[i] <= end:
            if source[i] != target[i]: #check if edge is not self referencing
                if (source[i], target[i]) in G.edges():
                    G[source[i]][ target[i]]['weight'] += weight
                else:
                    G.add_edge(source[i],target[i], weight = weight)   
    return G  

In [7]:
def merge_graph(time_interval):   
    
    G_a2q = create_graph(data_a2q, weight = 2, time_interval = time_interval)
    G_c2a = create_graph(data_c2a, weight = 1, time_interval = time_interval)
    G_c2q = create_graph(data_c2q, weight = 1, time_interval = time_interval)    
    
    #create a copy of first graph
    G = G_a2q.copy()
    
    #add edges of second graph
    for edge in G_c2a.edges:
        if (edge[0], edge[1]) in G.edges():
            G[edge[0]][edge[1]]['weight'] +=1
        else:
            G.add_edge(edge[0],edge[1], weight = 1)
            
    #add edges of third graph
    for edge in G_c2a.edges:
        if (edge[0], edge[1]) in G.edges():
            G[edge[0]][edge[1]]['weight'] +=1
        else:
            G.add_edge(edge[0],edge[1], weight = 1)
        
    return G
    

In [None]:
G_a2q = create_graph(data_a2q, weight = 2, time_interval = time_interval)
G_c2a = create_graph(data_c2a, weight = 1, time_interval = time_interval)
G_c2q = create_graph(data_c2q, weight = 1, time_interval = time_interval)

100%|██████████████████████████████████████████████████████████████████████| 9520434/9520434 [26:10<00:00, 6061.44it/s]
100%|████████████████████████████████████████████████████████████████████| 10494445/10494445 [31:33<00:00, 5543.39it/s]
 15%|██████████▍                                                          | 1504126/9885202 [00:18<01:32, 90881.24it/s]

In [None]:
nx.write_gexf(G_a2q,'a2q_graph.gexf')

## Functionality 1

In [None]:
# Whether the graph is directed or not
def graph_info(graph):
    
    # 1. Whether the graph is directed or not
    
    # I assume that the graph is undirected so I initialize my variable as False.
    d = False
    
    # Now I take all the edges.
    ed = [couple for couple in graph.edges()]
    
    # For each couple 
    for edge in ed:
        
        # If it exists a correspondece from A to B but not from B to A, the graph is directed.
        # So if there's not an reversed relatioship between the nodes, the graph must be directed 
        # and the loop can break.
        if (edge[1], edge[0] not in ed):
            d = True
            break
            
    # I print the output based on the previous results.       
    if d == True:
        print('The graph is directed')
    else:
        print('The graph is undirected')
        
    # 2. Number of users
    
    # I initialize an empty set in order to insert into it the nodes
    UserSet = ()
    
    # For each element in the graph, if that element is not in the set, add it.
    for i in tqdm(range(len(source))):
        if (source[i] not in UserSet):
            UserSet.append(source[i])
        if (target[i] not in UserSet):
            UserSet.append(target[i])
    
    # The set is made by all the unique values
    # so I can know the number of users finding how many elements are into UserSet
    print('The number of users is: ', len(UserSet))
    
    # 3. Average number of links per user
    
    # I initialize an empty string in order to insert into it the nodes (users)
    LinkUser = []
    
    # I consider the edges
    ed = [n for n in graph.edges()]
    edges_dictionaire = defaultdict(int)
    
    for i in ed:
        user = ed[0]
        edges_dictionaire[user] += 1
    
    for edge in edges_dictionaire.values():
        LinkUser.append(edge)
    
    print(f'The average number of links per user is {np.mean(LinkUser)}')
    
    # 4. Density degree of the graph
    
    if d == 'undirected':
        g_density = 2*graph.nodes.count()/(len(UserSet)*(len(UserSet)-1))
        print(g_density)
    if d == 'directed':
        g_density = graph.nodes.count()/(len(UserSet)*(len(UserSet)-1))
        print(g_density)
    
   # 5. Whether the graph is sparse or dense
      # Dense graph is a graph in which the number of edges is close to the maximal number of edges. 
      # Sparse graph is a graph in which the number of edges is close to the minimal number of edges.
    max_edges = (len(UserSet)*len(UserSet-1))/2
    if graph.edges.count() == max_edges:
        print('The graph is dense')
    if graph.edges.count() in range(0:100): # 100 ??
        print('The graph is sparse')

### Functionality 2 - Find the best users!

In [None]:
def Betweeness


def best_users(user, time_int, metric)

### Functionality 3