In [6]:
import pandas as pd

df = pd.read_csv('./data/merged-dataset.csv')

In [5]:
def get_network(df, from_hashtag="", from_country=""):
    ''' returns a dataframe with the edges and timestamp from tweets feed (df)'''
    
    if from_hashtag != "":
        mask = df["hashtags"].apply(lambda x: from_hashtag.lower() in x)
        filteredDf = df[mask]
    else:
        filteredDf = df

    if from_country != "":
        mask = filteredDf["country_code"] == from_country
        filteredDf = filteredDf[mask]

    edges_df = filteredDf[['screen_name', 'to', 'created_at', 'text', 'followers_count', 'friends_count', 'favourites_count', 'retweet_count', 'toxicity']]
    edges_df = edges_df.rename(columns={'screen_name': 'from'})
    edges_df = edges_df.explode('to')
    edges_df = edges_df.explode('to').reset_index(drop=True)
    
    edges_df["from"] = edges_df["from"].apply(lambda x: str("@" + x) if len(x) != 0 and x[0] != "@" else x)
    edges_df["to"] = edges_df["to"].apply(lambda x: str("@" + x) if len(x) != 0 and x[0] != "@" else x)
    
    G = nx.from_pandas_edgelist(edges_df, 'from', 'to', ["followers_count", "friends_count", "favourites_count", "retweet_count", "created_at", "text", "toxicity"], create_using=nx.DiGraph())

    return G

# takes our main dataset and combines `reply_to_screen_name` and `mentions` from tweets
# into a single `to` column
# also drops tweets that don't have any mentiosn ore replying to someone
# this should ALWAYS be used to get our primary data for the dataset
def transform_df(df):
    edges_df = df[['screen_name', 'reply_to_screen_name', 'created_at', 'hashtags', 'mentions', 'followers_count', 'friends_count', 'text', 'is_quote', 'is_retweet', 'favourites_count', 'retweet_count', 'country_code', 'verified', 'lang']]

    edges_df["hashtags"] = edges_df["hashtags"].apply(lambda x: ",".join(x))
    edges_df["mentions"] = edges_df["mentions"].apply(lambda x: ",".join(x))
    
    edges_df["reply_to_screen_name"] = edges_df["reply_to_screen_name"].fillna("")
    edges_df["reply_to_screen_name"] = edges_df["reply_to_screen_name"].apply(lambda x: "@" + x if x != "" else "")

    edges_df["to"] = edges_df["reply_to_screen_name"] + "," + edges_df["mentions"]
    edges_df["to"] = edges_df["to"].fillna("")

    edges_df["to"] = edges_df["to"].apply(lambda x: ",".join(list(set(x.split(",")))))
    edges_df["to"] = edges_df["to"].apply(lambda x: x[1:] if len(x)>0 and x[0]=="," else x)

    edges_df = edges_df.drop(["reply_to_screen_name", "mentions"], axis=1)
    edges_df["country_code"] = edges_df["country_code"].fillna("")

    noReplyFilter = edges_df["to"] != ""
    edges_df = edges_df[noReplyFilter]

    noRTFilter = edges_df["is_retweet"] != True
    edges_df = edges_df[noRTFilter]

    onlyEnglishFilter = edges_df["lang"] == "en"
    edges_df = edges_df[onlyEnglishFilter]

    rTfilters = edges_df["retweet_count"] >= 50
    edges_df = edges_df[rTfilters]

    likefilters = edges_df["favourites_count"] >= 50
    edges_df = edges_df[likefilters]

    edges_df = edges_df.drop_duplicates().reset_index(drop=True)

    return edges_df

def print_basic_stats(network):
    print("Number of nodes: " + str(network.number_of_nodes()))
    print("Number of edges: " + str(network.number_of_edges()))
    print("Average clustering coefficient: " + str(average_clustering(network)))

    G_deg = nx.degree_histogram(G)
    G_deg_sum = [a * b for a, b in zip(G_deg, range(0, len(G_deg)))]
    print('Average degree: {}'.format(sum(G_deg_sum) / G.number_of_nodes()))

    dirNet = network.to_undirected()
    print("Network diameter: " + str(diameter(dirNet)))
    # print("Eigenvector centrality: " + str(eigenvector_centrality(network)))
    # print("Closeness centrality: " + str(closeness_centrality(network)))

def create_gephi_from_network(network, name):
    nx.write_gexf(network, "./gephis/" + name + "-network.gexf")

def get_strongly_gcc(G):
    """ get the giant strongly connected component of G""" 
    SGcc = max(nx.strongly_connected_components(G), key=len)
    SGcc = G.subgraph(SGcc)
    return SGcc

def get_weakly_gcc(G):
    """ get the giant weakly connected component of G"""  
    WGcc = max(nx.weakly_connected_components(G), key=len)
    WGcc = G.subgraph(WGcc)
    return WGcc
   

def plot_network(G, G_degree=None, outputname="default", color="turquoise", n_color="blue"):
    ''' plot the graph with varying nodesize '''
    
    fig = plt.figure(num=None, figsize=(15, 15), dpi=60, facecolor='b', edgecolor='k')
    pos = nx.spring_layout(G)
    
    if G_degree:
        node_size=[v * 10 for v in dict(G_degree).values()]
    else:
        node_size = 1
        
    nx.draw(G, pos, nodelist=dict(G_degree).keys(), node_size=node_size, width=0.5, alpha=0.5, edge_color=color, node_color=n_color)
    plt.axis('off')
    plt.show()
    fig.savefig("./viz/" +outputname + "-network.svg", transparent=True)

In [7]:
import networkx as nx
import matplotlib.pyplot as plt

G = get_network(df)

degree_sequence = sorted([d for n, d in G.degree()], reverse=True)
dmax = max(degree_sequence)

plt.loglog(degree_sequence, "b-", marker="o")
plt.title("Degree rank plot")
plt.ylabel("degree")
plt.xlabel("rank")

# # draw graph in inset
# plt.axes([0.45, 0.45, 0.45, 0.45])
# Gcc = G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0])
# pos = nx.spring_layout(Gcc)
# plt.axis("off")
# nx.draw_networkx_nodes(Gcc, pos, node_size=20)
# nx.draw_networkx_edges(Gcc, pos, alpha=0.4)
# plt.show()

KeyError: "['toxicity', 'to'] not in index"