# Master's network summative

This code was used in my MSc to produce a paper about OSM Forum networks. It was based on data scraped from the OSM Forum in a previous project and then used as the basis for a more complete analysis as part of my MSc thesis.

For more information see [the full paper](https://docs.google.com/document/d/12grM5MfXpo88YOP7FeABqvumwlrL_wyY9rUpt5oZLD8/edit?usp=sharing) and the [short write up](https://www.jamiefawcett.org.uk/project/social-networks-osm).

In [None]:
import pandas as pd
from datetime import datetime
import ast
import numpy as np
import re
import string
import json
import requests

## Data import

Data was collected by scraping the online [OSM Forum](https://forum.openstreetmap.org/) and contains a record of each post made to the forum, the time it was made, the topic, the author and various other details. More details can be found on my [projects page](https://www.jamiefawcett.org.uk/projects/).

In [None]:
# Load the cleaned scrape; the first CSV column is used as the row index.
full_data = pd.read_csv("Clean_2019-03-10.csv", index_col=0)

# Drop the rogue 'index' column and 'additional' (its details now have their own columns).
full_data = full_data.drop(columns=['index', 'additional'])

In [None]:
# Preview the first rows of the imported data.
display(full_data.head())

## Extracting unique authors (and linking them to OSM UIDs)

Because display names on the Forum can change, we need to extract all the display names and link them to individual OSM accounts.

### Unique display name list

In [None]:
# Per display name: number of distinct post times, topics and sub-forums found in the data.
author = full_data.groupby(['author'])[['time', 'topic_title', 'forum_title']].nunique()

# Extra details to carry along; each is collected as the array of distinct values per author.
additional = ['role', 'from', 'registr', 'num_post']
making = [author] + [
    pd.Series(full_data.groupby(['author'])[detail].unique())
    for detail in additional
]

# Join the pieces column-wise and turn the author index back into a column.
author_df = pd.concat(making, axis=1, sort=False).reset_index()

# Collapse each array of distinct values to its first element.
for addit in additional:
    author_df[addit] = author_df[addit].map(lambda values: values[0])

In [None]:
### Fix up dates and numbers
#fix today and yesterday
def makeDates(x, scrape_date="2019-03-10"):
    """Parse a date string from the data into a ``datetime``.

    The relative labels "Today" and "Yesterday" are replaced by the scrape
    day and the day before it, then the text is parsed as ``%Y-%m-%d``.

    Parameters
    ----------
    x : str
        Date text: ``YYYY-MM-DD``, or containing "Today" / "Yesterday".
    scrape_date : str, optional
        Day (``YYYY-MM-DD``) that "Today" refers to. Defaults to the date
        previously hard-coded for this dataset.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the text does not match ``%Y-%m-%d`` after substitution.
    """
    # Imported locally so the function works whether the notebook's global
    # name ``datetime`` is bound to the module or to the class.
    from datetime import datetime as _datetime, timedelta as _timedelta

    scrape_day = _datetime.strptime(scrape_date, '%Y-%m-%d')
    if "Today" in x:
        x = x.replace("Today", scrape_day.strftime('%Y-%m-%d'))
    if "Yesterday" in x:
        day_before = scrape_day - _timedelta(days=1)
        x = x.replace("Yesterday", day_before.strftime('%Y-%m-%d'))
    return _datetime.strptime(x, '%Y-%m-%d')

In [None]:
# Parse 'registr' into datetimes, printing the element type before and after as a check.
print(type(author_df['registr'][0]))
author_df['registr'] = author_df['registr'].map(makeDates)
print(type(author_df['registr'][0]))

# Cast 'num_post' to int, again printing the element type before and after.
print(type(author_df['num_post'][0]))
author_df['num_post'] = author_df['num_post'].map(int)
print(type(author_df['num_post'][0]))

In [None]:
# Show the per-author table built above.
display(author_df)

### Get OSM UID using whosthat API

In [None]:
#replace punctuation with hex so that we can search using the API.

# Escaped punctuation character -> its bare hex code (e.g. '\\!' -> '21').
hex_ver_plain = {re.escape(punc): punc.encode('utf-8').hex() for punc in string.punctuation}

# One compiled alternation matching any single ASCII punctuation character.
pattern = re.compile('|'.join(hex_ver_plain))

# Punctuation character -> its percent-encoded form (e.g. '!' -> '%21').
hexver = {punc: "%{}".format(punc.encode('utf-8').hex()) for punc in string.punctuation}


def encode_function(matchobj):
    """Return the percent-encoded replacement for one matched punctuation character."""
    return hexver[matchobj.group(0)]


def convert_to_hex(name):
    """Percent-encode every ASCII punctuation character in *name*.

    The substitution is done in a single pass over the original text.
    Replacing one character at a time would re-encode the '%' signs
    introduced by earlier replacements whenever the name itself contains
    '%' after another punctuation character.

    Parameters
    ----------
    name : str
        Display name to encode.

    Returns
    -------
    str
        *name* with each punctuation character replaced by ``%`` followed
        by its two-digit lowercase hex code; other characters are unchanged.
    """
    return pattern.sub(encode_function, name)

In [None]:
# Percent-encode punctuation in each display name.
author_df['search_name'] = author_df['author'].map(convert_to_hex)
# Then replace every space with '+'.
author_df['search_name'] = author_df['search_name'].map(lambda encoded: encoded.replace(" ", "+"))

In [None]:
# Preview the table now that 'search_name' has been added.
display(author_df.head())

In [None]:
#Get UIDs from the whosthat API
# One row (plain dict) per searched name; the frame is built once at the end
# instead of creating and concatenating a one-row DataFrame per request.
id_list = []

# A single session reuses the HTTP connection across the sequential requests.
with requests.Session() as session:
    for name in author_df['search_name']:
        api_location = "http://whosthat.osmz.ru/whosthat.php?action=info&name={}".format(name)
        # The timeout stops one stalled request from hanging the whole run;
        # requests raises a Timeout exception instead.
        response = session.get(api_location, timeout=30)
        result = response.json()
        if not result:
            # No match for this display name.
            details = {'id': None, 'all_names': None}
        else:
            # Only the first match is used.
            details = {'id': result[0]['id'], 'all_names': result[0]['names']}
        id_list.append(details)

# dtype=object keeps missing values as None rather than coercing the id column to float/NaN.
id_df = pd.DataFrame(id_list, index=list(author_df['search_name']),
                     columns=['id', 'all_names'], dtype=object)

In [None]:
# Preview the lookup results (indexed by search name).
display(id_df.head())

### Combine with unique author dataframe

Now we need to put the current display name (and ID) into the main DF and rerun the aggregation, this time grouped by `current_name`.

In [None]:
# Attach each author's lookup result by matching 'search_name' against the index of id_df.
id_author_df = pd.merge(author_df, id_df, left_on="search_name", right_index=True)
print(id_author_df['id'].isnull().sum()) #missing 5582 (may not have made any map edits)

In [None]:
# Collect every display name known for an author, keyed to the value used to rank recency.
def getAllNames(listed_name,all_names):
    """Return a ``{name: rank_value}`` dict for one author.

    With no name history (``None``) or a history of exactly one entry, the
    forum name *listed_name* is returned with the placeholder value 1.
    Otherwise each history entry contributes ``entry['name'] -> entry['last']``.
    """
    has_history = all_names is not None and len(all_names) != 1
    if not has_history:
        return {listed_name: 1}
    return {entry['name']: entry['last'] for entry in all_names}

In [None]:
# Build the name -> recency mapping for every author row.
id_author_df['recent_names'] = id_author_df.apply(lambda row: getAllNames(row['author'], row['all_names']), axis=1)

In [None]:
# Pick the current name out of a {name: rank_value} mapping.
def getCurrentName(recent_names):
    """Return the key whose value is largest (the first such key on ties)."""
    return max(recent_names, key=recent_names.get)

In [None]:
# Resolve each author's current display name from its recency mapping.
id_author_df['current_name'] = id_author_df['recent_names'].map(getCurrentName)

In [None]:
def getPastNames(current_name,all_names):
    """List the names in *all_names* that differ from *current_name*.

    Returns an empty list when there is no history (``None``) or the history
    holds exactly one entry.
    """
    if all_names is not None and len(all_names) != 1:
        return [entry['name'] for entry in all_names if entry['name'] != current_name]
    return []

In [None]:
# Names other than the current one, per author row.
id_author_df['past_names'] = id_author_df.apply(lambda row: getPastNames(row['current_name'], row['all_names']), axis=1)

In [None]:
# Preview the author table with IDs, current names and past names.
display(id_author_df.head())

In [None]:
# Sanity checks, in order: row count, distinct current names,
# distinct IDs plus rows without an ID, distinct IDs.
print(id_author_df.shape[0])
print(id_author_df['current_name'].nunique())
print(id_author_df['id'].nunique() + id_author_df['id'].isnull().sum())
print(id_author_df['id'].nunique())

In [None]:
# Keep only the columns needed to link display names to current names and IDs.
check_author_df = id_author_df.loc[:, ['author', 'current_name', 'id', 'past_names']]

In [None]:
# Attach the current name, ID and past names to every post via its 'author' value.
full_id_df = pd.merge(full_data, check_author_df, on="author")

In [None]:
#save this
#full_id_df.to_csv('Full_wID.csv', mode ='w')

### Remake the author DF (to get edits/changeset)

In [None]:
# Preview the posts with the author identity columns merged in.
display(full_id_df.head())

In [None]:
# Per current name: number of distinct post times, topics and sub-forums found in the data.
current_name = full_id_df.groupby(['current_name'])[['time', 'topic_title', 'forum_title']].nunique()

# Extra details, each collected as the array of distinct values per current name.
# 'author' is gathered here in place of 'past_names'.
additional = ['role', 'from', 'registr', 'num_post', 'id', 'author'] #,'past_names'
making = [current_name] + [
    pd.Series(full_id_df.groupby(['current_name'])[detail].unique())
    for detail in additional
]

# Join the pieces column-wise and turn the current_name index back into a column.
unique_author_df = pd.concat(making, axis=1, sort=False).reset_index()

In [None]:
# 'id' and 'role' are reduced to the first of their distinct values.
for addit in ['id', 'role']:
    unique_author_df[addit] = unique_author_df[addit].map(lambda values: values[0])

# Smallest 'registr' value among those seen for the person.
unique_author_df['early_registr'] = unique_author_df['registr'].map(min)

# Sum of the distinct 'num_post' values seen for the person.
unique_author_df['total_posts'] = unique_author_df['num_post'].map(lambda counts: int(sum(counts)))

In [None]:
# Show the table aggregated by current name.
display(unique_author_df)

### Get edits + reg date from OSM public profiles

In [None]:
def safe_find(element, error_message, *args, **kargs):
    """Call ``element.find(*args, **kargs)`` and fall back on failure.

    Parameters
    ----------
    element : object
        Anything expected to expose ``find``; may be ``None``.
    error_message : object
        Value returned when the call raises.
    *args, **kargs
        Forwarded unchanged to ``element.find``.

    Returns
    -------
    object
        The result of ``element.find(...)``, or *error_message* if it raised.
    """
    try:
        return element.find(*args, **kargs)
    except Exception:
        # Covers e.g. AttributeError when *element* is None. Unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
        return error_message

def safe_findAll(element, error_message, *args, **kargs):
    """Call ``element.findAll(*args, **kargs)`` and fall back on failure.

    Parameters
    ----------
    element : object
        Anything expected to expose ``findAll``; may be ``None``.
    error_message : object
        Value returned when the call raises.
    *args, **kargs
        Forwarded unchanged to ``element.findAll``.

    Returns
    -------
    object
        The result of ``element.findAll(...)``, or *error_message* if it raised.
    """
    try:
        return element.findAll(*args, **kargs)
    except Exception:
        # Covers e.g. AttributeError when *element* is None. Unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
        return error_message

def getEdit(current_name):
    """Scrape a user's public OSM profile page for join text and edit count.

    Parameters
    ----------
    current_name : str
        OSM display name; it is percent-encoded before being put in the URL.

    Returns
    -------
    dict
        ``{'joined': str or None, 'num_edits': int or None}``. ``joined`` is
        the stripped text after "Mapper since: ". A value is ``None`` when
        the corresponding element is not found on the page.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    edit_dict = {'joined': None, 'num_edits': None}

    # Encode the name as a single path segment so characters such as
    # spaces, '/', '?' or '#' cannot alter the requested URL.
    profile_url = "https://www.openstreetmap.org/user/{}".format(quote(str(current_name), safe=''))
    # The timeout stops one stalled request from hanging the whole crawl.
    response = requests.get(profile_url, timeout=30).text
    # NOTE(review): `bs` is not imported anywhere in the visible notebook;
    # presumably `from bs4 import BeautifulSoup as bs` — confirm.
    soup = bs(response)
    content = soup.find("div",{"class":"content-inner"})
    user_info = safe_find(content,None,"div",{"class":"userinformation-inner"})
    if user_info is None:
        return edit_dict

    # Each lookup is checked separately so a missing element yields None
    # instead of an AttributeError on `.small` / `.text`.
    joined_block = safe_find(user_info,None,"p",{"class":"deemphasize"})
    joined_small = getattr(joined_block, "small", None)
    if joined_small is not None:
        edit_dict['joined'] = re.sub("Mapper since: ","",joined_small.text).strip()

    count_tag = safe_find(user_info,None,"span",{"class":"count-number"})
    if count_tag is not None:
        # Drop thousands separators before converting.
        edit_dict['num_edits'] = int(re.sub(",","",count_tag.text))
    return edit_dict

In [None]:
# Names to look up on OSM, and how many there are.
current_list = unique_author_df['current_name']
total_number = current_list.shape[0]

In [None]:
# Fetch profile details for each current name, one single-row frame per name.
edits_list = []
for c, name in enumerate(current_list):
    edits = getEdit(name)
    edits_list.append(pd.DataFrame([edits.values()], index=[name], columns=edits.keys()))
    through = ((c + 1) / total_number) * 100
    # Report progress on every 100th name.
    if c % 100 == 0:
        print("{}% through".format(through))
# Stack the single-row frames into one table indexed by name.
edits_df = pd.concat(edits_list, axis=0)

In [None]:
# Show the scraped join text and edit counts.
display(edits_df)

In [None]:
# Attach the scraped profile details by matching 'current_name' against the index of edits_df.
full_num_df = pd.merge(unique_author_df, edits_df, left_on="current_name", right_index=True)

In [None]:
#remove " | Contributor terms: Undecided" if present and make a date
def removeContrib(date):
    """Parse the scraped "Mapper since" text into a ``datetime``.

    Anything from the first ``|`` onwards (e.g. "| Contributor terms:
    Undecided") is discarded and surrounding whitespace is stripped, so the
    result does not depend on the exact whitespace preceding the ``|``.

    Parameters
    ----------
    date : str or None
        Text such as ``"March 10, 2019"``, optionally followed by ``|`` and
        more text.

    Returns
    -------
    datetime or None
        ``None`` when *date* is ``None``.

    Raises
    ------
    ValueError
        If the remaining text does not match ``%B %d, %Y``.
    """
    if date is None:
        return None
    clean = date.split("|", 1)[0].strip()
    return datetime.strptime(clean, '%B %d, %Y')

In [None]:
# Parse the raw 'joined' text into a date column.
full_num_df['joined_osm'] = full_num_df['joined'].map(removeContrib)

In [None]:
# Preview the author table with OSM join date and edit count.
display(full_num_df.head())

In [None]:
# Counts of rows with missing edits, missing IDs, and mismatched post totals.
print(full_num_df['num_edits'].isnull().sum()) #267 have no record on OSM (might be private profiles)
print(full_num_df['id'].isnull().sum()) # 5582 have no id associated (havent made an edit since switching name?)
print((full_num_df['time'] != full_num_df['total_posts']).sum()) #531 have a different number of posts than counted

In [None]:
# Existing column labels, in their current order.
current_names = full_num_df.columns.values
# Existing label -> final label.
desired_dict = dict([
    ('current_name', 'current_name'),
    ('time', 'posts_found'),
    ('topic_title', 'num_topics'),
    ('forum_title', 'num_forums'),
    ('role', 'role'),
    ('from', 'from_list'),
    ('registr', 'registr_list'),
    ('num_post', 'post_list'),
    ('id', 'osm_id'),
    ('author', 'past_names'),
    ('early_registr', 'forum_registr'),
    ('total_posts', 'total_posts'),
    ('joined', 'joined_dirty'),
    ('num_edits', 'num_edits'),
    ('joined_osm', 'joined_osm'),
])
# New labels in the same order as the existing columns (KeyError on an unmapped column).
set_names = list(map(desired_dict.__getitem__, current_names))

In [None]:
# Apply the new labels positionally; set_names follows the existing column order.
full_num_df.columns = set_names

In [None]:
#full_num_df.to_csv('authors_with_num_edits.csv', mode = 'w') ##maybe make categorical here

### Combine all author details with the forum posts dataset

In [None]:
#again create a simplified version that can be merged back into the main dataframe
required_columns = ['current_name','past_names','role','from_list','forum_registr',
                    'total_posts','osm_id','joined_osm','num_edits']
# Nothing earlier in this notebook adds 'edit_cat' to full_num_df, so selecting it
# unconditionally raises KeyError. It is kept only when the column is present
# (presumably after reloading a saved author table that carries it — confirm).
optional_columns = [col for col in ['edit_cat'] if col in full_num_df.columns]
# .copy() gives an independent frame, so adding a column to it later does not
# assign into a slice of full_num_df.
check_author_edit_df = full_num_df[required_columns + optional_columns].copy()

In [None]:
# Show the simplified author table.
display(check_author_edit_df)

In [None]:
#If loading data with list in a csv - need to remake list
def remakeList(x):
    """Turn a stored array of names back into a Python list.

    Parameters
    ----------
    x : str or iterable
        Either the text form of an array as saved in a CSV, with elements
        separated by whitespace (e.g. ``"['a' 'b']"``), or an in-memory
        list/array.

    Returns
    -------
    list

    Notes
    -----
    Commas are inserted between adjacent quoted elements before parsing.
    Without them ``ast.literal_eval`` would silently concatenate neighbouring
    string literals. Any whitespace run (including the line breaks of a
    wrapped array) and either quote character count as a separator.
    """
    if not isinstance(x, str):
        # Already a sequence (data not round-tripped through a CSV).
        return list(x)
    y = re.sub(r"""(['"])\s+(['"])""", r"\1,\2", x)
    return ast.literal_eval(y)
# Rebuild a real list for every row, then show the type of the entry labelled 0 as a check.
check_author_edit_df['past_names_list'] = check_author_edit_df['past_names'].map(remakeList)
type(check_author_edit_df['past_names_list'][0])

In [None]:
# Build a long-format lookup linking each current name (from check_author_edit_df)
# to every display name in its past_names_list, for joining against the main df.
# Steps: spread each list across columns (indexed by current_name), stack the
# columns into rows, drop the positional level, then restore current_name as a column.
df2 = pd.DataFrame(check_author_edit_df['past_names_list'].tolist(), index=[check_author_edit_df['current_name']]).stack().reset_index(level=1, drop=True).reset_index()
# The two resulting columns are named positionally.
df2.columns = ["current_name","author"]

In [None]:
# Row count of the name lookup, followed by the table itself.
print(df2.shape[0])
display(df2)

In [None]:
# Give every display name a full author record by joining the lookup to the
# author details on 'current_name'.
check_author_edit_full_df = pd.merge(df2, check_author_edit_df, on="current_name")

In [None]:
# Attach the author details to every post via its 'author' (display name) value.
full_id_num_df = pd.merge(full_data, check_author_edit_full_df, on="author")

In [None]:
# Preview the posts with author details merged in.
display(full_id_num_df.head())

In [None]:
# Write the merged posts + author details to disk, overwriting any existing file.
full_id_num_df.to_csv('Full_with_ID+num_edits.csv', mode = 'w')

## Building the networks

In [None]:
import networkx as nx
from networkx.algorithms import bipartite
import community

In [None]:
# Load the merged posts + author details; the first CSV column is used as the row index.
full_df = pd.read_csv('Full_with_ID+num_edits.csv',index_col=0)

In [None]:
# Preview the loaded data and list its column names.
display(full_df.head())
display(full_df.columns)

In [None]:
### Fix up dates and numbers
#fix today and yesterday
def makeDates(x, scrape_date="2019-03-10"):
    """Parse a date string from the data into a ``datetime``.

    The relative labels "Today" and "Yesterday" are replaced by the scrape
    day and the day before it, then the text is parsed as ``%Y-%m-%d``.

    Parameters
    ----------
    x : str
        Date text: ``YYYY-MM-DD``, or containing "Today" / "Yesterday".
    scrape_date : str, optional
        Day (``YYYY-MM-DD``) that "Today" refers to. Defaults to the date
        previously hard-coded for this dataset.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the text does not match ``%Y-%m-%d`` after substitution.
    """
    # The notebook imports ``from datetime import datetime``, so the global
    # name is the class and ``datetime.datetime`` does not resolve. A local
    # module import works regardless of what the global name is bound to.
    import datetime as _dt

    scrape_day = _dt.datetime.strptime(scrape_date, '%Y-%m-%d')
    if "Today" in x:
        x = x.replace("Today", scrape_day.strftime('%Y-%m-%d'))
    if "Yesterday" in x:
        day_before = scrape_day - _dt.timedelta(days=1)
        x = x.replace("Yesterday", day_before.strftime('%Y-%m-%d'))
    return _dt.datetime.strptime(x, '%Y-%m-%d')

# Make 'registr' a date again (element type printed before and after as a check)
print(type(full_df['registr'][0]))
full_df['registr'] = full_df['registr'].map(lambda x: makeDates(x))
print(type(full_df['registr'][0]))

#convert time to a timestamp again
# pd.to_datetime is used instead of datetime.datetime.strptime: the notebook
# imports `from datetime import datetime`, under which `datetime.datetime`
# does not resolve. The expected text format is unchanged.
full_df['time'] = pd.to_datetime(full_df['time'], format='%Y-%m-%d %H:%M:%S')

def makeStr(x):
    """Render a number as an integer string; missing values are returned unchanged."""
    return x if pd.isnull(x) else str(int(round(x)))

# String version of the OSM id (the column holds floats at this point).
full_df['osm_id_str'] = full_df['osm_id'].map(makeStr)

### Make edits categorical

Edit counts are binned into categories because they are power-law distributed.

In [None]:
# Number of edits as log
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Histogram of edit counts per contributor (missing values dropped), with a
# log-scaled y axis.
# NOTE(review): this reads `full_num_df`, the author table built in the first
# half of the notebook, not the `full_df` loaded for this section — confirm it
# is in memory when this cell runs.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
contrib_plot = sns.distplot(full_num_df['num_edits'].dropna(),color="blue",kde=False)
plt.yscale('log')
# plt.xscale('log')
contrib_plot.set_title("Histogram of contributions by number of contributors") #(log) 
contrib_plot.set(xlabel="Number of contributions",ylabel="Number of contributors")
# fig = contrib_plot.get_figure()
# fig.savefig("contributor_hist.png")
plt.show()

#read out of groups - same size-ish
# The bucket edges match bin_edits: every upper bound is inclusive, so counts
# of exactly 1000 or 10000 are no longer left out of every bucket, and
# "Less than 10" no longer also counts the zero-edit rows reported as "None".
edit_counts = full_num_df['num_edits']
print(len(edit_counts))
print()
print("NA:", edit_counts.isnull().sum())
print("None:", (edit_counts == 0).sum())
print("Less than 10:", ((edit_counts > 0) & (edit_counts <= 10)).sum())
print("Between 10 and 100:", ((edit_counts > 10) & (edit_counts <= 100)).sum())
print("Between 100 and 1000:", ((edit_counts > 100) & (edit_counts <= 1000)).sum())
print("Between 1000 and 10000:", ((edit_counts > 1000) & (edit_counts <= 10000)).sum())
print("Over 10000:", (edit_counts > 10000).sum())

In [None]:
###make edits categorical
def bin_edits(x):
    """Map an edit count to its category label.

    Parameters
    ----------
    x : number or missing value
        Number of edits.

    Returns
    -------
    str or None
        "No edits" for 0; otherwise the label of the first bucket whose
        inclusive upper bound (10, 100, 1000, 10000) is at least *x*, or
        "Over 10000". ``None`` for missing values and for negative input,
        which no bucket covers.
    """
    if pd.isnull(x) or x < 0:
        return None
    if x == 0:
        return "No edits"
    # Buckets in ascending order; each upper bound is inclusive.
    buckets = (
        (10, "Less than 10"),
        (100, "Between 10 and 100"),
        (1000, "Between 100 and 1000"),
        (10000, "Between 1000 and 10000"),
    )
    for upper_bound, label in buckets:
        if x <= upper_bound:
            return label
    return "Over 10000"

In [None]:
# Category label per post's author, as text (missing counts become the string "None").
full_df['edit_cat'] = [str(bin_edits(count)) for count in full_df['num_edits']]

In [None]:
#make an attribute dict here for the nodes (would be better lower down but oh well)
# The edit category is read from 'edit_cat': that is the column this notebook
# creates, whereas no visible cell creates 'edit_cat_str'.
full_df['attr'] = full_df.apply(lambda x: {'role':x['role_y'],'osm_id':str(x['osm_id_str']),
                                           'num_edits':x['num_edits'], 'num_edits_cat':x['edit_cat'],
                                           'num_posts':x['num_post'],'forum_registr':x['forum_registr'], 'joined_osm':x['joined_osm']},axis=1) 
#coerced to string to avoid nonetype

### Functions for making dates, networks and metrics

In [None]:
def allTime(data=None, jump=1):
    """Return the start dates of the analysis windows.

    Dates run from 2007-12-16 (exclusive) in steps of *jump* days, up to 30
    days before the midnight that follows the latest value in ``data['time']``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a datetime ``time`` column. Defaults to the global
        ``full_df``, looked up when the function is called.
    jump : int, optional
        Number of days between consecutive start dates.

    Returns
    -------
    pandas.DatetimeIndex
    """
    if data is None:
        # Resolved at call time, not definition time, so the function can be
        # defined before ``full_df`` exists and always sees its current value.
        data = full_df
    total_start = pd.Timestamp("2007-12-16 00:00:00")
    # Midnight at the start of the day after the most recent post.
    # pandas types are used throughout because the notebook's global name
    # ``datetime`` is the class, which has no ``timedelta`` attribute.
    total_end = (pd.Timestamp(data['time'].max()) + pd.Timedelta(days=1)).normalize()
    date_range = pd.date_range(start=total_start,
                               end=total_end - pd.Timedelta(days=30),
                               freq=pd.DateOffset(days=jump))
    # Right-closed interval: drop the start date itself. Filtering here avoids
    # date_range's `closed` keyword, which newer pandas no longer accepts.
    return date_range[date_range > total_start]

In [None]:
#function that makes all the good stuff given a start date
def makeData(start_date, period, source_df=None):
    """Collect the node lists, node attributes and edges for one time window.

    Parameters
    ----------
    start_date : pandas.Timestamp or datetime
        Start of the window.
    period : int
        Window length in days; both ends of the window are inclusive.
    source_df : pandas.DataFrame, optional
        Posts with ``time``, ``current_name``, ``topic_title`` and ``attr``
        columns. Defaults to the global ``full_df``.

    Returns
    -------
    dict
        ``start`` (the given start date), ``authors_list`` and
        ``topics_list`` (distinct values in order of appearance),
        ``authors_attrib`` (``current_name -> attr``; the last row in the
        window wins) and ``edge_list`` (distinct ``(topic_title,
        current_name)`` tuples sorted by topic, then name).
    """
    if source_df is None:
        source_df = full_df

    # pd.Timedelta is used because the notebook's global name ``datetime`` is
    # the class, which has no ``timedelta`` attribute.
    end_date = start_date + pd.Timedelta(days=period)
    window_df = source_df[source_df['time'].between(start_date, end_date)]

    # One edge per distinct (topic, author) pair. Rows missing either value
    # are skipped, as a groupby on the two columns would skip them.
    pairs = (window_df[['topic_title', 'current_name']]
             .dropna()
             .drop_duplicates()
             .sort_values(['topic_title', 'current_name']))

    data = {'start': start_date}
    ## unique authors in dataframe (bottom nodelist)
    data['authors_list'] = list(window_df['current_name'].unique())
    ## unique authors attributes
    data['authors_attrib'] = dict(zip(window_df['current_name'], window_df['attr']))
    ## unique topics in dataframe (top nodelist)
    data['topics_list'] = list(window_df['topic_title'].unique())
    ## edge list between the two
    data['edge_list'] = list(pairs.itertuples(index=False, name=None))
    return data

#function that builds only the projected graph from the data
def buildOnlyProjGraph(data):
    """Build the weighted author-author projection for one window.

    *data* is a dict with 'start', 'topics_list', 'authors_list',
    'edge_list' and 'authors_attrib' entries. Returns the projection of the
    topic/author bipartite graph onto its non-topic nodes, with
    'authors_attrib' applied as node attributes.
    """
    # Bipartite graph: side 0 holds topics, side 1 holds authors.
    two_mode = nx.Graph(name=data['start'])
    for side, key in enumerate(('topics_list', 'authors_list')):
        two_mode.add_nodes_from(data[key], bipartite=side)
    two_mode.add_edges_from(data['edge_list'])

    topic_nodes = {node for node, attrs in two_mode.nodes(data=True) if attrs['bipartite'] == 0}
    author_nodes = set(two_mode) - topic_nodes

    # Weighted projection onto the author side.
    projected = bipartite.weighted_projected_graph(two_mode, author_nodes)
    nx.set_node_attributes(projected, data['authors_attrib']) #may actually want to remove attribute where 'none'?
    return projected

In [None]:
# metrics functions 
def gini(list_of_values):
    """Return the Gini coefficient of a collection of non-negative numbers.

    Parameters
    ----------
    list_of_values : iterable of numbers

    Returns
    -------
    float
        0 when all values are equal, approaching 1 as the total concentrates
        in a single element. Empty or all-zero input returns 0.0 (nothing is
        unequally distributed) instead of dividing by zero.
    """
    sorted_list = sorted(list_of_values)
    height, area = 0, 0
    # Accumulate the area under the cumulative sum of the sorted values.
    for value in sorted_list:
        height += value
        area += height - value / 2.
    # Area under the line of perfect equality.
    fair_area = height * len(sorted_list) / 2.
    if fair_area == 0:
        return 0.0
    return (fair_area - area) / fair_area


def partition_from_attr(G, attr):
    """Build a ``node -> community id`` mapping from a node attribute.

    Nodes sharing a value of *attr* get the same integer id. Each node
    lacking the attribute is placed in a community of its own; those ids
    start after the attribute ids, so the two ranges cannot collide.

    Parameters
    ----------
    G : graph-like
        Object whose ``nodes`` iterates over node keys and maps each key to
        that node's attribute dict.
    attr : hashable
        Attribute name; its values must be hashable.

    Returns
    -------
    dict
    """
    # dict.fromkeys keeps first-seen order, so ids are the same on every run.
    attr_values = dict.fromkeys(
        G.nodes[node][attr] for node in G.nodes if attr in G.nodes[node]
    )
    attr_dict = {value: idx for idx, value in enumerate(attr_values)}

    part_dict = {}
    next_free_id = len(attr_dict)
    for node in G.nodes:
        node_attrs = G.nodes[node]
        if attr in node_attrs:
            part_dict[node] = attr_dict[node_attrs[attr]]
        else:
            part_dict[node] = next_free_id
            next_free_id += 1

    return part_dict

# function that calculates metrics for the whole graph
def getMetrics(g,name):
    """Compute one row of network metrics for the projected author graph *g*.

    Parameters
    ----------
    g : networkx.Graph
        Author-author graph with 'weight' edge attributes; nodes may carry a
        'num_edits_cat' attribute.
    name : hashable
        Index label for the returned row.

    Returns
    -------
    pandas.DataFrame
        Single-row frame indexed by *name*, one column per metric. The
        '5_*' columns hold lists of node names.

    Raises
    ------
    ValueError
        If *g* has no nodes (there is no largest component to select).
    """
    # Largest connected component, as an independent copy.
    # `nx.connected_component_subgraphs` is no longer available in current
    # networkx, so the component is rebuilt from its node set.
    giant = g.subgraph(max(nx.connected_components(g), key=len)).copy()
    metrics = {}
    
    #1 -- Participatory
    #basics
    metrics['1_num_nodes'] = len(g.nodes)
    metrics['1_num_edges'] = g.size()
    metrics['1_weight_edges'] = g.size(weight='weight')
    ## Percentage isolates, giant component, other
    metrics['1_per_iso'] = len([node for node in g.nodes() if g.degree[node] == 0]) /len(g.nodes)
    #Percentage in giant component
    metrics['1_per_giant'] = len(giant)/len(g.nodes)
    
    
    #2 -- influence inequality
    
    ##inequality measure of interactions
    giant_deg_seq = sorted([d for n, d in giant.degree(weight="weight")], reverse=True)
    metrics['2_giant_gini'] = gini(giant_deg_seq)
    
    ##unique connections
    giant_deg_seq_unweight = sorted([d for n, d in giant.degree()], reverse=True)
    metrics['2_giant_gini_unweight'] = gini(giant_deg_seq_unweight)
    
    ## % contributions by top 10
    wdegree = pd.Series(dict(g.degree(weight="weight"))).sort_values(ascending=False)
    # Clamped at 0: for small graphs round(n/10) - 1 is negative, and a
    # negative slice bound would select almost every node instead of none.
    # NOTE(review): the "- 1" means one node fewer than 10% is taken; it is
    # kept so results for larger graphs are unchanged — confirm it is intended.
    top_k = max(round(len(wdegree)/10) - 1, 0)
    top10 = wdegree.iloc[:top_k]
    metrics['2_top10_per_contrib'] = top10.sum()/wdegree.sum()
    
    ## % contributions by top 10 unweighted
    degree = pd.Series(dict(g.degree())).sort_values(ascending=False)
    top_k_unweigh = max(round(len(degree)/10) - 1, 0)
    top10_unweigh = degree.iloc[:top_k_unweigh]
    metrics['2_top10_per_contrib_unweigh'] = top10_unweigh.sum()/degree.sum()
    
    # 3 -- meritocracy
    
    # edit assortativity
    attribute = "num_edits_cat"

    try:
        # Restrict to nodes whose category is known (the string "None" marks a missing category).
        nodelist = [node_name for node_name in giant.nodes if giant.nodes[node_name][attribute] != "None"]
        metrics['3_{}_assort'.format(attribute)] = nx.attribute_assortativity_coefficient(giant,attribute,nodelist)
    except KeyError:
        metrics['3_{}_assort'.format(attribute)] = None
        
    try:
        metrics['3_{}_assort_unfilt'.format(attribute)] = nx.attribute_assortativity_coefficient(giant,attribute)
    except KeyError:
        metrics['3_{}_assort_unfilt'.format(attribute)] = None
    
    # Attribute modularity
    attrib_part = partition_from_attr(giant,"num_edits_cat")
    metrics['3_edit_modularity'] = community.modularity(attrib_part,giant)
    
    #weighted degree assortativity
    metrics['3_giant_weightDA'] = nx.degree_assortativity_coefficient(giant,weight="weight")
    
    #4 -- Decentralisation
    
    ##identifying communities
    part = community.best_partition(giant,weight='weight')
    metrics['4_comm_modularity'] = community.modularity(part,giant)
    
    metrics['4_num_communities'] = len(pd.Series(part).unique())
    metrics['4_comm_size_gini'] = gini(list(pd.Series(part).value_counts()))
    
    #5 -- persistence
    
    metrics['5_all_nodes'] = list(g.nodes())
    metrics['5_giant_nodes'] = list(giant.nodes()) #list added here but not checked
    metrics['5_top10_nodes'] = list(top10.index)
       
    
    # Miscellaneous structural measures
    ##clustering coefficient
    metrics["cluster_coeff"] = nx.average_clustering(giant)
    ## Transitivity - tells us the number of open two paths in all potential triangles
    metrics["transitivity"] = nx.transitivity(giant)
    ## Density
    metrics["density"] = nx.density(giant)
    
    # Materialise the dict views so the row and its labels are plain lists.
    metrics_df = pd.DataFrame([list(metrics.values())],index=[name], columns=list(metrics.keys()))
    return metrics_df

In [None]:
#Summary function that rolls them all together
def getAll(date_list):
    """Compute the metrics row for every 30-day window starting at each date.

    Prints progress as a percentage on every 30th date and returns all rows
    concatenated into one DataFrame.
    """
    total = len(date_list)
    frames = []
    for position, window_start in enumerate(date_list):
        window = makeData(window_start, 30)
        projected = buildOnlyProjGraph(window)
        frames.append(getMetrics(projected, window['start']))
        if position % 30 == 0:
            print("{:.2%}".format((position + 1) / total))
    return pd.concat(frames, axis=0)

### Run it all

In [None]:
# Window start dates spaced 15 days apart, then the metrics for each window.
all_days_15 = allTime(data=full_df, jump=15)
all_days_15_df = getAll(date_list=all_days_15)

In [None]:
# Preview the first rows of the per-window metrics.
display(all_days_15_df.head())

In [None]:
### making Jaccard similarity (with a shift of 2 to create non-overlapping windows)

# For each node-list column, add a '<column>_previous' column holding the
# list from two rows earlier.
for node_col in ('5_all_nodes', '5_giant_nodes', '5_top10_nodes'):
    all_days_15_df[node_col + '_previous'] = all_days_15_df[node_col].shift(2)

In [None]:
## Jaccard similarity 

def makeSim(past,present):
    """Return the Jaccard similarity between two collections of nodes.

    Parameters
    ----------
    past : list or other
        Earlier node list. Anything that is not a list (e.g. the NaN left
        in the first rows by a shift) means no comparison is possible.
    present : iterable
        Current node list.

    Returns
    -------
    float or None
        ``|past ∩ present| / |past ∪ present|``. ``None`` when *past* is not
        a list, or when both collections are empty (the ratio is undefined).
    """
    if not isinstance(past, list):
        return None
    present_s, past_s = set(present), set(past)
    union = present_s | past_s
    if not union:
        return None
    return len(present_s & past_s) / len(union)


In [None]:
# Jaccard similarity between each node list and its '_previous' counterpart.
for node_col in ('5_all_nodes', '5_giant_nodes', '5_top10_nodes'):
    all_days_15_df[node_col + '_jac_sim'] = all_days_15_df.apply(
        lambda row, col=node_col: makeSim(row[col + '_previous'], row[col]), axis=1)


In [None]:
# Preview the metrics with the similarity columns added.
display(all_days_15_df.head())

In [None]:
# Write the per-window metrics to disk, overwriting any existing file.
all_days_15_df.to_csv('final_network_output.csv', mode = 'w')

### Unused graph building functions

In [None]:
#function that builds a Bipartite graph from the data
def buildAllGraphs(data,weight=2):
    """Build the bipartite graph and two author projections for one window.

    *data* is a dict with 'start', 'topics_list', 'authors_list',
    'edge_list' and 'authors_attrib' entries. Returns a dict with keys
    'bipartite', 'simple_projected', 'cut_projected' (projection without
    edges lighter than *weight* and without the nodes left isolated),
    'top_nodes' and 'bottom_nodes'.
    """
    # Bipartite graph: side 0 holds topics, side 1 holds authors.
    two_mode = nx.Graph(name=data['start'])
    for side, key in enumerate(('topics_list', 'authors_list')):
        two_mode.add_nodes_from(data[key], bipartite=side)
    two_mode.add_edges_from(data['edge_list'])
    top_nodes = {node for node, attrs in two_mode.nodes(data=True) if attrs['bipartite'] == 0}
    bottom_nodes = set(two_mode) - top_nodes

    # Full weighted projection onto the author side.
    projected = bipartite.weighted_projected_graph(two_mode, bottom_nodes)
    nx.set_node_attributes(projected, data['authors_attrib']) # may actually want to remove attribute where 'none'?

    # Thresholded copy: drop light edges first, then the nodes left without any edge.
    trimmed = projected.copy()
    light_edges = [(u, v) for u, v, attrs in trimmed.edges(data=True) if attrs["weight"] < weight]
    trimmed.remove_edges_from(light_edges)
    isolated = [node for node, deg in trimmed.degree() if deg < 1]
    trimmed.remove_nodes_from(isolated)

    return {
        'bipartite': two_mode,
        'simple_projected': projected,
        'cut_projected': trimmed,
        'top_nodes': top_nodes,
        'bottom_nodes': bottom_nodes,
    }

def _draw_projection(graph, node_color):
    # Lay out one projected graph and draw it with small nodes and faint edges.
    layout = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, layout, node_size=2, node_color=node_color, alpha=.8)
    nx.draw_networkx_edges(graph, layout, width=1, alpha=.2)
    plt.axis('off')
    plt.show()


#function that visualises graphs as desired -- make it better
def visGraphs(graphs,bipart=True,sim_proj=True,cut_proj=True):
    """Draw the graphs held in *graphs*, one figure per enabled flag.

    *graphs* is a dict with 'bipartite', 'simple_projected', 'cut_projected',
    'top_nodes' and 'bottom_nodes' entries. Each of *bipart*, *sim_proj* and
    *cut_proj* switches the corresponding drawing on when it equals True.
    """
    two_mode = graphs['bipartite']
    projected = graphs['simple_projected']
    trimmed = graphs['cut_projected']

    if bipart == True:
        pos = nx.spring_layout(two_mode)

        # Shift the top nodes up by 2 to separate the two node sets.
        for node_id in graphs['top_nodes']:
            x_coord, y_coord = pos[node_id][0], pos[node_id][1]
            pos[node_id] = np.array([x_coord, (y_coord + 2)])

        nx.draw_networkx_nodes(two_mode, pos, graphs['top_nodes'], node_color="skyblue", node_shape="D", alpha=.8, edgecolors="black")
        nx.draw_networkx_nodes(two_mode, pos, graphs['bottom_nodes'], node_color="lightgreen", node_shape="s", alpha=.8, edgecolors="black")
        nx.draw_networkx_edges(two_mode, pos, alpha=.2)
        plt.axis('off')
        plt.show()

    if sim_proj == True:
        _draw_projection(projected, "skyblue")

    if cut_proj == True:
        _draw_projection(trimmed, "lightgreen")
        