In [1]:
# This notebook takes the edge list (edgelist_ids.csv) and
# 1) maps each node name from its Twitter ID to a contiguous index starting from 0
# 2) saves the index <-> Twitter ID mapping to 'node_ids.csv'
# 3) saves the remapped edge list to 'edges.csv'
import pandas as pd
import time
from fun.fun import *
import csv

In [2]:
# Configuration
# Number of edge rows expected in the input file. Hardcoded rather than
# computed; it is used to size the read of the edge list.
edges_total = 684_732_453
# Location of the raw edge list.
edges_fn = "../data/edgelist_ids.csv"

In [3]:
# -> IN : load the edge list into a DataFrame
# `perc` is the percentage of `edges_total` rows to read (100 = all of them).
perc = 100
nrows = int(edges_total * perc / 100)

print("reading edges ... ", end='')
t0 = time.time()
df = pd.read_csv(edges_fn, nrows=nrows)
elapsed = time.time() - t0

# Report how many rows were actually loaded and the wall-clock read time.
summary = "read {:_} lines (took {:.1f}s)".format(len(df), elapsed)
print(summary)

reading edges ... read 684_732_453 lines (took 340.8s)


In [4]:
# DICT : build the lookup from node id to its position in the sorted id list
# Every id that appears as either endpoint of an edge counts as a node.
unique_nodes = set(df['source']) | set(df['target'])
print("Found {:_} unique nodes".format(len(unique_nodes)))

# Only the sort itself is timed.
print("sorting ... ", end='')
t0 = time.time()
nodes = list(unique_nodes)
nodes.sort()
elapsed = time.time() - t0
print("done (took {:.1f}s)".format(elapsed))

# Position in the sorted list becomes the node's index, starting at 0.
node_index = dict(zip(nodes, range(len(nodes))))

Found 410_885 unique nodes
sorting ... done (took 0.2s)


In [None]:
# <- OUT : write node ids to file
# One row per node: the mapped index followed by the original id it replaces.
# The `with` block guarantees the handle is closed (and buffered rows are
# flushed) even if a write raises part-way through.
with open("../data/node_ids.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['index', 'twitter id'])
    # node_index maps id -> index, so each pair is flipped to (index, id).
    # The loop variable is `node_id` rather than `id` so the builtin `id()`
    # is not shadowed for the rest of the kernel session.
    writer.writerows((i, node_id) for node_id, i in node_index.items())

In [5]:
# <- OUT : Write mapped values to csv
# Each edge is rewritten with both endpoints replaced by their node_index value.
fn = "../data/edges.csv"
rows_written, total = 0, len(df)
# The `with` block closes the file (flushing buffered rows) even if a lookup
# or write fails in the middle of the loop.
with open(fn, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['source', 'target'])
    # Iterate the two columns directly instead of df.iterrows(): iterrows builds
    # a Series object for every row, which is very slow at this row count and
    # does not preserve the columns' dtypes.
    for source, target in zip(df['source'], df['target']):
        writer.writerow([node_index[source], node_index[target]])
        # track_progress comes from the `fun.fun` star import; presumably it
        # advances and returns the counter and prints a progress line every
        # `inc` rows -- TODO confirm. Note that it rebinds the global `perc`.
        rows_written, perc = track_progress(total, rows_written, text="rows written:", inc=100)

 rows written: 684_732_401/684_732_453 (99.99999%)