# Squashy Subreddit Demo


In [3]:
# Demo dependencies

# ! pip install netwulf
! pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.1.1.tar.gz (222 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.8/222.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wordcloud
  Building wheel for wordcloud (setup.py) ... [?25ldone
[?25h  Created wheel for wordcloud: filename=wordcloud-1.9.1.1-cp310-cp310-macosx_13_0_arm64.whl size=155637 sha256=261eb72ae16f31d915778460d8072761ac3750f4bf9e81792b2b4072880b7267
  Stored in directory: /Users/james/Library/Caches/pip/wheels/30/ea/66/ee489514316741d85dc783d0dbd46e987171f3e63d666b1e6c
Successfully built wordcloud
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update

In [6]:
# Open a connection to the Memgraph instance used throughout this demo.
from mini_memgraph import Memgraph

# Points at a local instance on port 7687; edit both values to target another instance.
db = Memgraph(
    address='localhost',
    port=7687,
)


# WARNING!
Running the cells below deletes all existing data in your Memgraph instance and replaces it with the demo dataset. Please ensure you are connected to a new, empty instance of Memgraph before proceeding.

# Loading the Data
The dataset is the Subreddit Hyperlink Network (body) available from [SNAP](https://snap.stanford.edu/data/soc-RedditHyperlinks.html).

Download `soc-redditHyperlinks-body.tsv` from [SNAP](https://snap.stanford.edu/data/soc-RedditHyperlinks.html) and place it in the same folder as this notebook.

```
@inproceedings{kumar2018community,
  title={Community interaction and conflict on the web},
  author={Kumar, Srijan and Hamilton, William L and Leskovec, Jure and Jurafsky, Dan},
  booktitle={Proceedings of the 2018 World Wide Web Conference on World Wide Web},
  pages={933--943},
  year={2018},
  organization={International World Wide Web Conferences Steering Committee}
}
```

In [7]:
# Location of the SNAP subreddit hyperlink (body) .tsv, relative to this notebook.
data_path = "soc-redditHyperlinks-body.tsv"

In [8]:
import pandas as pd

# Maps the dataset's column names onto the 'source'/'target' keys used for edges.
rename_cols = {'SOURCE_SUBREDDIT':'source','TARGET_SUBREDDIT':'target'}


def read_link_chunks(path, chunksize=10000):
    """Open the hyperlink .tsv for chunked reading of the two subreddit columns.

    Subreddit names are read as plain strings with pandas' default NA
    detection switched off, so a name such as 'null', 'nan' or 'NA' stays a
    string instead of being converted to a float NaN (which would otherwise
    end up as a node id or an edge endpoint).

    Parameters
    ----------
    path : str
        Path to the tab-separated dataset file.
    chunksize : int, default 10000
        Number of rows per chunk.

    Returns
    -------
    The chunked reader produced by ``pandas.read_csv``; use it as a context
    manager and iterate over it to obtain DataFrames.
    """
    return pd.read_csv(
        path,
        usecols=list(rename_cols),
        sep='\t',
        chunksize=chunksize,
        dtype=str,
        keep_default_na=False,
    )


# Wipe existing data ready for import
db.write('MATCH (n) DETACH DELETE n')

# Set indexes
db.set_index('SUBREDDIT')
db.set_index('SUBREDDIT','id')
db.set_constraint('SUBREDDIT','id')

# First pass over the file: collect every unique subreddit name
# (appearing as either a link source or a link target).
nodes = set()
with read_link_chunks(data_path) as reader:
    for chunk in reader:
        nodes.update(chunk['SOURCE_SUBREDDIT'].tolist())
        nodes.update(chunk['TARGET_SUBREDDIT'].tolist())

# Write all unique subreddit nodes
node_list = [{'id': node} for node in nodes]
db.write_nodes(node_list,id_val='id',label='SUBREDDIT')

# Second pass: write subreddit to subreddit link relations, one chunk at a time.
# Edge frequencies are incremented for all duplicates found and saved as a 'weight' attribute
with read_link_chunks(data_path) as reader:
    for chunk in reader:
        edge_list = chunk.rename(columns=rename_cols).to_dict(orient='records')
        db.write_edges(edge_list,edge_label='LINKS_TO', source_label='SUBREDDIT',target_label='SUBREDDIT',
                      on_duplicate_edges='increment')


# Identify Core Nodes

In [11]:
import squashy

# Set up core-node identification over the SUBREDDIT nodes and LINKS_TO relations.
# NOTE(review): k and max_cores are passed straight to squashy.KCoreIdentifier;
# confirm their exact meaning against the squashy documentation.
core_identifier = squashy.KCoreIdentifier(
    database=db,
    node_label='SUBREDDIT',
    rel_label='LINKS_TO',
    k=2,
    max_cores=500,
)

# Run the identification.
core_identifier.identify_core_nodes()

cores_identified:0 | graph_size:35,776 | n_remaining:35,776 | min_degree:1 | max_degree:2,524 | pass_:0 | loca…

In [12]:
# List the metric field names tracked by the identifier. A bare last expression
# is displayed by the notebook itself, so no print() wrapper is needed.
# NOTE(review): `_major_fields` is an underscore-prefixed (private) attribute;
# prefer a public accessor if squashy provides one.
core_identifier.metrics._major_fields

['cores_identified', 'graph_size', 'n_remaining', 'min_degree', 'max_degree', 'pass_']


In [14]:
# Plot the number of cores identified against the pass number.
fig = core_identifier.metrics.visualize(
    x='pass_',
    y='cores_identified',
)
fig.show()


# Assign Representatives

In [16]:
# Show the GraphAgglomerator documentation. help() is plain Python, so this
# cell also works when the notebook is exported to a script (the `?` suffix is
# IPython-only syntax).
help(squashy.GraphAgglomerator)

In [21]:
import squashy

# `weight` names the edge attribute that was created when the data was loaded.
# The argument can be omitted to ignore weight when choosing representatives.
agglomerator = squashy.GraphAgglomerator(
    database=db,
    node_label='SUBREDDIT',
    rel_label='LINKS_TO',
    weight='weight',
    min_hops=1,
    max_hops=2,
)

# Print the patterns the agglomerator will traverse and create.
agglomerator.describe()

To traverse: (SUBREDDIT)-[LINKS_TO]-(SUBREDDIT)
To create: (CORE)-[REPRESENTS]->(SUBREDDIT)


In [22]:
# Run the agglomeration using the hop range the agglomerator was constructed with
# (min_hops=1, max_hops=2).
agglomerator.agglomerate()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [23]:
# Raise the maximum hop setting to 3 (the agglomerator was constructed with max_hops=2).
agglomerator.set_maximum_hop(3)

In [None]:
# Run the agglomeration again now that the maximum hop has been changed to 3.
# NOTE(review): presumably this pass covers nodes left unassigned by the first
# run — confirm against the squashy documentation.
agglomerator.agglomerate()

  0%|          | 0/1500 [00:00<?, ?it/s]