# Single node analysis

In [1]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql import SQLContext

from pyspark.sql import functions as f
from pyspark.sql import types as T
from pyspark.sql import Window

import os

In [2]:
spark = SparkSession.builder.appName("Python Spark SQL").config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12").config("spark.executor.memory", "8g").getOrCreate()
sqlContext = SQLContext(spark.sparkContext)



In [3]:
from graphframes import *

## Prepare data

In [4]:
vertices = spark.read.parquet('/user/data/graph_with_more_info/vertices/*.parquet')
edges = spark.read.parquet('/user/data/graph_with_more_info/edges/*.parquet')

In [5]:
vertices.show(5)

+--------------------+
|                  id|
+--------------------+
|de Azevedo Anibal...|
|de Azevedo E. R. ...|
|de Azevedo Fabio ...|
|de Azevedo J. L. ...|
|de Azevedo L. J. ...|
+--------------------+
only showing top 5 rows



In [6]:
edges.show(5)

+--------------+---------------+--------------+------------------+------------------+
|           src|            dst|articles_count|oldest_update_date|newest_update_date|
+--------------+---------------+--------------+------------------+------------------+
|"Osterberg K. |     Garcia F. |             1|        2021-08-11|        2021-08-11|
|"Osterberg K. |Gr"unendahl S. |             1|        2021-08-11|        2021-08-11|
|       "oser  |       Bose D. |             1|        2016-08-14|        2016-08-14|
|       "oser  |    Geisler M. |             1|        2016-08-14|        2016-08-14|
|       "oser  |  Johansson H. |             1|        2016-08-14|        2016-08-14|
+--------------+---------------+--------------+------------------+------------------+
only showing top 5 rows



### Prepare only a sample of 10k vertices

In [7]:
vertices = vertices.sample(fraction=0.006)

In [8]:
all_vertices = list(vertices.select('id').rdd.map(lambda x: (1, x)).reduceByKey(lambda x1, x2: x1 + x2).collect()[0][1])

In [9]:
len(all_vertices)

9609

In [10]:
edges = edges.filter((f.col("src").isin(all_vertices)) & (f.col("dst").isin(all_vertices)))

In [11]:
vertices.show(5)

+--------------------+
|                  id|
+--------------------+
|de Azevedo E. R. ...|
|       de Bruijn O. |
|      de Camargo A. |
|de Castro Davi Fe...|
|de Cataldo Mark A...|
+--------------------+
only showing top 5 rows



In [12]:
edges.show(5)

+-----------------+------------------+--------------+------------------+------------------+
|              src|               dst|articles_count|oldest_update_date|newest_update_date|
+-----------------+------------------+--------------+------------------+------------------+
|        Abreu P. |Bertolli C. Pérez |             7|        2021-04-16|        2021-12-14|
|Alekseyev Max A. |   Alexeev Nikita |             5|        2015-04-07|        2019-11-27|
|   Barclay S. E. |      e A. Vicer' |             1|        2019-11-19|        2019-11-19|
|        Cella G. |            Li B. |            26|        2021-03-22|        2022-01-04|
|   Cornish N. J. |            Li B. |             5|        2021-11-22|        2022-01-04|
+-----------------+------------------+--------------+------------------+------------------+
only showing top 5 rows



In [13]:
vertex_example = all_vertices[0]

In [14]:
g = GraphFrame(vertices, edges)

## Degree centrality (DC)


In [15]:
g.degrees.where(f.col("id") == vertex_example).show()

+---+------+
| id|degree|
+---+------+
+---+------+



In [16]:
degree_centrality = g.degrees
degree_centrality = degree_centrality.withColumnRenamed('id', 'author')

In [17]:
degree_centrality.orderBy("degree", ascending=False).show(10)

+--------------------+------+
|              author|degree|
+--------------------+------+
|De La Cruz-Burelo...|    84|
|            Wang F. |    80|
|           Huang J. |    78|
|             Cho K. |    72|
|           Cella G. |    66|
|           White S. |    60|
|        Mladenov D. |    60|
|      Karyotakis Y. |    60|
|   van Veggel A. A. |    56|
|         Liolios A. |    54|
+--------------------+------+
only showing top 10 rows



In [18]:
degree_centrality.write.mode("overwrite").parquet('../data/single-node-analysis/degree-centrality')

## Betweenness centrality (BC)

In [19]:
all_vertices[:5]

['Hurand S. ',
 'Hurley H. ',
 'Hurley K.  SSL, Univ. Berkeley',
 'Hurtado Mauricio Reyes ',
 'Husain Mushahid ']

In [20]:
shortest_paths = g.shortestPaths(landmarks=all_vertices)
shortest_paths.show()

+--------------------+--------------------+
|                  id|           distances|
+--------------------+--------------------+
|Papadopoulos Atha...|{Papadopoulos Ath...|
|Peñate-Rodriguez ...|{Peñate-Rodriguez...|
|  Middleton Richard |{Middleton Richar...|
|      Ali S. Tabrez |{Ali S. Tabrez  -...|
|Nonomura Yoshihik...|{Nonomura Yoshihi...|
|Georgoulis Stamat...|{Georgoulis Stama...|
|Ruitenberg Justus...|{Ruitenberg Justu...|
|        Zhong J. F. | {Zhong J. F.  -> 0}|
|       Rajaei Karim |{Rajaei Karim  -> 0}|
|Stobbart A-M.  Un...|{Stobbart A-M.  U...|
|Proshchenko Vital...|{Proshchenko Vita...|
|Sliwa K.  The\n  ...|{Happacher F.  Th...|
|        Pankov Mark | {Pankov Mark  -> 0}|
|       Flohr Fabian |{Flohr Fabian  -> 0}|
|       Hahn Joachim |{Hahn Joachim  -> 0}|
|      Papini Jon J. |{Papini Jon J.  -...|
|   Mander Adrian P. |{Mander Adrian P....|
|      Snel Ralph C. |{Snel Ralph C.  -...|
|Park Sungwoo  LAN...|{Park Sungwoo  LA...|
|Reznicek P.  IPNP...|{Reznicek 

In [21]:
shortest_paths.write.mode('overwrite').parquet('../data/single-node-analysis/shortest-paths')

In [22]:
shortest_paths = shortest_paths.withColumn('vertices_in_between', f.map_keys(f.col('distances')))

In [23]:
shortest_paths.printSchema()

root
 |-- id: string (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)
 |-- vertices_in_between: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [24]:
shortest_paths.filter(f.col('id') == vertex_example).show(truncate=False)

+----------+-----------------+-------------------+
|id        |distances        |vertices_in_between|
+----------+-----------------+-------------------+
|Hurand S. |{Hurand S.  -> 0}|[Hurand S. ]       |
+----------+-----------------+-------------------+



In [25]:
shortest_paths.withColumn('vertices_in_between_exploded', f.explode(f.col('vertices_in_between'))).filter(f.col('vertices_in_between_exploded') == vertex_example).show()

+----------+-----------------+-------------------+----------------------------+
|        id|        distances|vertices_in_between|vertices_in_between_exploded|
+----------+-----------------+-------------------+----------------------------+
|Hurand S. |{Hurand S.  -> 0}|       [Hurand S. ]|                  Hurand S. |
+----------+-----------------+-------------------+----------------------------+



In [26]:
betweenness_centrality = shortest_paths.withColumn('vertices_in_between_exploded', f.explode(f.col('vertices_in_between'))).groupBy('vertices_in_between_exploded').agg(
    f.count(f.col('id'))).withColumnRenamed(
        'vertices_in_between_exploded', 'author').withColumnRenamed('count(id)', 'degree')

In [27]:
betweenness_centrality.filter(f.col('author') == vertex_example).show()

+----------+------+
|    author|degree|
+----------+------+
|Hurand S. |     1|
+----------+------+



In [28]:
# Verify
shortest_paths.filter(f.array_contains(f.col("vertices_in_between"), vertex_example)).count()

1

In [29]:
betweenness_centrality.orderBy('degree', ascending=False).show(10)

+---------------+------+
|         author|degree|
+---------------+------+
| Galimberti M. |   311|
|      Usuki Y. |   311|
|       Chan K. |   311|
|      Stößl A. |   311|
|Chakraborty A. |   311|
|   Weidlich U. |   311|
|    Wronska A. |   311|
| Hidalgo S. L. |   311|
|   Evans D. W. |   311|
|   Montuori C. |   311|
+---------------+------+
only showing top 10 rows



In [30]:
betweenness_centrality.write.mode("overwrite").parquet('../data/single-node-analysis/betweeness-centrality')

## Timespan

In [31]:
edges.limit(2).toPandas()

Unnamed: 0,src,dst,articles_count,oldest_update_date,newest_update_date
0,Abreu P.,Bertolli C. Pérez,7,2021-04-16,2021-12-14
1,Alekseyev Max A.,Alexeev Nikita,5,2015-04-07,2019-11-27


In [32]:
edges.printSchema()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- articles_count: long (nullable = true)
 |-- oldest_update_date: string (nullable = true)
 |-- newest_update_date: string (nullable = true)



In [33]:
# edges_statistics = edges.withColumn('articles_update_date_exploded', f.explode(f.col('articles_update_date')))
# edges_statistics.limit(5).toPandas()

In [34]:
edges_statistics = edges
edges_statistics = edges_statistics.withColumn('oldest_update_date', f.to_timestamp(f.col('oldest_update_date'), format='yyyy-MM-dd'))
edges_statistics = edges_statistics.withColumn('newest_update_date', f.to_timestamp(f.col('newest_update_date'), format='yyyy-MM-dd'))
edges_statistics.limit(5).toPandas()

Unnamed: 0,src,dst,articles_count,oldest_update_date,newest_update_date
0,Abreu P.,Bertolli C. Pérez,7,2021-04-16,2021-12-14
1,Alekseyev Max A.,Alexeev Nikita,5,2015-04-07,2019-11-27
2,Barclay S. E.,e A. Vicer',1,2019-11-19,2019-11-19
3,Cella G.,Li B.,26,2021-03-22,2022-01-04
4,Cornish N. J.,Li B.,5,2021-11-22,2022-01-04


In [35]:
edges_statistics.printSchema()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- articles_count: long (nullable = true)
 |-- oldest_update_date: timestamp (nullable = true)
 |-- newest_update_date: timestamp (nullable = true)



In [36]:
window = Window().orderBy('oldest_update_date', 'newest_update_date').partitionBy('src')

In [37]:
timespan_data = edges_statistics.withColumn('timespan', f.year(f.last('newest_update_date').over(window)) - f.year(f.first('oldest_update_date').over(window)))
timespan_data = timespan_data.select('src', 'timespan').withColumnRenamed('src', 'author').dropDuplicates()
timespan_data.sort('timespan', ascending=False).limit(5).toPandas()

Unnamed: 0,author,timespan
0,Apyan A.,15
1,Marcello S.,15
2,Kulessa P.,14
3,Grenier G.,14
4,Marcello S.,14


In [38]:
timespan_data.write.mode("overwrite").parquet('../data/single-node-analysis/timespan')

## The clustering coefficient

### Global clustering

In [39]:
triangle_count = g.triangleCount()
triangle_count = triangle_count.withColumnRenamed('count', 'closed-triplets')
triangle_count.show(10, truncate=False)

+---------------+------------------------------------+
|closed-triplets|id                                  |
+---------------+------------------------------------+
|0              |Huwe Bernd                          |
|0              |Huynh Vu Anh                        |
|0              |Husárik M.                          |
|0              |Hussain R.  IceCube\n  collaboration|
|0              |Hurley H.                           |
|0              |Hurtado Mauricio Reyes              |
|0              |Hutton Charles                      |
|3              |Huterer D.  the DES Collaboration   |
|0              |Hurley K.  SSL, Univ. Berkeley      |
|0              |Husain Mushahid                     |
+---------------+------------------------------------+
only showing top 10 rows



In [40]:
triangle_count.withColumnRenamed('id', 'author').write.mode('overwrite').parquet('../data/single-node-analysis/triangle-count')

In [41]:
shortest_paths = spark.read.parquet('../data/single-node-analysis/shortest-paths/*.parquet')

In [42]:
shortest_paths = shortest_paths.withColumn('distances', f.map_values(f.col('distances')))
shortest_paths.show(5)
shortest_paths = shortest_paths.withColumn('distances', f.explode(f.col('distances')))

+--------------------+--------------------+
|                  id|           distances|
+--------------------+--------------------+
|   Bauer Ernst  NEEL|                 [0]|
|Benoit Michel  EC...|                 [0]|
|Bertram Wolfgang ...|                 [0]|
|Bhattacharya Anta...|                 [0]|
|Boom B. A.  ANTAR...|[1, 1, 1, 1, 1, 1...|
+--------------------+--------------------+
only showing top 5 rows



In [43]:
shortest_paths = shortest_paths.filter(f.col('distances') == 2)
shortest_paths.show(5)

+--------------+---------+
|            id|distances|
+--------------+---------+
|Chatterjee C. |        2|
|Chatterjee C. |        2|
|Chatterjee C. |        2|
|Chatterjee C. |        2|
|Chatterjee C. |        2|
+--------------+---------+
only showing top 5 rows



In [44]:
shortest_paths = shortest_paths.groupBy('id').count().withColumnRenamed('count', 'open-closed-triplets')
shortest_paths.show(5)

+-------------------+--------------------+
|                 id|open-closed-triplets|
+-------------------+--------------------+
|Kaafar Mohamed Ali |                   1|
|      Minervini G. |                  19|
|         Ptak Andy |                   8|
|     Sattari Zahra |                   2|
|   Shinnaga Hiroko |                   5|
+-------------------+--------------------+
only showing top 5 rows



In [45]:
nb_closed_triplets = triangle_count.select('closed-triplets').rdd.map(lambda x: (1, x[0])).reduceByKey(lambda y1, y2: y1 + y2).collect()[0][1]

In [46]:
nb_open_triplets = shortest_paths.select('open-closed-triplets').rdd.map(lambda x: (1, x[0])).reduceByKey(lambda y1, y2: y1 + y2).collect()[0][1] + nb_closed_triplets

In [47]:
clustering_coefficient = nb_closed_triplets / nb_open_triplets
clustering_coefficient

0.5187677340899879

### Local clustering

In [48]:
degree_centrality = spark.read.parquet('../data/single-node-analysis/degree-centrality/*.parquet')
degree_centrality.orderBy('degree', ascending=True).show(5)

+--------------------+------+
|              author|degree|
+--------------------+------+
|     Lecroq Thierry |     2|
|        Schreyer K. |     2|
|Colla A.  NA60 Co...|     2|
| Nickerson James E. |     2|
|Milovanov A. V.  ...|     2|
+--------------------+------+
only showing top 5 rows



In [49]:
triangle_count = spark.read.parquet('../data/single-node-analysis/triangle-count/*.parquet')
triangle_count = triangle_count.withColumn('closed-triplets', 2*f.col('closed-triplets')/3)
triangle_count.orderBy('closed-triplets', ascending=False).show(5)

+------------------+--------------------+
|   closed-triplets|              author|
+------------------+--------------------+
|193.33333333333334|De La Cruz-Burelo...|
|             144.0|        Mladenov D. |
|             140.0|           Huang J. |
|136.66666666666666|             Abi B. |
|112.66666666666667|        Igonkina O. |
+------------------+--------------------+
only showing top 5 rows



In [50]:
local_clustering_coefficient = degree_centrality.join(triangle_count, on='author')
local_clustering_coefficient = local_clustering_coefficient.withColumn('local_clustering_coefficient', f.round(f.col('closed-triplets') / (f.col('degree') * (f.col('degree') - 1)), scale = 2))
local_clustering_coefficient.orderBy('local_clustering_coefficient', ascending=False).show(5)

+--------------------+------+------------------+----------------------------+
|              author|degree|   closed-triplets|local_clustering_coefficient|
+--------------------+------+------------------+----------------------------+
|      Cornish N. J. |    20|              30.0|                        0.08|
|Jaranowski P.  De...|    18|              24.0|                        0.08|
|          Rai A. K. |    16|18.666666666666668|                        0.08|
|Ramos-Pollan R.  ...|    28|60.666666666666664|                        0.08|
|          Colton N. |    20|              30.0|                        0.08|
+--------------------+------+------------------+----------------------------+
only showing top 5 rows



In [51]:
local_clustering_coefficient.select('author', 'local_clustering_coefficient').write.mode('overwrite').parquet('../data/single-node-analysis/local_clustering_coefficient')

In [52]:
local_clustering_coefficient.select('author', 'local_clustering_coefficient').orderBy('local_clustering_coefficient', ascending=False).show(5)

+--------------------+----------------------------+
|              author|local_clustering_coefficient|
+--------------------+----------------------------+
|      Cornish N. J. |                        0.08|
|Jaranowski P.  De...|                        0.08|
|          Rai A. K. |                        0.08|
|Ramos-Pollan R.  ...|                        0.08|
|          Colton N. |                        0.08|
+--------------------+----------------------------+
only showing top 5 rows

