# Demo: preparation

## Import

#### Imports for Spark

In [1]:
import findspark
findspark.init()
from pyspark.sql import *
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time

#### Imports for visualization

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# plt.style.use('ggplot')
import seaborn as sns

from graphviz import *

from IPython.display import display, HTML

#### Imports needed for interaction (currently disabled)

In [4]:
# interaction
# from __future__ import print_function
# from ipywidgets import interact, interactive, fixed, interact_manual
# import ipywidgets as widgets

#### Start the Spark engine

In [3]:
# Uncomment to shut down a session left over from a previous run.
#spark.stop()

# Session settings: local master "local[*]", application name and memory budget.
local = "local[*]"
appName = "Essai graph spark"
memory = "8G"

# Executor and driver get the same memory setting; the SQL catalog is in-memory.
configLocale = (
    SparkConf()
    .setAppName(appName)
    .setMaster(local)
    .setAll([
        ("spark.executor.memory", memory),
        ("spark.driver.memory", memory),
        ("spark.sql.catalogImplementation", "in-memory"),
    ])
)
spark = SparkSession.builder.config(conf=configLocale).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")  # restrict Spark logging to errors

print("Spark application id is:", sc.applicationId)

Spark application id is: local-1580404823067


## Datasets

In [4]:
# Root folder holding every dataset; DATA_DIR selects the active one and
# always ends with "/" so file names can be appended directly.
DATA_ROOT = "data"
DATA_DIR = DATA_ROOT + "/wiley/"
# Alternative datasets (enable exactly one DATA_DIR assignment):
#DATA_DIR = DATA_ROOT + "/glyphosate/"
#DATA_DIR = DATA_ROOT + "/istex/"

print('DATA-DIR:', DATA_DIR, sep='')

DATA-DIR:data/wiley/


#### Similarity links

In [5]:
# Load the similarity links (JSON) and preview one untruncated row.
similarityLinks = spark.read.json(path=DATA_DIR + 'similarityLinks')
similarityLinks.show(n=1, truncate=False)

+--------+--------+------------------+
|idTopic1|idTopic2|similarity        |
+--------+--------+------------------+
|110     |129     |0.2637034579858007|
+--------+--------+------------------+
only showing top 1 row



#### Topic nodes

In [6]:
# Load the topic nodes (JSON) and preview the first three rows ordered by idTopic.
topicNodes = spark.read.json(path=DATA_DIR + 'topicNodes')
topicNodes.sort("idTopic").show(n=3, truncate=False)

+-------+------------+---------+------+-------------------+
|idTopic|localTopicId|period   |term  |weight             |
+-------+------------+---------+------+-------------------+
|0      |0           |1996-1998|229917|0.0315334897631547 |
|0      |0           |1996-1998|284153|0.05503559027472268|
|0      |0           |1996-1998|248801|0.02087863877952271|
+-------+------------+---------+------+-------------------+
only showing top 3 rows



In [7]:
# Preview up to five rows of topic 110, untruncated.
topicNodes.filter("idTopic = 110").show(n=5, truncate=False)

+-------+------------+---------+------+--------------------+
|idTopic|localTopicId|period   |term  |weight              |
+-------+------------+---------+------+--------------------+
|110    |10          |2006-2008|150733|0.021343152963741938|
|110    |10          |2006-2008|82283 |0.02032438978321691 |
|110    |10          |2006-2008|98297 |0.02008320493272858 |
|110    |10          |2006-2008|248801|0.018483870163718896|
|110    |10          |2006-2008|234634|0.018380649583723516|
+-------+------------+---------+------+--------------------+
only showing top 5 rows



#### Term dictionary (terms to display in the global graph)

In [8]:
# Load the term dictionary (JSON) and preview five untruncated rows.
vocabularies = spark.read.json(path=DATA_DIR + 'wiley_CS_voc_199620153220')
vocabularies.show(n=5, truncate=False)

+-----+---------------------+
|id   |term                 |
+-----+---------------------+
|156  |101002 cae 101002 cae|
|5745 |ABC concept          |
|6806 |Abstract study       |
|7809 |Anycast services     |
|10821|CR networks          |
+-----+---------------------+
only showing top 5 rows



In [9]:
# Look up the dictionary entry with id 150733.
vocabularies.filter("id = 150733").show()

+------+------+
|    id|  term|
+------+------+
|150733|impact|
+------+------+



#### topicDictionary_unique_10

In [10]:
# Load the per-topic term lists (JSON) and preview two untruncated rows.
topicDictionary_unique_10 = spark.read.json(path=DATA_DIR + 'topicDictionary_unique_10')
topicDictionary_unique_10.show(n=2, truncate=False)
# Optional check of the highest topic id:
# topicDictionary_unique_10.groupBy().max("idTopic").show()

+-------+---------------------------------------------------------------------------------------------------+
|idTopic|topic                                                                                              |
+-------+---------------------------------------------------------------------------------------------------+
|34     |[solve, solution, problems, compute, case, approximation, analysis, literature, step, terms]       |
|52     |[case, interaction, capable, calculate, terms, validate, experiments, predict, methodology, motion]|
+-------+---------------------------------------------------------------------------------------------------+
only showing top 2 rows



In [11]:
# Show the term list of topic 110, untruncated.
topicDictionary_unique_10.filter("idTopic = 110").show(n=1, truncate=False)

+-------+--------------------------------------------------------------------------------------+
|idTopic|topic                                                                                 |
+-------+--------------------------------------------------------------------------------------+
|110    |[impact, complexity, deployment, run, quality, tool, attacks, due, internet, response]|
+-------+--------------------------------------------------------------------------------------+



#### evolutionPath

In [12]:
# Parameters selecting which evolution-path run to load.
beta = 0.5
nbTopicPerPeriod = 20

# Sub-directory of the run; with the values above this is "0.5_20_0.0_10/".
# NOTE(review): the trailing "_0.0_10" part is hard-coded and its meaning is
# not visible in this notebook section; confirm before changing it.
pivotDir = str(beta) + "_" + str(nbTopicPerPeriod) + "_0.0_10/"

# DATA_DIR already ends with "/", so no leading slash is added here; the
# previous DATA_DIR + '/evolutionPath/' built "data/wiley//evolutionPath/...".
evolutionPathDir = DATA_DIR + 'evolutionPath/' + pivotDir

# The three JSON datasets of the selected run.
future = spark.read.json(evolutionPathDir + "future")
past = spark.read.json(evolutionPathDir + "past")
labels = spark.read.json(evolutionPathDir + "labels")

# Uncomment for a quick look at each dataset:
# future.show(1, False)
# past.show(1, False)
# labels.show(1, False)

In [13]:
# Print the schema Spark inferred for the "future" JSON dataset.
future.printSchema()

root
 |-- Alpha: double (nullable = true)
 |-- Beta: double (nullable = true)
 |-- Pi: long (nullable = true)
 |-- Pj: long (nullable = true)
 |-- Pk: long (nullable = true)
 |-- Ti: long (nullable = true)
 |-- Tj: long (nullable = true)
 |-- Tk: long (nullable = true)
 |-- TopicI: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- TopicJ: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- TopicK: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- localRank: long (nullable = true)



In [14]:
# Uncomment to inspect the inferred schema of the labels DataFrame.
#labels.printSchema()
# Register the labels DataFrame as the temp view "labels" (replacing any
# existing view of that name) so it can be referenced from Spark SQL.
labels.createOrReplaceTempView("labels")
print("done")

done


#### Metrics

In [15]:
# Load the two metrics datasets (JSON): one specific to "future", one general.
stats_future = spark.read.json(path=DATA_DIR + 'statistics_future')
stats = spark.read.json(path=DATA_DIR + 'statistics')

In [16]:
# Preview three untruncated rows of the future metrics.
stats_future.show(n=3, truncate=False)

+----+-----------------+-----+----------+--------------------+-----------------+-----------------------+-----------+-------+-----+----------------------------+-------------------------------------------------------------------------------------------+-----------+
|Beta|ConvergenceDegree|Depth|Liveliness|PivotEvolutionDegree|PublicationPeriod|RelativeEvolutionDegree|SplitDegree|TopicID|dying|emerging                    |genetic                                                                                    |special    |
+----+-----------------+-----+----------+--------------------+-----------------+-----------------------+-----------+-------+-----+----------------------------+-------------------------------------------------------------------------------------------+-----------+
|0.1 |7.25             |8    |1.0       |0.844               |1998-2000        |0.779                  |8.352      |20     |[]   |[goal]                      |[implementation, knowledge, structure, technology

In [17]:
# Preview one untruncated row of the metrics.
stats.show(n=1, truncate=False)

+----+-----------------------+-----------+----------------+--------------------------+-----------------------------+-----------------+---------------------+---------+--------------+------------------------+---------------------------+---------------+-----------------+-------+-----+--------+-------------------------------------------------------------------------------------------+-----------+
|Beta|FutureConvergenceDegree|FutureDepth|FutureLiveliness|FuturePivotEvolutionDegree|FutureRelativeEvolutionDegree|FutureSplitDegree|PastConvergenceDegree|PastDepth|PastLiveliness|PastPivotEvolutionDegree|PastRelativeEvolutionDegree|PastSplitDegree|PublicationPeriod|TopicID|dying|emerging|genetic                                                                                    |special    |
+----+-----------------------+-----------+----------------+--------------------------+-----------------------------+-----------------+---------------------+---------+--------------+------------------------+--