# Demo: preparation

In [1]:
import os.path
from os import path

SPARK_VERSION="2.4.4"
SPARK_PREFIX="/local/" + os.environ["USER"]
print("SPARK_PREFIX:", SPARK_PREFIX)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"{SPARK_PREFIX}/spark-{SPARK_VERSION}-bin-hadoop2.7"


if (path.isdir(os.environ["SPARK_HOME"])):
    print(os.environ["SPARK_HOME"] + " exists.")
else:
    CLOSER_LOCATION = f"https://www.apache.org/dyn/closer.lua/spark/spark-{SPARK_VERSION}/spark-{SPARK_VERSION}-bin-hadoop2.7.tgz"
    !wget {CLOSER_LOCATION} -O suggest.html
    !wget `grep "suggest the following mirror" -A2 suggest.html |grep -Po 'href="\K[^"]*'` -O {SPARK_PREFIX}/spark-{SPARK_VERSION}-bin-hadoop2.7.tgz
    !cd {SPARK_PREFIX} && tar xzf spark-{SPARK_VERSION}-bin-hadoop2.7.tgz && rm spark-{SPARK_VERSION}-bin-hadoop2.7.tgz
    !rm suggest.html
    !echo "spark installé dans {os.environ["SPARK_HOME"]}"
    !ls -ld {os.environ["SPARK_HOME"]}
    

SPARK_PREFIX: /local/naacke
/local/naacke/spark-2.4.4-bin-hadoop2.7 exists.


In [2]:
!pip3 install findspark
!pip3 install graphviz

# Requirement:
# ==============

# WIDGETS and interact 
#----------------------
# ref https://ipywidgets.readthedocs.io/en/latest/user_install.html
# !pip3 install ipywidgets
!pip3 install ipywidgets
# pour le notebook
#!jupyter nbextension enable --py widgetsnbextension

# vérifier extension pour le lab
# !jupyter labextension check @jupyter-widgets/jupyterlab-manager
!jupyter labextension check @jupyter-widgets/jupyterlab-manager

# if not installed, theN
#!jupyter labextension install @jupyter-widgets/jupyterlab-manager

# SIDECAR
# -----------
!jupyter labextension check @jupyter-widgets/jupyterlab-sidecar

# if not installed, then
# pip install sidecar
# jupyter labextension install @jupyter-widgets/jupyterlab-sidecar

@jupyter-widgets/jupyterlab-manager:[32m enabled [0m
@jupyter-widgets/jupyterlab-sidecar:[32m enabled [0m


#### Import

In [3]:
# findspark must be initialised before any pyspark import below.
# NOTE(review): presumably it locates the install through SPARK_HOME, which the
# first cell sets — confirm against the findspark documentation.
import findspark
findspark.init()

# NOTE(review): the wildcard imports below bring many names into the notebook
# namespace; pyspark.sql.functions in particular may shadow Python builtins
# with the same name — prefer the `fn.` alias when in doubt.
from pyspark.sql import *
from pyspark import SparkConf
from pyspark.sql.types import *

# column functions, imported unqualified
from pyspark.sql.functions import *
# same module under the alias `fn` (e.g. for the desc() function)
import pyspark.sql.functions as fn 

# wall-clock timer
from timeit import default_timer
# usage: start = default_timer()

#### Imports needed for interaction

In [4]:
# interaction
# from __future__ import print_function
# from ipywidgets import interact, interactive, fixed, interact_manual
# import ipywidgets as widgets

#### Imports for visualisation

In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# plt.style.use('ggplot')
import seaborn as sns

from graphviz import Digraph


#### Start spark engine

In [6]:
# Uncomment to shut down a session left over from a previous run:
#spark.stop()

# Session settings: master URL, application name, and the memory value used
# for both the executor and the driver.
local = "local[*]"
appName = "Essai graph spark"
memory="8G"

# Build the configuration as a single fluent chain.
configLocale = (
    SparkConf()
    .setAppName(appName)
    .setMaster(local)
    .set("spark.executor.memory", memory)
    .set("spark.driver.memory", memory)
    .set("spark.sql.catalogImplementation", "in-memory")
)

# Reuse the active session if there is one, otherwise create it.
spark = SparkSession.builder.config(conf=configLocale).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

print("Spark application id is:", sc.applicationId)

Spark application id is: local-1580306473162


## Datasets

In [7]:
# Previous dataset location, kept for reference:
# DATA_DIR = os.environ["HOME"]+ "/res/EPIQUE/Li_KE/DATA/like_2019-10-04/arXiv.2018-01-19_voc_199820173150/"

# Root directory of the datasets; the trailing slash is required because
# dataset names are appended to it directly.
DATA_DIR = f"{SPARK_PREFIX}/localfiles/pldac_2020-01-22/arXiv/"

print(f"DATA-DIR:{DATA_DIR}")

DATA-DIR:/local/naacke/localfiles/pldac_2020-01-22/arXiv/


#### Similarity links

In [8]:
# Read the similarity links (JSON) and preview one row with untruncated columns.
similarityLinks = spark.read.json(f"{DATA_DIR}similarityLinks")
similarityLinks.show(n=1, truncate=False)

+--------+--------+--------------------+
|idTopic1|idTopic2|similarity          |
+--------+--------+--------------------+
|304     |367     |1.324177572352318E-4|
+--------+--------+--------------------+
only showing top 1 row



#### Topic nodes

In [9]:
# Read the topic nodes (JSON) and preview the first rows by ascending idTopic.
topicNodes = spark.read.json(f"{DATA_DIR}topicNodes")
# unsorted preview: topicNodes.show(1, False)
topicNodes.orderBy("idTopic").show(n=3, truncate=False)

+-------+------------+---------+------+--------------------+
|idTopic|localTopicId|period   |term  |weight              |
+-------+------------+---------+------+--------------------+
|0      |0           |1998-2000|852487|0.036566569921422064|
|0      |0           |1998-2000|919446|0.020184340718255162|
|0      |0           |1998-2000|880953|0.028266144039501652|
+-------+------------+---------+------+--------------------+
only showing top 3 rows



#### topicDictionary_unique_10

In [10]:
# Read the topic dictionary (JSON) and preview two rows with untruncated columns.
topicDictionary_unique_10 = spark.read.json(f"{DATA_DIR}topicDictionary_unique_10")
topicDictionary_unique_10.show(n=2, truncate=False)
# highest topic id: topicDictionary_unique_10.groupBy().max("idTopic").show()

+-------+----------------------------------------------------------------------------------------------------------+
|idTopic|topic                                                                                                     |
+-------+----------------------------------------------------------------------------------------------------------+
|41     |[analysi, system, paramet, vnc, factor, cach, control, form, techniqu, part]                              |
|401    |[observ, sourc, scheme, decod, encod, feedback, presenc, wireless sensor network, constrain, numer exampl]|
+-------+----------------------------------------------------------------------------------------------------------+
only showing top 2 rows



#### evolutionPath

In [11]:
# Parameters that select which evolutionPath sub-directory is loaded.
beta=0.7
nbTopicPerPeriod = 50

# Sub-directory name built from the two parameters plus a fixed suffix.
pivotDir = f"{beta}_{nbTopicPerPeriod}_0.0_10/"

# Read the three datasets of that directory, in the order future, past, labels.
future, past, labels = (
    spark.read.json(DATA_DIR + 'evolutionPath/' + pivotDir + part)
    for part in ("future", "past", "labels")
)

# Optional previews:
# future.show(1, False)
# past.show(1, False)
# labels.show(1, False)

In [12]:
# Optional schema check:
#labels.printSchema()

# Register the labels DataFrame as the temporary view "labels".
labels.createOrReplaceTempView(name="labels")
print("done")

done
