In [1]:
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import count
from pyspark.sql.functions import max, min, col

import matplotlib.pyplot as plt
import matplotlib.cbook as cbook

import numpy as np
import pandas as pd

from graphframes import *
from pyspark.sql.types import *
from graphframes import *

In [2]:
# Load external packages programatically
# Here, we assume that you use Spark 3.0.1 or later (compiled against Scala 2.12)
import os
packages = "graphframes:graphframes:0.8.1-spark3.0-s_2.12"
# jars = "graphframes-0.3.0-spark2.0-s_2.11.jar"

os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--repositories https://repos.spark-packages.org/ --packages {0} pyspark-shell".format(packages)
)
#os.environ["PYSPARK_SUBMIT_ARGS"] = (
#    "--packages {0} --jars {1} pyspark-shell".format(packages, jars)
#)

In [3]:
spark = (SparkSession     
         .builder     
         .appName("Stackoverflow")     
         .getOrCreate())

In [4]:
stack_nodes_df = (spark.read.format("csv")      
          .option("header", "true")      
          .option("inferSchema", "true")      
          .load("data/stack_network_nodes.csv"))

In [5]:
stack_links_df = (spark.read.format("csv")      
          .option("header", "true")      
          .option("inferSchema", "true")      
          .load("data/stack_network_links.csv"))

In [6]:
count_stack_df_1 = (stack_nodes_df     
                .select(col("id").alias("PopularTopic"), "nodesize")      
                .groupBy("PopularTopic")      
                .agg(max("nodesize").alias("nodesize"))      
                .orderBy("nodesize", ascending=False))

In [7]:
count_stack_df_1.show(n=60, truncate=False)   
print("Total Rows = %d" % (count_stack_df_1.count()))

+-------------------+--------+
|PopularTopic       |nodesize|
+-------------------+--------+
|javascript         |649.16  |
|java               |610.65  |
|python             |438.67  |
|php                |361.22  |
|css                |341.17  |
|c#                 |321.13  |
|html               |272.45  |
|c++                |268.11  |
|android            |229.86  |
|jquery             |208.29  |
|c                  |189.83  |
|mysql              |165.43  |
|sql                |154.23  |
|html5              |140.18  |
|asp.net            |129.55  |
|angularjs          |126.59  |
|node.js            |117.36  |
|linux              |108.54  |
|ios                |87.46   |
|.net               |75.08   |
|ruby               |70.14   |
|sql-server         |64.62   |
|swift              |63.62   |
|reactjs            |59.03   |
|ruby-on-rails      |55.31   |
|git                |54.48   |
|spring             |52.84   |
|r                  |52.7    |
|mongodb            |50.95   |
|objecti

In [8]:
count_stack_df_2 = (stack_nodes_df     
                .select(col("id").alias("LessPopularTopic"), "nodesize")      
                .groupBy("LessPopularTopic")      
                .agg(min("nodesize").alias("nodesize"))      
                .orderBy("nodesize", ascending=True))
print("Total Rows = %d" % (count_stack_df_2.count()))

Total Rows = 115


In [9]:
count_stack_df_2.show(n=60, truncate=False)   
print("Total Rows = %d" % (count_stack_df_2.count()))

+-------------------+--------+
|LessPopularTopic   |nodesize|
+-------------------+--------+
|drupal             |8.25    |
|linq               |8.32    |
|vue.js             |8.38    |
|ionic-framework    |8.44    |
|redux              |8.52    |
|api                |8.61    |
|testing            |8.95    |
|flask              |9.39    |
|tdd                |9.45    |
|regex              |9.46    |
|nginx              |9.49    |
|less               |9.73    |
|devops             |9.81    |
|powershell         |9.85    |
|jenkins            |10.02   |
|twitter-bootstrap-3|10.13   |
|maven              |10.3    |
|plsql              |10.32   |
|qt                 |10.53   |
|cloud              |10.66   |
|elasticsearch      |10.82   |
|mvc                |10.92   |
|apache-spark       |11.04   |
|xamarin            |11.18   |
|haskell            |11.18   |
|asp.net-web-api    |11.28   |
|xcode              |11.37   |
|excel-vba          |11.38   |
|eclipse            |11.39   |
|shell  

In [10]:
df_nodes = pd.read_csv('data/stack_network_nodes.csv', header=None)

In [11]:
df_nodes.head()

Unnamed: 0,0,1,2
0,id,group,nodesize
1,html,6,272.45
2,css,6,341.17
3,hibernate,8,29.83
4,spring,8,52.84


In [12]:
df_links = pd.read_csv('data/stack_network_links.csv', header=None)

In [13]:
df_links.head()

Unnamed: 0,0,1,2,3
0,id,src,dst,value
1,1,azure,.net,20.933192346640457
2,2,sql-server,.net,32.322524219339904
3,3,asp.net,.net,48.40702996199019
4,4,entity-framework,.net,24.37090250532431


In [14]:
def create_stackoverflow_graph():
    nodes = spark.read.csv("data/stack_network_nodes.csv", header=True)
    relationships = spark.read.csv("data/stack_network_links.csv", header=True)
    return GraphFrame(nodes, relationships)

In [15]:
g = create_stackoverflow_graph()

In [16]:
print(g)

GraphFrame(v:[id: string, group: string ... 1 more field], e:[src: string, dst: string ... 2 more fields])


In [17]:
spark.stop()