https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizer

https://spark.apache.org/docs/latest/ml-features#countvectorizer

https://spark.apache.org/docs/latest/ml-clustering.html#latent-dirichlet-allocation-lda

https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.LDA

https://www.zstat.pl/2018/02/07/scala-spark-get-topics-words-from-lda-model/

https://stackoverflow.com/questions/51456838/match-index-from-pyspark-dataframe-in-pandas/51457137#51457137

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/3741049972324885/3783546674231782/4413065072037724/latest.html

In [1]:
# START SPARKSESSION
import findspark
findspark.init()

In [2]:
import pyspark
#from pyspark.mllib.linalg import Vector, Vectors
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, CountVectorizer, StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import udf
#from pyspark.sql.functions import asc, count, col, collect_list

In [3]:
spark = SparkSession.builder.master("local[*]") \
                    .appName('Topic Modelling') \
                    .config("spark.driver.memory", "10g") \
                    .master("local[*]") \
                    .getOrCreate()

In [4]:
# PHASE 1: TOPIC MODELLING WITH SPARK

# Auxiliar functions
def equivalent_type(f):
    if f == 'datetime64[ns]': return TimestampType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    try: typo = equivalent_type(format_type)
    except: typo = StringType()
    return StructField(string, typo)

def pandas_to_spark(pandas_df):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types): 
      struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return spark.createDataFrame(pandas_df, p_schema)

#data_df = pandas_to_spark(data)

In [5]:
# LOAD PROCESSED DATA 
data_df_full = spark.read.option('header', True).csv('/home/giangvdq/data/NIPS Papers/papers_processed.csv')

In [6]:
data_df = data_df_full.limit(3000)

In [7]:
data_df.show()

+----+--------------------+
|  id|    lemmatize_joined|
+----+--------------------+
|   1|self organization...|
|  10|mean field theory...|
| 100|store covariance ...|
|1000|bayesian query co...|
|1001|neural network en...|
|1002|sing neural insta...|
|1003|plasticity mediat...|
|1004|iceg morphology c...|
|1005|real time control...|
|1006|real time control...|
|1007|learn play game c...|
|1008|multidimensional ...|
|1009|experimental comp...|
| 101|train multilayer ...|
|1010|interference lear...|
|1011|active learn stat...|
|1012|rapid graph base ...|
|1013|ocular dominance ...|
|1014|associative decor...|
|1015|connectionist tec...|
+----+--------------------+
only showing top 20 rows



In [8]:
# STEP: TOKENIZE

# source: https://gist.github.com/Bergvca/a59b127afe46c1c1c479

tokenizer = Tokenizer(inputCol="lemmatize_joined", outputCol="words")
wordsDataFrame = tokenizer.transform(data_df)

In [9]:
wordsDataFrame.show()

+----+--------------------+--------------------+
|  id|    lemmatize_joined|               words|
+----+--------------------+--------------------+
|   1|self organization...|[self, organizati...|
|  10|mean field theory...|[mean, field, the...|
| 100|store covariance ...|[store, covarianc...|
|1000|bayesian query co...|[bayesian, query,...|
|1001|neural network en...|[neural, network,...|
|1002|sing neural insta...|[sing, neural, in...|
|1003|plasticity mediat...|[plasticity, medi...|
|1004|iceg morphology c...|[iceg, morphology...|
|1005|real time control...|[real, time, cont...|
|1006|real time control...|[real, time, cont...|
|1007|learn play game c...|[learn, play, gam...|
|1008|multidimensional ...|[multidimensional...|
|1009|experimental comp...|[experimental, co...|
| 101|train multilayer ...|[train, multilaye...|
|1010|interference lear...|[interference, le...|
|1011|active learn stat...|[active, learn, s...|
|1012|rapid graph base ...|[rapid, graph, ba...|
|1013|ocular dominan

In [10]:
# STEP: REMOVE X MOST OCCURING WORDS
cv_tmp = CountVectorizer(inputCol="words", outputCol="tmp_vectors")
cv_tmp_model = cv_tmp.fit(wordsDataFrame)

topWords = list(cv_tmp_model.vocabulary[0:200])

remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = topWords)
wordsDataFrame = remover.transform(wordsDataFrame)

In [11]:
topWords

['use',
 'learn',
 'data',
 'algorithm',
 'network',
 'time',
 'train',
 'problem',
 'figure',
 'give',
 'method',
 'value',
 'distribution',
 'number',
 'model',
 'result',
 'state',
 'input',
 'show',
 'image',
 'feature',
 'parameter',
 'point',
 'system',
 'example',
 'base',
 'sample',
 'process',
 'error',
 'vector',
 'probability',
 'case',
 'neural',
 'information',
 'weight',
 'matrix',
 'approach',
 'follow',
 'space',
 'kernel',
 'first',
 'estimate',
 'different',
 'linear',
 'test',
 'variable',
 'mean',
 'class',
 'find',
 'neuron',
 'define',
 'compute',
 'order',
 'performance',
 'cluster',
 'output',
 'obtain',
 'label',
 'term',
 'consider',
 'gaussian',
 'step',
 'however',
 'experiment',
 'structure',
 'task',
 'make',
 'correspond',
 'unit',
 'large',
 'function',
 'section',
 'since',
 'pattern',
 'present',
 'assume',
 'component',
 'work',
 'classification',
 'signal',
 'noise',
 'analysis',
 'represent',
 'set',
 'form',
 'solution',
 'average',
 'random',
 'we

In [12]:
wordsDataFrame.show(10)

+----+--------------------+--------------------+--------------------+
|  id|    lemmatize_joined|               words|            filtered|
+----+--------------------+--------------------+--------------------+
|   1|self organization...|[self, organizati...|[self, organizati...|
|  10|mean field theory...|[mean, field, the...|[cortex, artifici...|
| 100|store covariance ...|[store, covarianc...|[store, covarianc...|
|1000|bayesian query co...|[bayesian, query,...|[query, construct...|
|1001|neural network en...|[neural, network,...|[ensemble, cross,...|
|1002|sing neural insta...|[sing, neural, in...|[sing, instantiat...|
|1003|plasticity mediat...|[plasticity, medi...|[plasticity, medi...|
|1004|iceg morphology c...|[iceg, morphology...|[iceg, morphology...|
|1005|real time control...|[real, time, cont...|[real, tokamak, p...|
|1006|real time control...|[real, time, cont...|[real, tokamak, p...|
+----+--------------------+--------------------+--------------------+
only showing top 10 

In [13]:
# STEP: COUNTVECTORIZER

cv = CountVectorizer(inputCol="filtered", outputCol="vectors")
cvmodel = cv.fit(wordsDataFrame)
df_vect = cvmodel.transform(wordsDataFrame)

In [14]:
#transform the dataframe to a format that can be used as input for LDA.train. 
#LDA train expects a RDD with lists,
#where the list consists of a uid and (sparse) Vector
def parseVectors(line):
    return [ int(line[2]), line[0] ]

sparsevector = (df_vect.select('vectors', 'lemmatize_joined', 'id')
                .rdd.map(parseVectors) )

In [15]:
sparsevector = sparsevector.toDF()

In [16]:
#Train the LDA model

lda = LDA(k=4, maxIter=70, featuresCol='_2', seed=1, optimizer='em')
model = lda.fit(sparsevector)

In [27]:
model.vocabSize()

75334

In [28]:
cvmodel.vocabulary

['cortex',
 'exact',
 'vision',
 'illustrate',
 'direct',
 'significant',
 'trajectory',
 'capture',
 'advantage',
 'category',
 'query',
 'specify',
 'separate',
 'randomly',
 'various',
 'prove',
 'cambridge',
 'transform',
 'reinforcement',
 'amount',
 'manifold',
 'dependent',
 'always',
 'online',
 'boost',
 'adaptive',
 'coordinate',
 'mechanism',
 'still',
 'fast',
 'world',
 'expression',
 'ratio',
 'generative',
 'implementation',
 'population',
 'main',
 'expectation',
 'address',
 'free',
 'can',
 'user',
 'chain',
 'department',
 'variational',
 'not',
 'see',
 'identify',
 'activation',
 'language',
 'consistent',
 'frame',
 'turn',
 'assign',
 'relationship',
 'environment',
 'validation',
 'simply',
 'conclusion',
 'run',
 'underlie',
 'complete',
 'addition',
 'whether',
 'imply',
 'uniform',
 'bay',
 'interval',
 'movement',
 'chip',
 'significantly',
 'theoretical',
 'difficult',
 'formulation',
 'among',
 'code',
 'record',
 'artificial',
 'refer',
 'link',
 'typical

In [29]:
ll = model.logLikelihood(sparsevector)
lp = model.logPerplexity(sparsevector)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -38942653.72330608
The upper bound on perplexity: 18.500134548336305


In [30]:
# Describe topics.

# Number of terms in topics
numTerms = 8

topics = model.describeTopics(numTerms)
print("The topics described by their top-weighted terms:")
topics.show(5, truncate=True)

The topics described by their top-weighted terms:
+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[0, 68, 69, 81, 3...|[0.00445160280738...|
|    1|[10, 18, 41, 107,...|[0.00446965231844...|
|    2|[49, 9, 93, 33, 8...|[0.00466845859372...|
|    3|[20, 73, 83, 127,...|[0.00364356705031...|
+-----+--------------------+--------------------+



In [36]:
# DISPLAY THE TOPIC DISTRIBUTION

def indices_to_terms(vocabulary):
    def indices_to_terms(xs):
        return [vocabulary[int(x)] for x in xs]
    return udf(indices_to_terms, ArrayType(StringType()))

topics_with_terms = topics.withColumn(
    "topics_words", indices_to_terms(cvmodel.vocabulary)("termIndices"))

#topics_with_terms.select(['topic','topics_words']).show(20,False)
topics_with_terms.select(['topic','topics_words']).show(20,False)

topics_with_terms.show()

+-----+-----------------------------------------------------------------------------------+
|topic|topics_words                                                                       |
+-----+-----------------------------------------------------------------------------------+
|0    |[cortex, movement, chip, delay, population, motor, mechanism, cortical]            |
|1    |[query, reinforcement, user, robot, plan, patch, track, player]                    |
|2    |[language, category, hierarchical, generative, relation, perceptron, speaker, text]|
|3    |[manifold, formulation, dimensionality, unlabeled, embed, prove, subspace, semi]   |
+-----+-----------------------------------------------------------------------------------+

+-----+--------------------+--------------------+--------------------+
|topic|         termIndices|         termWeights|        topics_words|
+-----+--------------------+--------------------+--------------------+
|    0|[0, 68, 69, 81, 3...|[0.00445160280738...|[

In [32]:
# Shows the result
docTopic = model.transform(sparsevector)
docTopic.printSchema()
docTopic.select(['_1','topicDistribution']).show(10,truncate=False)

root
 |-- _1: long (nullable = true)
 |-- _2: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)

+----+-----------------------------------------------------------------------------------+
|_1  |topicDistribution                                                                  |
+----+-----------------------------------------------------------------------------------+
|1   |[0.14339740836927742,0.49296136110318556,0.23241814962348656,0.1312230809040504]   |
|10  |[0.8101573742702054,0.05280950885829635,0.07346853379876724,0.0635645830727309]    |
|100 |[0.9292264251707395,0.024301153439354503,0.027989042771909344,0.018483378617996475]|
|1000|[0.06386774955866174,0.505254604713889,0.256530875064119,0.17434677066333015]      |
|1001|[0.07547023431497453,0.11683414019216311,0.5198599207355684,0.2878357047572939]    |
|1002|[0.05789021243946618,0.24199817987376968,0.5102892432532927,0.18982236443347145]   |
|1003|[0.5176273149800396,0.0714811472793829,0.2451539195995

hello it's me