In [178]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql import DataFrameWriter
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import StringIndexer
import pandas as pd

db = "sm"


In [179]:
#tweets_sdf = spark.sql("SELECT status_id, textTranslated AS text FROM " + db + ".twitter")
df = spark.sql("SELECT status_id, text FROM " + db + ".twitter WHERE lang = 'en'")

In [180]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W", minTokenLength=4, 
                                toLowercase=True)
df = regexTokenizer.transform(df)

In [181]:
#df.show()

In [182]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df = remover.transform(df)

In [183]:
df.show()

+------------------+--------------------+--------------------+--------------------+
|         status_id|                text|               words|            filtered|
+------------------+--------------------+--------------------+--------------------+
|972175221481459712|When your wonderf...|[when, your, wond...|[wonderful, frien...|
|972082111606939648|@GrandHuit Rettet...|[grandhuit, rette...|[grandhuit, rette...|
|973220871551807490|"The melon patent...|[melon, patent, p...|[melon, patent, p...|
|971416727862108160|@NoPatentsOnSeed ...|[nopatentsonseed,...|[nopatentsonseed,...|
|973337751893479424|@BigCatRescue @Th...|[bigcatrescue, th...|[bigcatrescue, th...|
|972073473559420929|How can we achiev...|[achieve, equalit...|[achieve, equalit...|
|972052286439600129|Could a fish be s...|[could, fish, sai...|[fish, said, sust...|
|970597439383261184|@MareMundi Super ...|[maremundi, super...|[maremundi, super...|
|970366504293273600|Today, 30 years a...|[today, years, vi...|[today, years,

In [171]:
cv = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=100, minDF=10.0)
model = cv.fit(df)
model.transform(df).show()
df = model.transform(df)

+------------------+--------------------+--------------------+--------------------+--------------------+
|         status_id|                text|               words|            filtered|            features|
+------------------+--------------------+--------------------+--------------------+--------------------+
|972175221481459712|When your wonderf...|[when, your, wond...|[wonderful, frien...|(100,[0,19,48],[1...|
|972082111606939648|@GrandHuit Rettet...|[grandhuit, rette...|[grandhuit, rette...|     (100,[0],[1.0])|
|973220871551807490|"The melon patent...|[melon, patent, p...|[melon, patent, p...|     (100,[0],[1.0])|
|971416727862108160|@NoPatentsOnSeed ...|[nopatentsonseed,...|[nopatentsonseed,...|         (100,[],[])|
|973337751893479424|@BigCatRescue @Th...|[bigcatrescue, th...|[bigcatrescue, th...|         (100,[],[])|
|972073473559420929|How can we achiev...|[achieve, equalit...|[achieve, equalit...|(100,[0,22],[1.0,...|
|972052286439600129|Could a fish be s...|[could, fish, 

In [184]:
model = cv.fit(df)

In [185]:
vocabulary_local = pd.DataFrame(model.vocabulary, columns = ["word"])



In [186]:
vocabulary_local.head()

Unnamed: 0,word
0,https
1,islam
2,immigration
3,muslim
4,migration


In [173]:
df=df.drop('text', 'words','filtered')
df.show()


+------------------+--------------------+
|         status_id|            features|
+------------------+--------------------+
|972175221481459712|(100,[0,19,48],[1...|
|972082111606939648|     (100,[0],[1.0])|
|973220871551807490|     (100,[0],[1.0])|
|971416727862108160|         (100,[],[])|
|973337751893479424|         (100,[],[])|
|972073473559420929|(100,[0,22],[1.0,...|
|972052286439600129|(100,[0,86],[1.0,...|
|970597439383261184|         (100,[],[])|
|970366504293273600|(100,[0,32,69],[1...|
|970254178667855872|     (100,[0],[1.0])|
|970240136410693632|(100,[0,32],[2.0,...|
|970236661392138240|(100,[0,19,74],[1...|
|970211668130238464|     (100,[0],[1.0])|
|972393251180351488|     (100,[0],[1.0])|
|972043961081782273|     (100,[0],[2.0])|
|971707912488914944|(100,[0,31],[1.0,...|
|971387478237220865|(100,[30,53,78],[...|
|971037462511472640|     (100,[0],[1.0])|
|971035770487222272|(100,[0,52],[1.0,...|
|970935767429238784|(100,[0,73],[1.0,...|
+------------------+--------------

In [174]:
lda = LDA(k=10, maxIter=10)
model = lda.fit(df)

In [175]:
model.describeTopics().show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[20, 18, 1, 0, 65...|[0.12914028153688...|
|    1|[0, 2, 1, 31, 69,...|[0.18396579659795...|
|    2|[2, 0, 11, 9, 15,...|[0.30149179928421...|
|    3|[0, 1, 12, 16, 42...|[0.22684828921016...|
|    4|[10, 39, 52, 1, 0...|[0.30393725650092...|
|    5|[1, 5, 0, 8, 49, ...|[0.25417454693979...|
|    6|[1, 0, 22, 3, 19,...|[0.19677070507123...|
|    7|[0, 4, 2, 29, 45,...|[0.39747374003316...|
|    8|[0, 1, 26, 13, 9,...|[0.19733309422827...|
|    9|[0, 1, 3, 7, 6, 1...|[0.31556296399376...|
+-----+--------------------+--------------------+

