In [9]:
# Bootstrap cell: SPARK_HOME is assigned and findspark.init() is called before
# the first `import pyspark`, so the statement order below must be preserved.
import os
# NOTE(review): machine-specific absolute path that unconditionally overwrites
# any SPARK_HOME already present in the environment — confirm this is intended.
os.environ['SPARK_HOME'] = '/opt/spark/spark-3.0.1-bin-hadoop2.7'
import findspark
# Presumably uses SPARK_HOME to make the pyspark package importable — confirm
# against the findspark documentation.
findspark.init()
import pyspark
from pyspark.sql import SparkSession, functions
from pyspark.sql.functions import asc, count, col

# Project-local module; the data-loading cell reads configuration.PATHS from it.
import configuration

# Notebook-wide Spark session: runs in local mode ("local[*]") under the
# application name 'Topic Modelling'; getOrCreate() hands back the builder's
# session object that every later cell uses as `spark`.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Topic Modelling')
    .getOrCreate()
)

In [2]:
# Location of the NYTimes docword file, taken from the project configuration.
path = configuration.PATHS["docword"]["nytimes"]

# Load the file with the plain-text source: one row per line, exposed in a
# single string column named `value` (the column the next cell parses).
df = spark.read.format("text").load(path)

In [3]:
# The first three rows of the raw text are parsed as integers into numDocs,
# numWordsVocab and numWordsCollection. They must be read here, before the
# filter below removes every row that has no space in it.
# NOTE(review): presumably the UCI bag-of-words header layout — confirm.
metadata = df.head(3)
numDocs = int(metadata[0].value)
numWordsVocab = int(metadata[1].value)
numWordsCollection = int(metadata[2].value)

# Keep only the space-separated data rows and split each into its fields.
df = df.filter(df["value"].contains(" "))
split_col = functions.split(df["value"], " ")

# split() produces strings; cast each field to int so ids sort numerically and
# `count` can be aggregated arithmetically without relying on implicit coercion.
df = (
    df.withColumn("docId", split_col.getItem(0).cast("int"))
      .withColumn("wordId", split_col.getItem(1).cast("int"))
      .withColumn("count", split_col.getItem(2).cast("int"))
      .drop("value")
)

In [4]:
# Preview the parsed (docId, wordId, count) rows; 20 rows with truncation are
# show()'s defaults, spelled out here for the reader.
df.show(n=20, truncate=True)

+-----+------+-----+
|docId|wordId|count|
+-----+------+-----+
|    1|   413|    1|
|    1|   534|    1|
|    1|  2340|    1|
|    1|  2806|    1|
|    1|  3059|    1|
|    1|  3070|    1|
|    1|  3294|    1|
|    1|  3356|    1|
|    1|  4056|    1|
|    1|  4930|    1|
|    1|  5255|    1|
|    1|  6888|    1|
|    1|  6946|    1|
|    1|  6974|    2|
|    1|  7296|    1|
|    1|  7402|    1|
|    1|  7405|    1|
|    1|  7409|    1|
|    1|  7544|    1|
|    1|  7790|    1|
+-----+------+-----+
only showing top 20 rows



In [5]:
# Total number of documents: how many distinct docId values appear in the data.
print(
    df.select("docId")
      .distinct()
      .count()
)

299752


In [6]:
# For every document, count the rows that carry a non-null `count` value,
# i.e. one per (docId, wordId) entry of that document.
# NOTE(review): this tallies entries, it does not add up the `count` values —
# confirm that "word_count" is meant as entries per document, not total occurrences.
documentWordCount = (
    df.groupBy("docId")
      .agg(count(col("count")).alias("word_count"))
)

documentWordCount.show()

+-----+----------+
|docId|word_count|
+-----+----------+
|  296|       260|
|  467|       196|
|  675|       164|
|  691|       322|
|  829|       251|
| 1090|       153|
| 1159|       194|
| 1436|       203|
| 1512|       164|
| 1572|       234|
| 2069|        15|
| 2088|       166|
| 2136|       239|
| 2162|       112|
| 2294|       261|
| 2904|       160|
| 3210|       236|
| 3414|       168|
| 3606|       147|
| 3959|       300|
+-----+----------+
only showing top 20 rows



In [7]:
# Show the documents with the smallest word_count first (ascending order).
documentWordCount.orderBy(asc("word_count")).show()

+------+----------+
| docId|word_count|
+------+----------+
| 69866|         1|
| 37339|         1|
| 94290|         1|
|130691|         1|
| 82968|         1|
|130701|         1|
|161376|         1|
|196096|         1|
|130675|         1|
|130753|         1|
|142760|         1|
|130724|         1|
| 78562|         1|
|130757|         1|
|161374|         1|
|106369|         1|
|130730|         1|
| 92116|         1|
| 92129|         1|
| 54759|         1|
+------+----------+
only showing top 20 rows



'/home/giangvdq/data/docword.nytimes.txt.gz'