In [126]:
import json
from pathlib import Path
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

In [127]:
configFile = Path('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/testproject/LDAmodels/S2CS_25/trainconfig.json')
with configFile.open() as fin:
    train_config = json.load(fin)
print(train_config)

{'name': 'S2CS_25', 'description': 'kk', 'visibility': 'Public', 'trainer': 'mallet', 'TrDtSet': '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/testproject/datasets/S2CS.json', 'Preproc': {'min_lemas': 15, 'no_below': 10, 'no_above': 0.75, 'keep_n': 100000, 'stopwords': ['/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/english_generic.json', '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2_stopwords.json', '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2CS_stopwords.json'], 'equivalences': ['/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2_equivalences.json', '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2CS_equivalences.json']}, 'LDAparam': {'ntopics': 25, 'alpha': 5.0, 'optimize_interval': 10, 'num_threads': 4, 'num_iterations': 1000, 'doc_topic_thr': 0.0, 'thetas_thr': 0.003, 'token_regexp': '[\\p{L}\\p{N}][\\p{L}

In [128]:
stopWords = []
for stwFile in train_config['Preproc']['stopwords']:
    with Path(stwFile).open('r', encoding='utf8') as fin:
        stopWords += json.load(fin)['wordlist']
        
stopWords = list(set(stopWords))

In [129]:
equivalences = {}
for eqFile in train_config['Preproc']['equivalences']:
    with Path(eqFile).open('r', encoding = 'utf8') as fin:
        newEq = json.load(fin)['wordlist']
    newEq = [x.split(':') for x in newEq]
    newEq = [x for x in newEq if len(x) == 2]
    newEq = dict(newEq)

equivalences = {**equivalences, **newEq}

In [130]:
#Load information about all data that should be incorporated in the training set
trDtFile = Path(train_config['TrDtSet'])
with trDtFile.open() as fin:
    trDtSet = json.load(fin)

#Iterate over datasets, and append them to a single dataframe
for idx, DtSet in enumerate(trDtSet['Dtsets']):
    df = spark.read.parquet(f"file://{DtSet['parquet']}")
    if len(DtSet['filter']):
        pass #Need spark command to carry out the filtering df = df.filter ...
    df = (
        df.withColumn("all_lemmas", F.concat_ws(' ', *DtSet['lemmasfld']))
          .withColumn("all_rawtext", F.concat_ws(' ', *DtSet['rawtxtfld']))
          .withColumn("source", F.lit(DtSet["source"]))
          .select("id", "source", "all_lemmas", "all_rawtext")
    )
    if idx==0:
        trDF = df
    else:
        trDF = trDF.union(df).distinct()
    

In [131]:
print(trDF.count())

14801878


In [132]:
%%time 

trDF = trDF.sample(fraction=3e-4)

#tokenization
tk = Tokenizer(inputCol="all_lemmas", outputCol="tokens")
trDF = tk.transform(trDF)

#Removal of Stopwords
swr = StopWordsRemover(inputCol="tokens", outputCol="clean_tokens", stopWords=stopWords)
trDF = swr.transform(trDF)

#Filter according to number of lemmas in each document
trDF = trDF.where(F.size(F.col("clean_tokens")) >= train_config['Preproc']['min_lemas'])

#Equivalences replacement
df = trDF.select(trDF.id, F.explode(trDF.clean_tokens))
df = df.na.replace(equivalences, 1)
df = df.groupBy("id").agg(F.collect_list("col"))
trDF = (trDF.join(df, trDF.id == df.id, "left")
                      .drop(df.id)
                      .withColumnRenamed("collect_list(col)","final_tokens")
       )

#Select only the relevant columns
trDF = trDF.select("id", "source", "final_tokens", "all_rawtext")

print(trDF.count())
trDF.show(n=3, vertical=True, truncate=200)


                                                                                

3143


                                                                                

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 0148c2831c3651a513b7a3f8287458be285d624e                                                                                                                                                                 
 source       | Semantic Scholar                                                                                                                                                                                         
 final_tokens | [database, urdu, detection, recognition, natural, images, database, urdu, detection, recognition, natural, standard, benchmark, latin, publish, remarkable, classification, recognition, extraction, ... 
 all_rawtext  | A Database for Urdu Text Detection and Recognition in Natural Scene Images. This paper describes a novel databas

In [133]:
%%time

cntVec = CountVectorizer(inputCol="final_tokens",
            outputCol="bow", minDF=train_config['Preproc']['no_below'],
            maxDF=train_config['Preproc']['no_above'], 
            vocabSize=train_config['Preproc']['keep_n'])
cntVecModel = cntVec.fit(trDF)



CPU times: user 29.7 ms, sys: 0 ns, total: 29.7 ms
Wall time: 6.62 s


                                                                                

In [134]:
len(cntVecModel.vocabulary)

2344

In [140]:
%%time

trDFnew = cntVecModel.transform(trDF)
trDFnew.show(n=3, vertical=True, truncate=1000)

                                                                                

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [144]:
vocabulary = cntVecModel.vocabulary
spark.sparkContext.broadcast(vocabulary)

<pyspark.broadcast.Broadcast at 0x7f5359a7e610>

In [145]:
def back2text(bow):
    text=""
    for idx, tf in zip(bow.indices, bow.values):
        text += int(tf) * (vocabulary[idx] + ' ')
    return text.strip()

back2textUDF = F.udf(lambda z: back2text(z)) 

In [147]:
trDF = trDFnew.withColumn("bow_text", back2textUDF(F.col("bow")))
trDF.show(n=3, vertical=True, truncate=500)

[Stage 238:>                                                        (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 0148c2831c3651a513b7a3f8287458be285d624e                                                                                                                                                                                                                                                                                                                                                                                                                                          

                                                                                

In [164]:
    trDF = trDF.withColumn("testcol", trDF["final_tokens"])
trDF.show(n=3)

[Stage 247:>                                                        (0 + 1) / 1]

+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|          source|        final_tokens|         all_rawtext|                 bow|            bow_text|             testcol|
+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|0148c2831c3651a51...|Semantic Scholar|[database, urdu, ...|A Database for Ur...|(2344,[3,24,29,34...|algorithm include...|[database, urdu, ...|
|18a0b53e28ecf8c93...|Semantic Scholar|[disclosure, prev...|DISCLOSURE PREVEN...|(2344,[1,4,5,6,20...|data data data da...|[disclosure, prev...|
|263c3219c4882f3e2...|Semantic Scholar|[web, accessibili...|Web Accessibility...|(2344,[1,4,5,11,1...|data process info...|[web, accessibili...|
+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+-------

                                                                                

In [190]:
trDFnew.withColumn("2mallet", F.concat_ws(" 0 ", "id", "bow_text")).select("2mallet").show(n=3, vertical=True, truncate=200)

[Stage 212:>                                                        (0 + 1) / 1]

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 2mallet | 200121 0 research research research research research research research research research technology aim European base include propose improve activity activity activity activity level large addres... 
-RECORD 1-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 2mallet | 205675 0 understanding high energy energy result work area environment property low water enhance enhance range form offer relate benefit benefit benefit growth food food food surface determine str... 
-RECORD 2-------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [13]:
trDF.select("all_rawtext", "bow_text").write.parquet("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/test/S2CS_bow_v2.parquet",
    mode="overwrite",
)

22/06/17 11:35:33 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1344.3 KiB
                                                                                

In [14]:
trDF.sample(fraction=0.1).select("all_rawtext", "bow_text").write.parquet("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/test/S2CS_bow_medium_v2.parquet",
    mode="overwrite",
)

trDF.sample(fraction=0.01).select("all_rawtext", "bow_text").write.parquet("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/test/S2CS_bow_small_v2.parquet",
    mode="overwrite",
)

22/06/17 11:57:17 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1345.0 KiB
22/06/17 12:17:56 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1345.0 KiB
----------------------------------------                                        
Exception happened during processing of request from ('127.0.0.1', 36418)
Traceback (most recent call last):
  File "/usr/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.8/socketserver.py", line 747, in __init__
    self.handle()
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/opt/spark-3.1.1-bi