In [1]:
import json
from pathlib import Path
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

In [2]:
configFile = Path('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/testproject/LDAmodels/S2CS_25/trainconfig.json')
with configFile.open() as fin:
    train_config = json.load(fin)
print(train_config)

{'name': 'S2CS_25', 'description': 'kk', 'visibility': 'Public', 'trainer': 'mallet', 'TrDtSet': '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/testproject/datasets/S2CS.json', 'Preproc': {'min_lemas': 15, 'no_below': 10, 'no_above': 0.75, 'keep_n': 100000, 'stopwords': ['/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/english_generic.json', '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2_stopwords.json'], 'equivalences': ['/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2_equivalences.json']}, 'LDAparam': {'ntopics': 25, 'alpha': 5.0, 'optimize_interval': 10, 'num_threads': 4, 'num_iterations': 1000, 'doc_topic_thr': 0.0, 'thetas_thr': 0.003, 'token_regexp': '[\\p{L}\\p{N}][\\p{L}\\p{N}\\p{P}]*\\p{L}'}}


In [3]:
stopWords = []
for stwFile in train_config['Preproc']['stopwords']:
    with Path(stwFile).open('r', encoding='utf8') as fin:
        stopWords += json.load(fin)['wordlist']
        
stopWords = list(set(stopWords))

In [4]:
equivalences = {}
for eqFile in train_config['Preproc']['equivalences']:
    with Path(eqFile).open('r', encoding = 'utf8') as fin:
        newEq = json.load(fin)['wordlist']
    newEq = [x.split(':') for x in newEq]
    newEq = [x for x in newEq if len(x) == 2]
    newEq = dict(newEq)

equivalences = {**equivalences, **newEq}

In [5]:
#Load information about all data that should be incorporated in the training set
trDtFile = Path(train_config['TrDtSet'])
with trDtFile.open() as fin:
    trDtSet = json.load(fin)

#Iterate over datasets, and append them to a single dataframe
for idx, DtSet in enumerate(trDtSet['Dtsets']):
    df = spark.read.parquet(f"file://{DtSet['parquet']}")
    if len(DtSet['filter']):
        pass #Need spark command to carry out the filtering df = df.filter ...
    df = (
        df.withColumn("all_lemmas", F.concat_ws(' ', *DtSet['lemmasfld']))
          .withColumn("all_rawtext", F.concat_ws(' ', *DtSet['rawtxtfld']))
          .withColumn("source", F.lit(DtSet["source"]))
          .select("id", "source", "all_lemmas", "all_rawtext")
    )
    if idx==0:
        trDF = df
    else:
        trDF = trDF.union(df).distinct()
    

NameError: name 'spark' is not defined

In [206]:
%%time 

#tokenization
tk = Tokenizer(inputCol="all_lemmas", outputCol="tokens")
trDF = tk.transform(trDF)

#Removal of Stopwords
swr = StopWordsRemover(inputCol="tokens", outputCol="clean_tokens", stopWords=stopWords)
trDF = swr.transform(trDF)

#Filter according to number of lemmas in each document
trDF = trDF.where(F.size(F.col("clean_tokens")) >= train_config['Preproc']['min_lemas'])

#Equivalences replacement
df = trDF.select(trDF.id, F.explode(trDF.clean_tokens))
df = df.na.replace(equivalences, 1)
df = df.groupBy("id").agg(F.collect_list("col"))
trDF = (trDF.join(df, trDF.id == df.id, "left")
                      .drop(df.id)
                      .withColumnRenamed("collect_list(col)","final_tokens")
       )

#Select only the relevant columns
trDF = trDF.select("id", "source", "final_tokens", "all_rawtext")

print(trDF.count())
trDF.show(n=3, vertical=True, truncate=200)

22/06/13 16:26:48 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1197.0 KiB
                                                                                

10528046


[Stage 245:>                                                        (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 00004ddfe8089303589fb12cddc05fefc7a0bd96                                                                                                                                                                 
 source       | Semantic Scholar                                                                                                                                                                                         
 final_tokens | [static, total, causal, ordering, protocols, achieve, ordered, view, synchrony, view, synchronous, communication, vsc, service, provide, multicast, group, 1, reliable, message, delivery, 2, informa... 
 all_rawtext  | Using Static Total Causal Ordering Protocols to Achieve Ordered View Synchrony. A View Synchronous Communication

                                                                                

In [207]:
%%time

cntVec = CountVectorizer(inputCol="final_tokens",
            outputCol="bow", minDF=train_config['Preproc']['no_below'],
            maxDF=train_config['Preproc']['no_above'], 
            vocabSize=train_config['Preproc']['keep_n'])
cntVecModel = cntVec.fit(trDF)

                                                                                

CPU times: user 392 ms, sys: 72.1 ms, total: 464 ms
Wall time: 12min 29s


In [208]:
len(cntVecModel.vocabulary)

100000

In [209]:
%%time

trDFnew = cntVecModel.transform(trDF)
trDFnew.show(n=3, vertical=True, truncate=200)

22/06/13 18:13:21 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1196.9 KiB
[Stage 259:>                                                        (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 00004ddfe8089303589fb12cddc05fefc7a0bd96                                                                                                                                                                 
 source       | Semantic Scholar                                                                                                                                                                                         
 final_tokens | [static, total, causal, ordering, protocols, achieve, ordered, view, synchrony, view, synchronous, communication, vsc, service, provide, multicast, group, 1, reliable, message, delivery, 2, informa... 
 all_rawtext  | Using Static Total Causal Ordering Protocols to Achieve Ordered View Synchrony. A View Synchronous Communication

                                                                                

In [210]:
vocabulary = cntVecModel.vocabulary
spark.sparkContext.broadcast(vocabulary)

<pyspark.broadcast.Broadcast at 0x7f7eb10f0970>

In [211]:
def back2text(bow):
    text=""
    for idx, tf in zip(bow.indices, bow.values):
        text += int(tf) * (vocabulary[idx] + ' ')
    return text.strip()

back2textUDF = F.udf(lambda z: back2text(z)) 

In [212]:
trDF = trDFnew.withColumn("bow_text", back2textUDF(F.col("bow")))
trDF.show(n=3, vertical=True, truncate=200)

22/06/13 18:25:09 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1204.6 KiB
[Stage 262:>                                                        (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 00004ddfe8089303589fb12cddc05fefc7a0bd96                                                                                                                                                                 
 source       | Semantic Scholar                                                                                                                                                                                         
 final_tokens | [static, total, causal, ordering, protocols, achieve, ordered, view, synchrony, view, synchronous, communication, vsc, service, provide, multicast, group, 1, reliable, message, delivery, 2, informa... 
 all_rawtext  | Using Static Total Causal Ordering Protocols to Achieve Ordered View Synchrony. A View Synchronous Communication

                                                                                

In [190]:
trDFnew.withColumn("2mallet", F.concat_ws(" 0 ", "id", "bow_text")).select("2mallet").show(n=3, vertical=True, truncate=200)

[Stage 212:>                                                        (0 + 1) / 1]

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 2mallet | 200121 0 research research research research research research research research research technology aim European base include propose improve activity activity activity activity level large addres... 
-RECORD 1-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 2mallet | 205675 0 understanding high energy energy result work area environment property low water enhance enhance range form offer relate benefit benefit benefit growth food food food surface determine str... 
-RECORD 2-------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [213]:
trDF.select("all_rawtext", "bow_text").write.parquet("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/test/S2CS_bow.parquet",
    mode="overwrite",
)

22/06/13 18:25:56 ERROR scheduler.TaskSchedulerImpl: Lost executor 6 on node51.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/13 18:25:56 WARN scheduler.TaskSetManager: Lost task 21.0 in stage 263.0 (TID 35881) (node51.cluster.tsc.uc3m.es executor 6): ExecutorLostFailure (executor 6 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/13 18:25:56 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 263.0 (TID 35871) (node51.cluster.tsc.uc3m.es executor 6): ExecutorLostFailure (executor 6 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/13 18:25:56 WARN scheduler.TaskSetManager: Lost task 1.0

In [214]:
trDF.sample(fraction=0.1).select("all_rawtext", "bow_text").write.parquet("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/test/S2CS_bow_medium.parquet",
    mode="overwrite",
)

trDF.sample(fraction=0.01).select("all_rawtext", "bow_text").write.parquet("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/test/S2CS_bow_small.parquet",
    mode="overwrite",
)

22/06/13 18:57:20 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1337.6 KiB
2022-06-13 18:57:53,222:2221(0x7f854dfd6640):ZOO_ERROR@handle_socket_error_msg@1782: Socket [10.0.12.77:2181] zk retcode=-4, errno=112(Host is down): failed while receiving a server response
I0613 18:57:53.222728  3175 group.cpp:452] Lost connection to ZooKeeper, attempting to reconnect ...
2022-06-13 18:57:53,222:2221(0x7f854dfd6640):ZOO_INFO@check_events@1764: initiated connection to server [10.0.12.18:2181]
2022-06-13 18:57:53,225:2221(0x7f854dfd6640):ZOO_INFO@check_events@1811: session establishment complete on server [10.0.12.18:2181], sessionId=0x404355563510006, negotiated timeout=10000
I0613 18:57:53.225706  3205 group.cpp:341] Group process (zookeeper-group(1)@192.168.148.225:37855) reconnected to ZooKeeper
I0613 18:57:53.225760  3205 group.cpp:831] Syncing group operations: queue size (joins, cancels, datas) = (0, 0, 0)
22/06/13 19:17:17 WARN scheduler.DAGScheduler: Broadcasting