In [129]:
import json
from pathlib import Path
import pyspark.sql.functions as F
from pyspark.sql.types import StructType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

In [80]:
# Load the training configuration (JSON) that drives every later cell:
# dataset descriptor path, preprocessing thresholds, wordlist files and LDA
# parameters are all read from `train_config`.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable constant so the notebook can run elsewhere.
configFile = Path('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/testproject/LDAmodels/first_model/trainconfig.json')
# Open with an explicit UTF-8 encoding so decoding does not depend on the
# platform locale (the wordlist files below are opened the same way).
with configFile.open('r', encoding='utf8') as fin:
    train_config = json.load(fin)
print(train_config)

{'name': 'first_model', 'description': 'model for implementation debugging', 'visibility': 'Public', 'trainer': 'mallet', 'TrDtSet': '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/testproject/datasets/S2Health.json', 'Preproc': {'min_lemas': 15, 'no_below': 10, 'no_above': 0.6, 'keep_n': 500000, 'stopwords': ['/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/english_generic.json', '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2_stopwords.json'], 'equivalences': ['/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/wordlists/S2_equivalences.json']}, 'LDAparam': {'ntopics': 25, 'alpha': 5.0, 'optimize_interval': 10, 'num_threads': 4, 'num_iterations': 1000, 'doc_topic_thr': 0.0, 'thetas_thr': 0.003, 'token_regexp': '[\\p{L}\\p{N}][\\p{L}\\p{N}\\p{P}]*\\p{L}'}}


In [86]:
# Gather the stopwords from every file listed in the configuration.
# Each file is a JSON document whose 'wordlist' entry holds the words.
# Accumulating into a set removes duplicates across files; the result is
# turned back into a list at the end.
stopWords = set()
for stwFile in train_config['Preproc']['stopwords']:
    with Path(stwFile).open('r', encoding='utf8') as fin:
        stopWords.update(json.load(fin)['wordlist'])

stopWords = list(stopWords)

In [91]:
# Build the term -> replacement dictionary from every equivalences file listed
# in the configuration. Each file is a JSON document whose 'wordlist' entry
# holds strings of the form 'original:replacement'.
equivalences = {}
for eqFile in train_config['Preproc']['equivalences']:
    with Path(eqFile).open('r', encoding='utf8') as fin:
        newEq = json.load(fin)['wordlist']
    # Split each entry on ':' and keep only well-formed pairs (exactly two parts)
    newEq = [x.split(':') for x in newEq]
    newEq = [x for x in newEq if len(x) == 2]
    newEq = dict(newEq)
    # Merge inside the loop so that every file contributes (entries from later
    # files override earlier ones). Merging after the loop would keep only the
    # last file and fail with a NameError when no files are configured.
    equivalences.update(newEq)

In [127]:
#Load information about all data that should be incorporated in the training set
trDtFile = Path(train_config['TrDtSet'])
# Explicit UTF-8 so decoding does not depend on the platform locale
with trDtFile.open('r', encoding='utf8') as fin:
    trDtSet = json.load(fin)

#Iterate over datasets, and append them to a single dataframe
# Start from None so that an empty dataset list cannot silently reuse a stale
# `trDF` left in the kernel by a previous run of this cell.
trDF = None
for idx, DtSet in enumerate(trDtSet['Dtsets']):
    # Each dataset is a parquet table on the local filesystem
    df = spark.read.parquet(f"file://{DtSet['parquet']}")
    if len(DtSet['filter']):
        # TODO: filtering is not implemented yet; a non-empty 'filter' is
        # currently ignored and the whole dataset is used.
        pass #Need spark command to carry out the filtering df = df.filter ...
    # Concatenate the configured lemma / raw-text fields into single columns,
    # tag every row with its source, and keep only the columns used downstream
    df = (
        df.withColumn("all_lemmas", F.concat_ws(' ', *DtSet['lemmasfld']))
          .withColumn("all_rawtext", F.concat_ws(' ', *DtSet['rawtxtfld']))
          .withColumn("source", F.lit(DtSet["source"]))
          .select("id", "source", "all_lemmas", "all_rawtext")
    )
    trDF = df if idx == 0 else trDF.union(df)

if trDF is None:
    raise ValueError(f"No datasets listed under 'Dtsets' in {trDtFile}")

# Remove duplicated rows once, after all unions, instead of after every union.
# The result is the same; as before, a single dataset is left untouched.
if len(trDtSet['Dtsets']) > 1:
    trDF = trDF.distinct()
    

In [128]:
%%time 

#tokenization
# Split the concatenated lemmas column into an array of tokens
tk = Tokenizer(inputCol="all_lemmas", outputCol="tokens")
trDF = tk.transform(trDF)

#Removal of Stopwords
swr = StopWordsRemover(inputCol="tokens", outputCol="clean_tokens", stopWords=stopWords)
trDF = swr.transform(trDF)

#Filter according to number of lemmas in each document
trDF = trDF.where(F.size(F.col("clean_tokens")) >= train_config['Preproc']['min_lemas'])

#Equivalences replacement
# Explode to one row per (id, token); explode names the new column "col"
df = trDF.select(trDF.id, F.explode(trDF.clean_tokens))
# Restrict the replacement to the token column so that document ids are never
# rewritten. No `value` argument is passed: it is ignored when `to_replace`
# is a dict.
df = df.na.replace(equivalences, subset=["col"])
# Re-assemble the token list of every document, naming the column explicitly
# instead of relying on the auto-generated "collect_list(col)" name.
# NOTE(review): collect_list after a groupBy does not guarantee the original
# token order; this should be harmless for bag-of-words use - confirm.
df = df.groupBy("id").agg(F.collect_list("col").alias("final_tokens"))
trDF = (trDF.join(df, trDF.id == df.id, "left")
                      .drop(df.id)
       )

#Select only the relevant columns
trDF = trDF.select("id", "source", "final_tokens", "all_rawtext")

# NOTE(review): count() and show() each trigger the full pipeline above, and so
# will any later action on trDF, since nothing is cached or persisted here.
print(trDF.count())
trDF.show(n=3, vertical=True, truncate=200)

                                                                                

36005467


[Stage 98:>                                                         (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 00003f6077eb0dd01f1e0499a4ac10b50195d640                                                                                                                                                                 
 source       | Semantic Scholar                                                                                                                                                                                         
 final_tokens | [cumulative, risk, physiological, stress, responses, african, american, adolescents, objective, investigate, association, component, cumulative, risk, cognitive_radio, physiological, stress, respon... 
 all_rawtext  | Cumulative Risk and Physiological Stress Responses in African American Adolescents. Objective: To investigate as

                                                                                

In [131]:
%%time

# Vocabulary settings are taken from the 'Preproc' section of the configuration:
#   no_below -> minDF, no_above -> maxDF, keep_n -> vocabSize.
# Per the Spark ML documentation, minDF/maxDF values >= 1 are read as absolute
# document counts and values in [0, 1) as fractions of the corpus.
preproc = train_config['Preproc']
cntVec = CountVectorizer(
    inputCol="final_tokens",
    outputCol="bow",
    minDF=preproc['no_below'],
    maxDF=preproc['no_above'],
    vocabSize=preproc['keep_n'],
)
# Learn the vocabulary from the token lists of the whole training dataframe
cntVecModel = cntVec.fit(trDF)

22/06/13 01:58:08 ERROR scheduler.TaskSchedulerImpl: Lost executor 0 on node22.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/13 01:58:08 WARN scheduler.TaskSetManager: Lost task 321.0 in stage 104.0 (TID 21385) (node22.cluster.tsc.uc3m.es executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/13 01:58:08 WARN scheduler.TaskSetManager: Lost task 324.0 in stage 104.0 (TID 21388) (node22.cluster.tsc.uc3m.es executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/13 01:58:08 WARN scheduler.TaskSetManager: Lost task 3

CPU times: user 3.58 s, sys: 930 ms, total: 4.51 s
Wall time: 2h 37min 8s


In [132]:
# Number of terms in the vocabulary learned by the fitted CountVectorizer model
len(cntVecModel.vocabulary)

500000

In [None]:
%%time

# Apply the fitted CountVectorizer model; it adds the "bow" output column.
# NOTE: trDF is reassigned, so this cell is not meant to be run twice in a row.
trDF = cntVecModel.transform(trDF)
# Preview the first three records, one field per line, cutting long values
trDF.show(3, truncate=200, vertical=True)

22/06/13 10:17:10 WARN scheduler.TaskSetManager: Lost task 171.0 in stage 136.0 (TID 26463) (node51.cluster.tsc.uc3m.es executor 6): java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:223)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:176)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:539)
	