In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as f
from pyspark import SparkContext
from sparknlp.pretrained import PretrainedPipeline
from sparknlp import Finisher
from pyspark.ml import Pipeline
from sparknlp.base import *
from sparknlp.annotator import *
import pandas as pd
import re
import os

In [3]:
# Start (or reuse) a local Spark session configured for Spark NLP.
spark = (
    SparkSession.builder
    .appName("nlp converter")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    # Kryo has to be the active serializer for the buffer limit below to have any effect.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    # A value without a unit is read as bytes, so the former "300" capped collected
    # results at 300 bytes. "0" disables the limit.
    .config("spark.driver.maxResultSize", "0")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.6")
    .getOrCreate()
)
# SQLContext takes a SparkContext as its first argument; the session is passed
# explicitly as the second one so both objects share the same state.
sqlContext = SQLContext(spark.sparkContext, spark)
spark

KeyboardInterrupt: 

In [None]:
# Alternative pandas/feather loader (disabled):
# path = r"files\raw_texts.feather"
# pandas_df =  pd.read_feather(source+path).drop_duplicates(subset = 'URL', keep = 'first').reset_index(drop=True)
# pandas_df = pandas_df[pandas_df['URL_TEXT']!=""]
# spark_df = sqlContext.createDataFrame(pandas_df)

# '__file__' is a quoted literal here, so realpath() resolves it against the current
# working directory; everything up to the first "src" in that path is used as the root.
source = os.path.dirname(os.path.realpath('__file__')).split("src")[0]
# Build the path with os.path.join instead of a hard-coded backslash so it also works
# outside Windows and when `source` has no trailing separator.
path = os.path.join("files", "raw_texts.parquet")
spark_df = spark.read.parquet(os.path.join(source, path)).where(f.col("URL_TEXT") != "")
# withColumn() needs a Column expression, not a column name string. The raw URL_TEXT
# column is kept and copied into `text`, the column the cleaning steps work on.
spark_df = spark_df.withColumn("text", f.col("URL_TEXT"))

In [None]:
# Regex patterns that are stripped from the text, grouped by the kind of noise they
# target. They are handed to Spark's regexp_replace, i.e. interpreted as Java regexes,
# and are applied in the order in which they are concatenated at the bottom.
# NOTE(review): patterns anchored with ^...$ only fire when the whole column value
# matches, and the stop words are lower-case and not word-bounded - confirm that this
# is the intended matching behaviour.
xml = [r"(?:<from.*?>)(.*?)(?:<\/from>)"]
html = ["<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"]
random_pattern = [
    r'^@.*\{.*\}', r'^\..*\{.*\}', r'\s\s+', r'\n', r'\xa0', r'dbx707',
    r'\xe2', r'\x80', r"\x8b",
    # Braces must be escaped: Java rejects a bare "{" as an illegal repetition.
    # NOTE(review): the body still matches literal dots only - confirm vs. ".*".
    r"\{\{\.*\}\}",
    r"\x9d", r"\u200b",
]  # only digits: r'\b[0-9]+\b\s*'
url = [
    "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[äöüßa-zA-Z0-9()]{1,6}\\b(?:[-a-zäöüßA-Z0-9()@:%_\\+.~#?&\\/=]*)$",
    r"www\w*de",
    r"www\w*com",
]
email = [r"^\S+@\S+\.\S+$"]
# Named zip_code rather than zip so the builtin zip() stays usable in later cells.
zip_code = [r"^[0-9]{5}(?:-[0-9]{4})?\s?\w*$"]
phone = [r"^\+?[1-9][0-9]{7,14}$"]
dates = [
    r"^[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{4}$",
    r"^[0-9]{1,2}\-[0-9]{1,2}\-[0-9]{4}$",
    r"^[0-9]{4}\-[0-9]{1,2}\-[0-9]{1,2}$",
]
# Every entry is followed by a comma: adjacent string literals without one are silently
# concatenated by Python (previously "book" "anfahrt" became "bookanfahrt", etc.).
website_stopwords = [
    "explore", "allgemeine geschäftsbedingungen", r"allgemein\w*", r"richtlinie\w*",
    r"\w*recht\w* hinweis\w*", r"\w*recht\w*", r"\w*datenschutz\w*", "privacy",
    r"policy\w*", r"cooky\w*", r"cookie\w*", r"content\w*", " to ",
    r"anmeld\w*", r"abmeld\w*", "login", "log in", "logout", "log out", "kunden login",
    "online", "zurück", "back", "start", r"select\w*", r"ausw\w*", "close",
    "extras", "news", r"report\w*", "impressum", r"newsletter\w*", "owner", "internet",
    r"website\w*", r"email\w*", r"e-mail\w*", r"mail\w*", "isbn", "issn",
    "produkte", "partner", "übersicht", "veranstaltungen", r"suche\w*", r"kauf\w*",
    r"angebot\w*", r"konfigur\w*", r"configur\w*", r"nutzer\w*", r"icon\w*",
    "zubehör", "garantie", "mehr", r"modell\w*", r"kontakt\w*", r"contact\w*",
    r"anfrage\w*", "skip", "useful links", r"link\w*", r"pin\w*", r"passw\w*",
    r"password\w*",
    "buchen", "book", "anfahrt", r"finanzdienstleistung\w*", "connected", "required",
    r"sitemap\w*", r"\w*\s?abo\w*", "social media", "socialmedia",
    "englisch", "english", "deutsch", "german", "google", "wikipedia", "navigation",
    r"\w*shop\w*", r"\w*magazin\w*", "lifestyle",
    r"facebook\w*", r"youtube\w*", r"instagram\w*", r"xing\w*", r"linkedin\w*",
    r"blog\w*", r"spiegel\w*", r"twitter\w*", "sms", "video",
    r"archiv\w*", r"artikel\w*", r"article\w*", r"side\w*", r"seite\w*", "site",
    r"app\w*", r"\s?abgerufen\s?\w*\s*\d*",
    "januar", "februar", "märz", "april", "mai", "juni", "juli", "august",
    "september", "oktober", "november", "dezember",
    "dbx707", "db11", r"\w*\s?straße\s?\d*", r"\w*\s?strasse\w*", r"tel\w*",
    r"\w*\s?download\w*",
    r"covid\w*\s?\d*", r"corona\w*\s?\d*",
]

domain_stopwords = [
    # Parentheses are escaped so the literal text "(g/km)" is matched, not a group.
    r"\(g/km\)", r"use case\w*", r"unternehme\w*", "gmbh", "cokg", "co kg",
    r"consult\w*", r"handel\w*", r"händler\w*", r"leistung\w*",
]
numbers_only = [
    r"^\d+$",
    r"^\s?[0-9]+(\s+[0-9]+)*\s?$",
    # Bounded character classes instead of greedy ".*": each bracketed span is removed
    # on its own rather than everything between the first opener and the last closer.
    r"\([^)]*\)",
    r"\[[^\]]*\]",
    r"^\d+.\d+",
    r" \d+ ",
]
special_characters = ['[^äöüßA-Za-z0-9 ]+']#['[\(,.:\);^]']
# Defined but not part of all_pattern_to_remove below.
short_words = [r'^\w{0,3}$', r'^\s+']

all_pattern_to_remove = (
    xml + html + random_pattern + url + email + zip_code + phone + dates
    + website_stopwords + domain_stopwords + numbers_only + special_characters
)
# Build one nested regexp_replace expression over URL_TEXT instead of stacking one
# withColumn() per pattern: same result, but a single projection in the query plan.
# (withColumn() also requires a Column, not the column name as a plain string.)
cleaned_text = f.col("URL_TEXT")
for pattern in all_pattern_to_remove:
    cleaned_text = f.regexp_replace(cleaned_text, pattern, "")
# Strip the leading/trailing whitespace left behind by the removals.
spark_df = spark_df.withColumn("text", f.trim(cleaned_text))

In [None]:
# Spark NLP pipeline: raw text -> document -> language / sentences -> tokens -> plain columns.

# Wraps the cleaned `text` column into Spark NLP document annotations.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Pretrained language identification on the document, written to `lang`.
language_detector = LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21")\
    .setInputCols(["document"])\
    .setOutputCol("lang")\
    .setThreshold(0.8)\
    .setCoalesceSentences(True)

# The output column must be spelled exactly like the tokenizer's input column below;
# it used to be "Sentence", which the tokenizer ("sentence") could not find.
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

regexTokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# NOTE(review): only `token` is handed to the Finisher; confirm whether `lang` should
# be finished as well if the detected language is needed downstream.
finisher = Finisher() \
    .setInputCols(["token"]) \
    .setIncludeMetadata(True)

pipeline = Pipeline() \
.setStages([
    documentAssembler,
    language_detector,
    sentenceDetector,
    regexTokenizer,
    finisher
    ])

# Fit and apply the pipeline on the same DataFrame.
result = pipeline.fit(spark_df).transform(spark_df)

In [None]:
# Show the first row of the cleaned input DataFrame as the cell output.
spark_df.first()