## PySpark Dialog NLP

In [1]:
from os import environ

java8_path: str = r"C:\Java\jdk1.8.0_191"

environ["JAVA_HOME"] =  java8_path
environ["PATH"] = environ["JAVA_HOME"] + "/bin;" + environ["PATH"]

! java -version

java version "1.8.0_191"
Java(TM) SE Runtime Environment (build 1.8.0_191-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.191-b12, mixed mode)


In [2]:
import re
import sys
from pathlib import Path
from operator import add
import urllib.request as ureq
from io import BytesIO, StringIO

import lxml.etree

In [3]:
# Raw-download URL of the TED talks XML file fetched below.
xml_raw_url: str = (
    "https://github.com/oxford-cs-deepnlp-2017/practical-1"
    "/blob/master/ted_en-20160408.xml?raw=true"
)

### Data Processing Helpers

In [4]:
def get_xml_data(url: str):
    """Download an XML document and parse it with lxml.

    Args:
        url: Address of the XML resource to fetch.

    Returns:
        The parsed document, as returned by ``lxml.etree.parse``.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
            (Previously this case fell through to the ``return`` with the
            result variable unbound, producing an ``UnboundLocalError``.)
        Any exception raised by ``urlopen`` or the lxml parser propagates
        unchanged.
    """
    with ureq.urlopen(url) as resp:
        if resp.status != 200:
            raise RuntimeError(
                f"Failed to fetch XML from {url!r}: unexpected HTTP status {resp.status}"
            )
        # Read the whole payload while the connection is open; parsing can
        # then happen from the in-memory buffer after the response is closed.
        _raw = BytesIO(resp.read())
    return lxml.etree.parse(_raw)


In [5]:
# Retain only the subtitle text of the dataset: collect the text nodes found
# under every <content> element and join them into one newline-separated string.
xml_doc = get_xml_data(xml_raw_url)
content_text_nodes = xml_doc.xpath("//content/text()")
input_text = "\n".join(content_text_nodes)

# The parsed tree and the node list are no longer needed once the text is joined.
del xml_doc, content_text_nodes

#### Clean up raw text data.

Uses regular expressions to remove any text enclosed in parentheses from the raw transcript.

In [6]:
# Matches any span of text enclosed in parentheses, including the parentheses.
clean_pattern: str = r"\([^\)]*\)"
# Matches a leading "letters and spaces:" prefix; only referenced by the
# disabled code at the bottom of this cell.
dialog_pattern: str = r"^([a-z ]+?:\s*)"

# Drop every parenthesised span from the raw text.
clean_input = re.compile(clean_pattern).sub("", input_text)

# Disabled: strip the leading prefix matched by dialog_pattern from each line.
# comp_dialog = re.compile(dialog_pattern, flags = re.I | re.M)
# cleaner_input = comp_dialog.sub("", clean_input)

In [7]:
# Get dialog lines.

# Per line (re.M): an optional single-word speaker, a colon, optional
# whitespace, then the phrase. findall returns one (speaker, phrase) tuple per
# match; the speaker is "" when the line starts directly with ":".
split_pattern: str = r"^(\w+)?:\s*(.+)"
p_split = re.compile(split_pattern, flags = re.M)
dialog_lines = p_split.findall(clean_input)

# Optional phrase normalisation, currently disabled:
#     tmp = " ".join(re.split(r"\s+", re.sub(r"([^\w]+)", " ", line.lower())))

# Always bind dialog_collection (an empty list when nothing matched) so that
# later cells never hit a NameError on it.
dialog_collection = [
    {"speaker": speaker, "phrase": phrase} for speaker, phrase in dialog_lines
]

In [None]:
# --- Split into sentence chunks and/or just create a single string of all sentences without the fluff.

def enumlist(iterable):
    """Return a dict mapping each item's 0-based position to the item itself."""
    return dict(enumerate(iterable))

# Disabled exploration of the sentence split:
# lines = re.split(r".\n+", cleaner_input)
# print(f"No. lines: {len(lines):,}\n")
# print("\n".join([f"{i}: {line}" for i, line in enumlist(lines[:5]).items()]))


#### Create rows of tokens.

In [None]:
# phrase_collection = []
# for line in lines:
#     tmp = " ".join(re.split(r"\s+", re.sub(r"([^\w]+)", " ", line.lower())))
#     phrase_collection.append({"phrase": tmp})

### Create SparkSession

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# Build (or fetch, via getOrCreate) the session used by the rest of the notebook.
sparkster = (
    SparkSession.builder
    .appName("DialogRecommendr")
    .getOrCreate()
)
# sc = sparkster.sparkContext

#### Create DataFrame

In [None]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame
# Build a DataFrame from the list of {"speaker", "phrase"} dicts.
df = sparkster.createDataFrame(dialog_collection)

# Print the inferred schema. The former `if df:` guard and the bare `type(df)`
# expression were removed: the guard did not test for emptiness, and an
# expression statement nested in a block is never displayed by the notebook.
df.printSchema()

#### Get word counts

In [None]:
# Unfinished word-count sketch, kept for reference (no aggregation is applied yet):
# words = df.select(explode(split(df["phrase"], "\s+")).alias("token"))
# word_count = words.groupBy("token")

# Preview the first 10 rows of the DataFrame.
df.show(n=10)

### Recommender System Using Alternating Least-Squares (ALS)

In [None]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.recommendation.ALS.html

# TODO: Set user and items (no user/item/rating columns are passed to ALS below).

from pyspark.ml.recommendation import ALS

# Configure the ALS estimator with 5 iterations and a fixed seed.
_als = ALS(maxIter=5, regParam=0.01, seed = 13)
# NOTE(review): this clears the regParam that was just set to 0.01 on the line
# above, which presumably makes the estimator fall back to its default value —
# confirm whether that is intended.
_als.clear(_als.regParam)


#### Split data into training and testing
#### Fit ALS model to training data.

In [None]:
# Fraction of rows assigned to the training split; the remainder is the test split.
train_pct: float = 0.8
# Fixed seed so the split can be reproduced across runs; 13 matches the seed
# given to the ALS estimator.
split_seed: int = 13

df_training, df_test = df.randomSplit([train_pct, 1 - train_pct], seed=split_seed)


In [None]:
# Fit the ALS estimator on the training split and keep the fitted model.
# NOTE(review): the ALS cell still carries a TODO for the user/item columns —
# confirm they are configured before running this fit.
_als_mod = _als.fit(dataset=df_training)