### Reading text data

In [0]:
# Load the reviews CSV; the first row of the file supplies the column names.
# No schema is supplied and schema inference is not enabled here.
reviews_csv_path = "/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/Reviews.csv"
df = spark.read.csv(reviews_csv_path, header=True)

In [0]:
# Print the column names and types of the freshly loaded DataFrame.
df.printSchema()

In [0]:
# Preview the first 10 rows without shortening long cell values.
df.show(n=10, truncate=False)

In [0]:
from pyspark.sql.functions import regexp_replace, split

In [0]:
# Two-pass cleanup of the "Text" column:
#   1. drop every character that is not an ASCII letter or a space;
#   2. squeeze each run of two or more spaces down to a single space.
letters_and_spaces = regexp_replace("Text", "[^a-zA-Z ]", "")
single_spaced = regexp_replace(letters_and_spaces, "  +", " ")
df_clean = df.withColumn("Text", single_spaced)

df_clean.show()

In [0]:
# Break the cleaned text into an array of words, splitting on runs of whitespace.
word_array = split("Text", "\\s+")
df_with_words = df_clean.withColumn("words", word_array)
df_with_words.show()

In [0]:
from pyspark.ml.feature import Tokenizer

# Alternative to split(): Spark ML's Tokenizer reads "Text" and writes "words".
# This rebinds df_with_words, replacing the split()-based frame from the previous cell.
tokenizer = Tokenizer().setInputCol('Text').setOutputCol('words')
df_with_words = tokenizer.transform(df_clean)
df_with_words.show()

In [0]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of "words" into a new "filtered_words" column.
# No stopWords list is passed, so the transformer's default list is used.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
df_stop_words_removed = remover.transform(df_with_words)

df_stop_words_removed.show()

In [0]:
# explode must be imported before use; the earlier functions import only
# brings in regexp_replace and split, so this cell imports it itself.
from pyspark.sql.functions import explode

# Flatten the filtered token arrays: one output row per token, in column "word".
df_exploded = (df_stop_words_removed
               .select(explode(df_stop_words_removed.filtered_words).alias("word")))

# Count how often each token occurs, most frequent first.
word_count = (df_exploded
              .groupBy("word")
              .count()
              .orderBy("count", ascending=False))

# Show the 100 most frequent tokens.
word_count.show(n=100)

In [0]:
from pyspark.ml.feature import CountVectorizer

# Fit a CountVectorizer on the filtered tokens, then use the fitted model to
# add a "features" column to the same DataFrame.
vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
vectorizer_model = vectorizer.fit(df_stop_words_removed)
vectorized_data = vectorizer_model.transform(df_stop_words_removed)
vectorized_data.show(n=10, truncate=False)

In [0]:
# Collapse to a single partition, then write as JSON, replacing any existing output.
# NOTE(review): the output path is relative while the input was read from an
# absolute /Volumes path — confirm where this resolves in the target environment.
single_partition = vectorized_data.repartition(1)
single_partition.write.json("../data/data_lake/reviews_vectorized.json", mode="overwrite")

### Using the `regexp_extract()` function

In [0]:
from pyspark.sql.functions import regexp_extract

# Extract the first word starting with "q" from each review's text.
# regexp_extract returns only the first match (group index 0 = the whole match)
# and an empty string when nothing matches; it does not collect every match.
# The regex engine must receive \bq\w* (word boundary, "q", then word characters).
# Escaping the backslashes twice would make it search for literal backslash
# characters instead, which never occur in the cleaned text.
df_q_words = (vectorized_data
              .withColumn("q_words", regexp_extract("Text", "\\bq\\w*", 0)))
df_q_words.show()

### Using the `rlike()` function

In [0]:
# expr must be imported before use; none of the earlier functions imports
# bring it in, so this cell imports it itself.
from pyspark.sql.functions import expr

# Flag each review whose text contains "good".
# rlike performs a regex search, so this is a case-sensitive substring match
# ("goods" matches, "Good" does not).
df_good_word = (vectorized_data
              .withColumn("contains_good", expr("Text rlike 'good'")))
df_good_word.show()

### Customizing Stop Words

In [0]:
# Swap the default stop-word list for a custom one, passed to the constructor.
# NOTE(review): df_with_words was tokenized from df_clean, whose text was
# reduced to letters and spaces, so entries containing "/", ">", "<" or "-"
# presumably never match any token — confirm whether the raw text was meant
# to be tokenized for this example.
custom_stopwords = ["/><br", "-", "/>I","/>The"]
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=custom_stopwords,
)

# Rebinds df_stop_words_removed with the custom-filtered result.
df_stop_words_removed = stopwords_remover.transform(df_with_words)
df_stop_words_removed.show()

In [0]:
# Same customization, this time through the setter instead of the constructor
# argument. The list is assigned once; passing stopWords= to the constructor
# and then calling setStopWords() with the same list would set it twice.
custom_stopwords = ["/><br", "-", "/>I","/>The"]

stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
stopwords_remover.setStopWords(custom_stopwords)

# Rebinds df_stop_words_removed with the custom-filtered result.
df_stop_words_removed = stopwords_remover.transform(df_with_words)

df_stop_words_removed.show()