# Analisando Orgulho e Preconceito com PySpark

## Instalando o PySpark

In [None]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Start (or reuse) the Spark session that the rest of the notebook runs on.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("CDIA4-22-PySpark")
    .getOrCreate()
)

# Bare expression as the last line so the notebook displays the session.
spark

## Passo #01. Leitura do livro

In [None]:
# Load the book as a DataFrame holding one row per line of the text file.
book = spark.read.format("text").load("/content/pride-and-prejudice.txt")
book.show(n=20)

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|almost no restric...|
|re-use it under t...|
|with this eBook o...|
|                    |
|                    |
|Title: Pride and ...|
|                    |
| Author: Jane Austen|
|                    |
|Posting Date: Aug...|
|Release Date: Jun...|
|Last Updated: Mar...|
|                    |
|   Language: English|
|                    |
|Character set enc...|
|                    |
+--------------------+
only showing top 20 rows



## Passo #02. Tokenização

In [None]:
from pyspark.sql.functions import split

# Break every line into a list of tokens, using a single space as separator.
lines = book.select(split(book["value"], " ").alias("line"))

# Raise the truncation limit so each row shows up to 50 characters
# instead of the default width.
lines.show(10, truncate=50)

+--------------------------------------------------+
|                                              line|
+--------------------------------------------------+
|[The, Project, Gutenberg, EBook, of, Pride, and...|
|                                                []|
|[This, eBook, is, for, the, use, of, anyone, an...|
|[almost, no, restrictions, whatsoever., , You, ...|
|[re-use, it, under, the, terms, of, the, Projec...|
|[with, this, eBook, or, online, at, www.gutenbe...|
|                                                []|
|                                                []|
|                   [Title:, Pride, and, Prejudice]|
|                                                []|
+--------------------------------------------------+
only showing top 10 rows



In [None]:
# Flatten the token lists: explode() emits one row per list element.
from pyspark.sql.functions import explode, col

words = lines.select(explode(lines["line"]).alias("word"))
words.show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



## Passo #03. Limpeza dos dados

In [None]:
# Normalise every token to lower case.
from pyspark.sql.functions import lower

lowered_words = words.select(lower(words["word"]).alias("lowered_word"))
lowered_words.show(15)

+------------+
|lowered_word|
+------------+
|         the|
|     project|
|   gutenberg|
|       ebook|
|          of|
|       pride|
|         and|
|  prejudice,|
|          by|
|        jane|
|      austen|
|            |
|        this|
|       ebook|
|          is|
+------------+
only showing top 15 rows



### Removendo a pontuação

In [None]:
from pyspark.sql.functions import regexp_extract

# Keep only the first run of lowercase letters found in each token.
# Group index 0 selects the whole regex match rather than a capture group.
cleaned_words = lowered_words.select(
    regexp_extract(lowered_words["lowered_word"], "[a-z]+", 0).alias("cleaned_word")
)
cleaned_words.show()

+------------+
|cleaned_word|
+------------+
|         the|
|     project|
|   gutenberg|
|       ebook|
|          of|
|       pride|
|         and|
|   prejudice|
|          by|
|        jane|
|      austen|
|            |
|        this|
|       ebook|
|          is|
|         for|
|         the|
|         use|
|          of|
|      anyone|
+------------+
only showing top 20 rows



### Filtrando as strings vazias

In [1]:
# Discard the empty strings, then rename the column to plain "word".
words_nonull = (
    cleaned_words
    .where(col("cleaned_word") != "")
    .select(col("cleaned_word").alias("word"))
)
words_nonull.show()

NameError: name 'cleaned_words' is not defined

### Contagem das palavras

In [None]:
# Count how many times each distinct word occurs.
counts = words_nonull.groupBy("word").count()
counts.show()

+-------------+-----+
|         word|count|
+-------------+-----+
|       online|    4|
|         some|  209|
|        still|   72|
|          few|   72|
|         hope|  122|
|        those|   60|
|     cautious|    4|
|    imitation|    1|
|          art|    3|
|      solaced|    1|
|       poetry|    2|
|    arguments|    5|
| premeditated|    1|
|      elevate|    1|
|       doubts|    2|
|    destitute|    1|
|    solemnity|    5|
|   lieutenant|    1|
|gratification|    1|
|    connected|   14|
+-------------+-----+
only showing top 20 rows



### Ordenação das contagens

In [None]:
# Show the most frequent words first.
counts.orderBy(col("count").desc()).show()

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
|that| 1582|
|  it| 1542|
| not| 1447|
| you| 1426|
|  he| 1334|
| his| 1267|
|  be| 1259|
|  as| 1191|
| had| 1176|
|with| 1099|
+----+-----+
only showing top 20 rows



# Retomando

## Passo #2.1. Remoção das stop words

In [None]:
# Wrap each word in a list: the words contain no spaces, so splitting on a
# space yields a single-element array per row.
words_lst_df = words_nonull.select(split(col("word"), " ").alias("word"))
words_lst_df.show(10)

+-----------+
|       word|
+-----------+
|      [the]|
|  [project]|
|[gutenberg]|
|    [ebook]|
|       [of]|
|    [pride]|
|      [and]|
|[prejudice]|
|       [by]|
|     [jane]|
+-----------+
only showing top 10 rows



In [None]:
# Remove the stop words from every word list; the filtered lists are written
# to a new "meaningful" column next to the original one.
from pyspark.ml.feature import StopWordsRemover

swr = StopWordsRemover(inputCol="word", outputCol="meaningful")

swr_df = swr.transform(words_lst_df)
swr_df.show()

+-----------+-----------+
|       word| meaningful|
+-----------+-----------+
|      [the]|         []|
|  [project]|  [project]|
|[gutenberg]|[gutenberg]|
|    [ebook]|    [ebook]|
|       [of]|         []|
|    [pride]|    [pride]|
|      [and]|         []|
|[prejudice]|[prejudice]|
|       [by]|         []|
|     [jane]|     [jane]|
|   [austen]|   [austen]|
|     [this]|         []|
|    [ebook]|    [ebook]|
|       [is]|         []|
|      [for]|         []|
|      [the]|         []|
|      [use]|      [use]|
|       [of]|         []|
|   [anyone]|   [anyone]|
| [anywhere]| [anywhere]|
+-----------+-----------+
only showing top 20 rows



In [None]:
# Unwrap the lists back into plain strings by taking element 0; lists that
# were emptied by the stop-word removal have no element 0 and show up as NULL.
meaningful = swr_df.select(col("meaningful").getItem(0).alias("meaningful"))
meaningful.show()

+----------+
|meaningful|
+----------+
|      null|
|   project|
| gutenberg|
|     ebook|
|      null|
|     pride|
|      null|
| prejudice|
|      null|
|      jane|
|    austen|
|      null|
|     ebook|
|      null|
|      null|
|      null|
|       use|
|      null|
|    anyone|
|  anywhere|
+----------+
only showing top 20 rows



In [None]:
# Keep only the rows that still hold a word. isNotNull() is the explicit NULL
# test; comparing against the string "null" would only drop NULLs as a side
# effect of SQL three-valued logic and would also discard a literal word "null".
meaningful_nonull = meaningful.where(
    col("meaningful").isNotNull()
)
meaningful_nonull.show(5)

+----------+
|meaningful|
+----------+
|   project|
| gutenberg|
|     ebook|
|     pride|
| prejudice|
+----------+
only showing top 5 rows



### Recontagem dos valores

In [None]:
# Count the occurrences of each word that survived the stop-word removal.
words_counts = meaningful_nonull.groupBy("meaningful").count()

words_counts.show()

+-------------+-----+
|   meaningful|count|
+-------------+-----+
|       online|    4|
|        still|   72|
|         hope|  122|
|     cautious|    4|
|    imitation|    1|
|          art|    3|
|      solaced|    1|
|       poetry|    2|
|    arguments|    5|
| premeditated|    1|
|      elevate|    1|
|       doubts|    2|
|    destitute|    1|
|    solemnity|    5|
|   lieutenant|    1|
|gratification|    1|
|    connected|   14|
|   likelihood|    5|
|        pools|    1|
|       coaxed|    1|
+-------------+-----+
only showing top 20 rows



In [None]:
# Rank the words from most to least frequent.
words_counts = words_counts.orderBy(col("count").desc())
words_counts.show()


+----------+-----+
|meaningful|count|
+----------+-----+
|        mr|  783|
| elizabeth|  634|
|     darcy|  418|
|      said|  401|
|       mrs|  343|
|      much|  329|
|    bennet|  323|
|      must|  318|
|   bingley|  306|
|      jane|  295|
|      miss|  283|
|       one|  273|
|      know|  239|
|      well|  224|
|    though|  220|
|     never|  220|
|    sister|  218|
|      soon|  216|
|     think|  211|
|       may|  207|
+----------+-----+
only showing top 20 rows

