# NLP - Spark

![https://i.imgur.com/JRU5xR8.png](https://i.imgur.com/JRU5xR8.png)

In [1]:
import bs4
import requests
import pandas as pd
import numpy as np

## 1. Tokenização usando `map ()`, `flatMap()` e `reduceByKey()`

In [4]:
pags = str(np.arange(1,11))
 
texts = []
 
for pag in pags:
    base_url = "https://quotes.toscrape.com/page/"+ pag
    result = requests.get(base_url)
    soup = bs4.BeautifulSoup(result.text)
    for text_ in soup.select('.text'):
        texts.append(text_.text)

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .master("local[4]") \
        .appName("nlp_quotes") \
        .getOrCreate()

In [8]:
sc = spark.sparkContext

In [10]:
textsRDD = sc.text_rdd = sc.parallelize(texts)

In [11]:
textsRDD.take(2)

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”']

In [15]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (797 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.0/797.0 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.9.11
Note: you may need to restart the kernel to use updated packages.


In [33]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [19]:
textsRDD2 = textsRDD.map(lambda x: x.replace('“',' ').strip())
textsRDD3 = textsRDD2.map(lambda x: x.replace('”',' ').strip())
textsRDD3.take(2)

['The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.',
 'It is our choices, Harry, that show what we truly are, far more than our abilities.']

In [20]:
textsRDD4  = textsRDD3.map(lambda x: x.lower())
textsRDD4.take(2)

['the world as we have created it is a process of our thinking. it cannot be changed without changing our thinking.',
 'it is our choices, harry, that show what we truly are, far more than our abilities.']

In [26]:
textsRDD5 = textsRDD4.flatMap(lambda x: x.split(' '))
textsRDD5.take(10)

['the', 'world', 'as', 'we', 'have', 'created', 'it', 'is', 'a', 'process']

In [27]:
textsRDD6 = textsRDD5.map(lambda x: (x, 1))
textsRDD6.take(5)

[('the', 1), ('world', 1), ('as', 1), ('we', 1), ('have', 1)]

In [28]:
textsRDD7 = textsRDD6.reduceByKey(lambda x,y: x+y)
textsRDD7.take(5)

[('world', 5), ('have', 18), ('it', 39), ('of', 49), ('our', 10)]

## 2. Tokenização usando o `tokenize.word_tokenize()`

In [30]:
textsRDD4.take(2)

['the world as we have created it is a process of our thinking. it cannot be changed without changing our thinking.',
 'it is our choices, harry, that show what we truly are, far more than our abilities.']

In [34]:
textsRDD_Token = textsRDD4.map(lambda x: nltk.tokenize.word_tokenize(x))

In [36]:
textsRDD_Token.take(1)

[['the',
  'world',
  'as',
  'we',
  'have',
  'created',
  'it',
  'is',
  'a',
  'process',
  'of',
  'our',
  'thinking',
  '.',
  'it',
  'can',
  'not',
  'be',
  'changed',
  'without',
  'changing',
  'our',
  'thinking',
  '.']]

In [37]:
stopWord = nltk.corpus.stopwords.words('english')

In [38]:
textsRDD_stopWord = textsRDD_Token.flatMap(lambda x: [word for word in x if word not in stopWord])

In [40]:
textsRDD_stopWord.take(5)

['world', 'created', 'process', 'thinking', '.']

In [41]:
textsRDD10 = textsRDD_stopWord.map(lambda x: (x,1))
textsRDD10.take(5)

[('world', 1), ('created', 1), ('process', 1), ('thinking', 1), ('.', 1)]

In [43]:
textsRDD11 = textsRDD10.reduceByKey(lambda x,y: x+y)
textsRDD11.take(5)

[('world', 7), ('thinking', 6), ('.', 173), ('changed', 2), ('changing', 2)]

In [45]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [61]:
lista_specChars = ['...','-','–',"'","n't","'s"]

In [62]:
textsRDD12 = textsRDD11.filter(lambda x: x[0] not in lista_specChars)

In [63]:
textsRDD12.take(5)

[('world', 7), ('thinking', 6), ('.', 173), ('changed', 2), ('changing', 2)]

In [64]:
textsRDD13 = textsRDD12.filter(lambda x: x[0] not in string.punctuation)

In [65]:
textsRDD13.take(5)

[('world', 7), ('thinking', 6), ('changed', 2), ('changing', 2), ('harry', 4)]

In [66]:
textsRDD13.takeOrdered(10,key=lambda x: -x[1])

[('love', 23),
 ('one', 15),
 ('life', 12),
 ('never', 12),
 ('think', 12),
 ('like', 11),
 ('make', 10),
 ('good', 9),
 ('know', 9),
 ('without', 8)]