In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [None]:
# Serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create (or reuse) the context and the session. getOrCreate keeps this cell
# re-runnable: constructing a second SparkContext while one is still alive
# in the kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [None]:
# Bare expression: the notebook renders the SparkSession created above.
spark

### Retomemos el ejemplo de las obras de Shakespeare, pero ahora usando Spark:

In [None]:
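# Optional, Google Colab only (currently disabled): download pg100.txt from Google Drive.
# NOTE(review): this saves to 'pg100.txt', while the read cell below loads
# './data/pg100.txt' - confirm the path before re-enabling.
# from pydrive.auth import GoogleAuth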
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

# id='1SE6k_0YukzGd5wK-E4i6mG83nydlfvSa'
# downloaded = drive.CreateFile({'id': id})
# downloaded.GetContentFile('pg100.txt')

### Leemos el archivo `pg100.txt` y creamos un RDD usando SparkContext:

In [None]:
# Load the text file through the SparkContext as an RDD of lines.
pg100 = sc.textFile('./data/pg100.txt')

### **Para la casa:** Entender la diferencia entre SparkContext y SparkSession

In [None]:
# Peek at the first 10 raw lines to check the load.
pg100.take(10)

### Creamos una función que "limpia" cada línea: se eliminan los signos de puntuación y los espacios en blanco al inicio y al final. Adicionalmente, convertimos el texto a minúsculas

In [None]:
# Translation table that deletes ASCII punctuation. Built once at definition
# time rather than on every call, since clean_text runs once per RDD line.
_PUNCT_TABLE = str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-')


def clean_text(line: str) -> str:
    """Normalize one line of text for word counting.

    Lowercases the line, deletes every ASCII punctuation character (so
    "o'er" becomes "oer" and "well-known" becomes "wellknown"), and strips
    leading/trailing whitespace. Interior whitespace is left untouched.

    Args:
        line: Raw line of text.

    Returns:
        The cleaned line; may be empty.
    """
    return line.lower().translate(_PUNCT_TABLE).strip()

In [None]:
# Apply clean_text to every line. This rebinds pg100, so re-running the cell
# cleans already-cleaned lines again; that is harmless because cleaning a
# cleaned line returns the same text.
pg100 = pg100.map(clean_text)

In [None]:
# Preview the first 10 cleaned lines.
pg100.take(10)

In [None]:
# Split every cleaned line on whitespace and flatten the pieces into a
# single RDD of words.
all_words = pg100.flatMap(
    lambda text: text.split()
)

In [None]:
# Preview the first 20 words.
all_words.take(20)

In [None]:
# Pair every word with a count of 1, ready to be summed per word.
pg100_count = all_words.map(
    lambda token: (token, 1)
)

In [None]:
# Add up the counts of each word, then order the (word, total) pairs by word.
pg100_rbk = (
    pg100_count
    .reduceByKey(lambda left, right: left + right)
    .sortByKey()
)

In [None]:
# Mark the word-count RDD for caching; several actions below read from it.
pg100_rbk.persist()

In [None]:
# First 10 (word, count) pairs, in word order.
pg100_rbk.take(10)

### Truco para encontrar las palabras con más ocurrencias:

In [None]:
# Flip each pair to (count, word) so the count becomes the sort key, sort in
# descending order, and keep the first 10.
(
    pg100_rbk
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .take(10)
)

In [None]:
import nltk

In [None]:
# Fetch the NLTK 'stopwords' corpus that the next cell reads.
nltk.download('stopwords')

In [None]:
# Import the corpus under an alias so that `stopwords` only ever names the
# word list, instead of first the NLTK corpus object and then a list.
from nltk.corpus import stopwords as stopwords_corpus

# English stopword list, used below to filter the word counts.
stopwords = stopwords_corpus.words('english')
stopwords

### Dejamos solo las palabras que no sean stopwords:

In [None]:
# Keep only the (word, count) pairs whose word is not a stopword.
# Membership is tested against a set: the predicate runs once per distinct
# word, and `in` on a list scans the whole list each time.
# NOTE(review): clean_text removes apostrophes, so any stopword that contains
# one can never match a cleaned word - confirm whether that matters here.
stopword_set = set(stopwords)
pg100_rbk = pg100_rbk.filter(lambda pair: pair[0] not in stopword_set)

In [None]:
# Same swap-and-sort as before, now on the stopword-free counts: the 20 most
# frequent words as (count, word) pairs.
(
    pg100_rbk
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .take(20)
)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Collect the filtered counts into a local {word: count} dict, with the
# counts converted to floats.
word_dict = {
    word: float(total)
    for word, total in pg100_rbk.collect()
}

In [None]:
# Build a 3000x1000 word cloud from the word frequencies.
wc = WordCloud(width=3000, height=1000)
wc = wc.generate_from_frequencies(word_dict)

In [None]:
# Draw the word cloud on a 30x10 figure with the axes hidden.
fig, ax = plt.subplots(figsize=(30, 10))
ax.imshow(wc)
ax.axis("off")
# Explicit show() so the cell renders only the figure; previously the value
# returned by axis("off") was the cell's last expression and got echoed.
plt.show()