In [None]:
%%html
<style>
/* Narrative text cells: serif type on a pale-yellow panel. */
.h1_cell, .just_text {
    box-sizing: border-box;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px;
    text-indent: 25px;
    background-color: #fbfbea;
    /* One shorthand for all four sides. The former padding-top/padding-bottom
       of 5px were always overridden by this later declaration. */
    padding: 10px;
}
/* Smaller type for code samples embedded in the narrative. */
.code_block {
    box-sizing: border-box;
    font-size: 75%;
    line-height: 22px;
    /* Disabled declarations kept for reference. They were previously prefixed
       with '#', which is not CSS comment syntax and only worked because the
       browser discards invalid declarations.
       text-indent: 25px;
       background-color: #fbfbea; */
    /* One shorthand for all four sides; the former 5px padding-top and
       padding-bottom were redundant with it. */
    padding: 5px;
}

/* Horizontal rule: centred block with a 2px inset border. */
hr {
    display: block;
    /* 0.5em above and below, auto left and right. */
    margin: 0.5em auto;
    border-width: 2px;
    border-style: inset;
}
</style>

<h2>
<center>
Building a Bag of Words with Spark
</center>
</h2>

<div class=h1_cell>
<p>
This notebook assumes you have already run and understood the code in setup_spark.ipynb. Let's connect to our already installed Spark cluster.
</div>

In [2]:
import os
import sys
import subprocess

In [3]:
def configure_spark_env(spark_home=None):
    """Point this process at a local Spark install and make pyspark importable.

    Sets ``SPARK_HOME``, adds ``<spark_home>/bin`` to ``PATH``, and adds
    Spark's Python sources and its bundled py4j zip to ``sys.path``.

    The function is idempotent: an entry that is already present is not
    appended again, so re-running the cell does not keep growing ``PATH``
    or ``sys.path``.

    Args:
        spark_home: Spark installation directory. Defaults to ``$HOME/spark``.

    Returns:
        The Spark home directory that was configured.

    Raises:
        KeyError: if ``spark_home`` is omitted and ``HOME`` is not set.
    """
    if spark_home is None:
        spark_home = os.environ['HOME'] + '/spark'
    os.environ['SPARK_HOME'] = spark_home

    bin_dir = spark_home + '/bin'
    current_path = os.environ.get('PATH', '')
    if bin_dir not in current_path.split(os.pathsep):
        # Avoid a leading separator when PATH is empty or unset.
        os.environ['PATH'] = (
            current_path + os.pathsep + bin_dir if current_path else bin_dir
        )

    python_entries = (
        spark_home + '/python',
        # The py4j version is hard-coded in the file name; it has to match
        # the zip shipped under <spark_home>/python/lib.
        spark_home + '/python/lib/py4j-0.10.6-src.zip',
    )
    for entry in python_entries:
        if entry not in sys.path:
            sys.path.append(entry)

    return spark_home


configure_spark_env()

In [4]:
# Launch Spark's sbin/start-all.sh with the environment configured above.
# The call is the cell's last expression, so the script's exit status is
# shown as the cell output.
subprocess.call(
    os.environ['SPARK_HOME'] + "/sbin/start-all.sh",
    env=os.environ,
)

0

<div class=h1_cell>
<p>
Let's first configure the Spark environment and start the Spark JVM application. We can then set some of Spark's cluster settings like <i>spark.executor.memory</i>, which controls how much RAM a Spark worker process gets. Finally, we can connect to the Spark app and get our Spark session object.
</div>

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Executor sizing: 2 cores and 8g of memory per executor.
conf = SparkConf()
conf.set('spark.executor.cores', 2)
conf.set('spark.executor.memory', '8g')

# Connect to the standalone master and wrap the context in a session.
# NOTE(review): the master URL is hard-coded to host "instance-6"; adjust it
# when running against a different cluster.
spark = SparkSession(
    SparkContext(master='spark://instance-6:7077', conf=conf)
)
spark

<h3>
<center>
Why Spark?
</center>
</h3>

<div class=h1_cell>
<p>
Building a bag of words requires us to parse every sentence in each essay for every row, identifying unique words and adding them with the correct class label. We'll call this function F. We can speed F up by parallelizing it, i.e. using multithreading. While this may satisfy a need for processing power, it is common to also require large amounts of memory and disk space to apply F or another function to a large dataset, creating a large bag of words.
<p>
Spark is excellent for this reason. Spark virtualizes both the compute and memory resources of a group of computers (a cluster) and allows users to use the cluster through a single 'spark' session object.
<p>
This notebook is a walkthrough of using Spark to apply F (bag-of-words) to a dataset.
<ol>
    <li>We'll load the dataset into a pandas dataframe.</li>
    <li>We'll distribute this dataset across our spark cluster by passing the dataframe to spark.</li>
    <li>We'll register F with Spark, allowing Spark to call F on each of its worker nodes.</li>
    <li>We'll use spark to apply F to the distributed dataset.</li>
</ol>
<p>
The result will be a single table of unique words (the joining of the distributed tables) and their class label counts, summarizing the vocabulary used between two different sources.
</div>

In [None]:
import pandas as pd
# Read the training CSV straight from its Dropbox URL into a pandas
# DataFrame. This needs network access.
url = 'https://www.dropbox.com/s/2hdbltrl8bh6kbu/train.csv?raw=1'
donate_table = pd.read_csv(filepath_or_buffer=url, encoding='utf-8')

In [10]:
# Keep only the two essays, the title and the approval label, then preview
# the first five rows.
donate_table = donate_table.loc[:, [
    'project_essay_1',
    'project_essay_2',
    'project_title',
    'project_is_approved',
]]
donate_table.head()

Unnamed: 0,project_essay_1,project_essay_2,project_title,project_is_approved
0,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,Super Sight Word Centers,1
1,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,Keep Calm and Dance On,0
2,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,Lets 3Doodle to Learn,1
3,My students are the greatest students but are ...,"The student's project which is totally \""kid-i...","\""Kid Inspired\"" Equipment to Increase Activit...",0
4,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,We need clean water for our culinary arts class!,1


In [None]:
# Baseline: build the bag of words over the whole table in a plain Python
# loop, timing how long it takes.
#
# NOTE(review): sentence_wrangler, swords and legals are not defined in any
# cell visible above this one. They must already be in the kernel namespace,
# otherwise this cell raises NameError on a fresh "Restart & Run All".
# Confirm where they come from.

import time

start = time.time()

# word -> [count in rows where project_is_approved == 0,
#          count in rows where project_is_approved == 1]
bag_of_words = {}

for i, row in donate_table.iterrows():
    label = row['project_is_approved']

    # Both essays are wrangled one sentence at a time (split on '.'); the
    # title is wrangled as a single piece of text. Processing order is
    # essay 1, essay 2, then title.
    text_pieces = row['project_essay_1'].split('.')
    text_pieces += row['project_essay_2'].split('.')
    text_pieces.append(row['project_title'])

    for text in text_pieces:
        # Presumably element 0 of sentence_wrangler's result is the list of
        # cleaned words -- TODO confirm against its definition.
        for word in sentence_wrangler(text, swords, legals)[0]:
            # First sighting of a word starts both label counters at zero.
            bag_of_words.setdefault(word, [0, 0])[label] += 1

    # Progress marker, printed whenever the row index is a multiple of 4000.
    if i % 4000 == 0:
        print('4000 more')

end = time.time()
print(end - start)  # roughly 6 minutes