In [1]:
%%html
<!-- Notebook-wide styles for the prose cells (.h1_cell / .just_text),
     code listings (.code_block) and horizontal rules, followed by the title. -->
<style>
.h1_cell, .just_text {
    box-sizing: border-box;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px; /* 5px +12px + 5px */
    text-indent: 25px;
    background-color: #fbfbea;
    /* The shorthand sets all four sides, so separate padding-top /
       padding-bottom declarations before it would have no effect. */
    padding: 10px;
}
.code_block {
    box-sizing: border-box;
    font-size: 75%;
    line-height: 22px; /* 5px +12px + 5px */
    /* Disabled declarations ('#' is not a CSS comment marker): */
    /* text-indent: 25px; */
    /* background-color: #fbfbea; */
    padding: 5px;
}

hr { 
    display: block;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: auto;
    margin-right: auto;
    border-style: inset;
    border-width: 2px;
}
</style>

<!-- text-align replaces the deprecated <center> element -->
<h2 style="text-align: center;">
Latent Semantic Analysis
</h2>

In [1]:
import os
import sys
import subprocess

In [2]:
# Point this kernel at the Spark install under $HOME/spark: export SPARK_HOME,
# put Spark's bin directory on PATH and make the bundled pyspark / py4j
# sources importable.
spark_home = os.environ['HOME'] + '/spark'
os.environ['SPARK_HOME'] = spark_home

# Only append entries that are missing, so re-running this cell does not keep
# growing PATH and sys.path with duplicates.
spark_bin = spark_home + '/bin'
if spark_bin not in os.environ['PATH'].split(':'):
    os.environ['PATH'] += ':' + spark_bin

for spark_lib in (spark_home + '/python',
                  spark_home + '/python/lib/py4j-0.10.6-src.zip'):
    if spark_lib not in sys.path:
        sys.path.append(spark_lib)

In [4]:
# Run Spark's sbin/start-all.sh with this process's environment; the script's
# exit status is the cell output.
start_script = os.environ['SPARK_HOME'] + "/sbin/start-all.sh"
subprocess.call(start_script, env=os.environ)

0

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Connect to the standalone master at spark://blue0:7077.
# getOrCreate() reuses an already-running session/context, so re-running this
# cell does not fail the way constructing a second SparkContext would.
spark = SparkSession.builder.master('spark://blue0:7077').getOrCreate()
spark

<div class=h1_cell>
<p>
Using Latent Semantic Analysis (LSA), we can look at all the words in a tweet dataset and determine which words may be related or grouped together, usually by topic.
<p>
Spark doesn't handle CSV files very well, so we'll read the data into pandas and convert it to a Spark DataFrame. On Raspberry Pis, you have to use the Linux package manager to install pandas. When we transition to Google Cloud, we'll change this.
</div>

In [4]:
# Install pandas, then load the tweet dataset from the published spreadsheet.
ON_RASPBERRY_PI = True  # set to False when not running on a Raspberry Pi
if ON_RASPBERRY_PI:
    # -y pre-answers apt-get's confirmation prompt, which cannot be answered
    # from inside a notebook.
    subprocess.call(['sudo', 'apt-get', 'install', '-y', 'python-pandas'])
else:
    # Go through the kernel's own interpreter so pandas is installed into the
    # environment this notebook actually runs in.
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'pandas'])

# Imported here rather than at the top because pandas may only exist after
# the install step above.
import pandas as pd
tweets = pd.read_csv("https://docs.google.com/spreadsheets/d/1tw90jUqTQoRt-RNOqNWronMN46y7dxb2ciQwj1YFsTo/export?format=csv")
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
# Full text of the first tweet (head() truncates long strings)
tweets['tweet'].iloc[0]

'@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

<div class=h1_cell>
<p>
We need to take each tweet, break it down into words, and remove any unhelpful words (stopwords).
</div>

In [6]:
# Install NLTK from inside the notebook (shell escape).
!pip install nltk

from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk import download
from string import punctuation

# Shared tokenizer instance used by sentence_wrangler below.
word_punct_tokenizer = WordPunctTokenizer()
# The stopwords corpus has to be downloaded before stopwords.words() is called.
download('stopwords')
# English stopword list; tokens found in it are dropped by sentence_wrangler.
swords = stopwords.words('english')

def sentence_wrangler(sentence):
    """Split a sentence into kept and discarded tokens.

    The sentence is lower-cased and tokenized with ``word_punct_tokenizer``.
    A token is discarded when it appears in ``swords`` or when any of its
    characters is in ``string.punctuation``; every other token is kept.

    Returns:
        A ``(kept, discarded)`` tuple of lists, each in token order.
    """
    kept, discarded = [], []
    for token in word_punct_tokenizer.tokenize(sentence.lower()):
        is_stopword = token in swords
        if is_stopword or any(ch in punctuation for ch in token):
            discarded.append(token)
        else:
            kept.append(token)
    return kept, discarded

[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading package stopwords to /home/pi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<div class=h1_cell>
<p>
Create a set of all unique words. Then represent each tweet as a RowVector of words.
<p>
For example, the vector representing "Clippers down 2 points at the second half #NBAFinals" would have a 1 in each of the columns of the words in the tweet.
</div>

In [7]:
# Get tweets with no special characters (ascii)
bag = set()
sentences = []
for i in range(len(tweets.index[:10])):
    t = tweets.iloc[i]['tweet']
    try:
        t.encode('ascii')
        words = set(sentence_wrangler(t)[0])
        for word in words:
            if word not in bag:
                bag.add(word)
        sentences.append(words)
    except UnicodeDecodeError:
        pass
bag = frozenset(bag)
print(sentences[0])

set(['selfish', 'kids', 'run', 'father', 'drags', 'user', 'dysfunctional', 'dysfunction'])


In [8]:
# Occurrence matrix: one row per tweet, one column per vocabulary word,
# 1 where the tweet contains the word and 0 otherwise.
all_words = list(bag)
# Build all rows first and create the frame once; growing a DataFrame row by
# row is quadratic and DataFrame.append no longer exists in pandas 2.x.
occurrence_rows = [
    [1 if word in sentence else 0 for word in all_words]
    for sentence in sentences
]
occur_matrix = pd.DataFrame(occurrence_rows, columns=all_words)
occur_matrix.head()

Unnamed: 0,motivation,lyft,majesty,getthanked,society,thanks,dysfunction,selfish,use,credit,...,user,bihday,kids,talking,wheelchair,pdx,gr8,leave,huge,drags
0,0,0,0,0,0,0,1,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1,0,1,0,1,0,1,0,0,1,1,...,1,0,0,0,1,1,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


<div class=h1_cell>
<p>
Each row is a tweet, with 1's in the columns of words that were in the tweet.
</div>

<div class=h1_cell>
<p>
Now create a co-occurrence matrix that counts how many times two words appear together in the same tweet. Multiply the transpose of the occurrence matrix by the occurrence matrix to get this matrix.
</div>

In [9]:
import numpy as np

# (words x tweets) . (tweets x words) -> word-by-word co-occurrence counts
cooccurrence = occur_matrix.T.dot(occur_matrix)

# Zero the diagonal (a word paired with itself) on an explicit copy of the
# data: writing through DataFrame.values only works when pandas happens to
# return a view, which is not guaranteed.
counts = np.array(cooccurrence.values)
np.fill_diagonal(counts, 0)
comatrix = pd.DataFrame(counts, index=cooccurrence.index, columns=cooccurrence.columns)
comatrix.head()

Unnamed: 0,motivation,lyft,majesty,getthanked,society,thanks,dysfunction,selfish,use,credit,...,user,bihday,kids,talking,wheelchair,pdx,gr8,leave,huge,drags
motivation,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lyft,0,0,0,1,0,1,0,0,1,1,...,1,0,0,0,1,1,0,0,0,0
majesty,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
getthanked,0,1,0,0,0,1,0,0,1,1,...,1,0,0,0,1,1,0,0,0,0
society,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<div class=h1_cell>
<p>
Now, this matrix is incredibly sparse and takes up a lot of space. By decomposing it with Singular Value Decomposition (SVD), we can keep only the K largest singular values, i.e. the K most significant directions (groups of related words) in the dataset, and rebuild the co-occurrence matrix from just those components. This significantly reduces the size of the matrix but retains the K strongest word associations in the data.
</div>

In [10]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.linalg.distributed import RowMatrix

# Distribute the co-occurrence matrix as one DenseVector per row.
rows = spark.sparkContext.parallelize([DenseVector(row) for row in comatrix.values.tolist()])
row_matrix = RowMatrix(rows)

# Keep half as many singular values as there are words. Floor division keeps
# k an integer on Python 3 as well, where `/` would produce a float.
num_singular_values = len(comatrix.index) // 2
svd = row_matrix.computeSVD(num_singular_values, computeU=True)
svd.s

DenseVector([11.147, 11.0, 6.7199, 2.3323, 2.0, 1.4653, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

<div class=h1_cell>
<p>
SVD decomposed our co-occurrence matrix C => U\*s\*VT, where U and VT are matrices and s is a vector of singular values. We told Spark to compute only half of the decomposition's singular values, but look at how the values draw closer to 1.0 as we approach the halfway mark. This is a good indication that we selected a good cutoff.
<p>
We can now remove all the 1.0's from our vector, and recompose a reduced matrix.
</div>

In [15]:
# Keep only the singular values that do not round to 1.0 at three decimals.
reduced_s = DenseVector(
    [sigma for sigma in svd.s.values.tolist() if round(sigma, 3) != 1.0]
)
reduced_s

DenseVector([11.147, 11.0, 6.7199, 2.3323, 2.0, 1.4653])

In [15]:
# Stop the SparkSession held in `spark`.
spark.stop()

In [16]:
# Run Spark's sbin/stop-all.sh; the script's exit status is the cell output.
stop_script = os.environ['SPARK_HOME'] + "/sbin/stop-all.sh"
subprocess.call(stop_script)

0