# DSCI 417 - Homework 02

**Malcolm M Nichols**

In [0]:
from pyspark.sql import SparkSession
from string import punctuation
import pandas as pd
import re

In [0]:
# Obtain a SparkSession (an existing one is reused if present, otherwise one
# is created) and keep a handle on its SparkContext for the RDD API below.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

## Problem 1: Word Count

In [0]:
# Read the text file as an RDD with one element per line.
ws_lines = sc.textFile('/FileStore/tables/shakespeare_complete.txt')

# Tokenise each line and normalise the tokens. The pattern is a raw string so
# that `\s` reaches the regex engine as written instead of relying on Python
# passing an unrecognised string escape through (which emits a warning on
# newer interpreters). `\s` already matches tabs, so no separate `\t` is needed.
ws_words = (
    ws_lines
    .flatMap(lambda x : re.split(r'[-_.,:|\s]', x))     # split on these chars and any whitespace
    .map(lambda x : x.strip(punctuation))               # strip punctuation from both ends
    .map(lambda x : x.strip('0123456789'))              # strip digits from both ends
    .map(lambda x : x.replace("'", ''))                 # remove apostrophes
    .map(lambda x : x.lower())                          # convert to lower case
    .filter(lambda x : x != '')                         # remove empty strings
)
# ws_words is reused by several later actions, so keep it cached once computed.
ws_words.persist()

dist_words = ws_words.distinct()

print('Total Number of Words:    ', ws_words.count())
print('Number of Distinct Words: ', dist_words.count())

In [0]:
# Peek at a small random sample of the cleaned words. The seed is fixed so the
# same sample is drawn on every run of the notebook; the value itself is arbitrary.
words_sample = ws_words.sample(withReplacement=False, fraction=0.0001, seed=1)
print(words_sample.collect())

## Problem 2: Longest Words

In [0]:
def comp_word(str1, str2):
    """Return whichever of two words ranks higher by length, then by string order.

    The longer word wins. When both words have the same length, the one that
    compares greater lexicographically wins. ``None`` stands for "no word":
    if either argument is ``None`` the other argument is returned unchanged.

    Args:
        str1: First word, or None.
        str2: Second word, or None.

    Returns:
        The winning word (or None only when both arguments are None).
    """
    if str1 is None:
        return str2
    if str2 is None:
        return str1
    # Compare on (length, text) so ties in length fall back to string order.
    # This always returns one of the two inputs; the earlier branch chain fell
    # through and returned None when the lengths matched and str1 > str2.
    return max(str1, str2, key=lambda word: (len(word), word))

# Fold the distinct words pairwise with comp_word, leaving the single word
# that comp_word selects over all the others.
long_word = dist_words.reduce(comp_word)
print(long_word)

In [0]:
# Order the distinct words from longest to shortest and show the first 20.
by_length_desc = dist_words.sortBy(lambda word : len(word), ascending=False)
print(by_length_desc.take(20))

## Problem 3: Word Frequency

In [0]:
# Pair every word with a 1 so occurrences can be summed per word.
pairs = ws_words.map(lambda word : (word, 1))

# Sum the 1's for each word, then order the (word, count) pairs by count,
# largest first.
word_totals = pairs.reduceByKey(lambda a, b : a + b)
word_counts = word_totals.sortBy(lambda kv : kv[1], ascending=False)

# Bring the 20 most frequent words to the driver and show them as a table.
words_list = word_counts.take(20)
df = pd.DataFrame(words_list, columns = ['word', 'count'])
df

Unnamed: 0,word,count
0,the,27379
1,and,26082
2,i,20717
3,to,19661
4,of,17474
5,a,14723
6,you,13630
7,my,12489
8,in,10996
9,that,10915


## Problem 4: Removing Stop Words

In [0]:
# Load the stop word file as an RDD with one element per line
# (presumably one stop word per line -- confirm against the file).
sw_rdd = sc.textFile('/FileStore/tables/stopwords.txt')
print(sw_rdd.count())
# Fixed seed so the same sample is printed on every run; the value is arbitrary.
print(sw_rdd.sample(withReplacement=False, fraction=0.05, seed=1).collect())
# Pull every element to the driver as a plain Python list.
sw = sw_rdd.collect()

In [0]:
# `sw` is a list, so `x not in sw` would scan the whole list for every word.
# Looking words up in a set gives the same answer with average constant-time
# membership tests.
sw_set = set(sw)
ws_words_f = ws_words.filter(lambda x : x not in sw_set)
dist_words_f = ws_words_f.distinct()
print('Number of Distinct Non-Stop Words: ', dist_words_f.count())

In [0]:
# Pair every remaining (non-stop) word with a 1 so occurrences can be summed.
pairs_f = ws_words_f.map(lambda word : (word, 1))

# Sum the 1's for each word, then order the (word, count) pairs by count,
# largest first.
word_totals_f = pairs_f.reduceByKey(lambda a, b : a + b)
word_counts_f = word_totals_f.sortBy(lambda kv : kv[1], ascending=False)

# Bring the 20 most frequent words to the driver and show them as a table.
words_list_f = word_counts_f.take(20)
df_f = pd.DataFrame(words_list_f, columns = ['word', 'count'])
df_f

Unnamed: 0,word,count
0,will,4977
1,thy,4034
2,thee,3180
3,lord,3062
4,king,2871
5,good,2834
6,sir,2763
7,well,2553
8,enter,2350
9,love,2109


## Problem 5: Diamonds Dataset

In [0]:
# Read the diamonds file as an RDD with one element per line, then report how
# many lines it holds (this count includes every line of the file).
diamonds_raw = sc.textFile('/FileStore/tables/diamonds.txt')
print(diamonds_raw.count())

In [0]:
# Show the first five raw lines exactly as they appear in the file.
first_rows = diamonds_raw.take(5)
for row in first_rows:
    print(row)

In [0]:
# The column names are the tab-separated fields of the first line of the file.
first_line = diamonds_raw.take(1)[0]
header = first_line.split('\t')

def process_row(row):
    """Parse one tab-separated line into a list of ten typed values.

    Fields 0, 4, 5, 7, 8 and 9 are converted to float, field 6 to int, and
    fields 1-3 are kept as strings. Any fields beyond the tenth are ignored.

    Args:
        row: A single tab-delimited line of text.

    Returns:
        A list of ten values in the original field order.

    Raises:
        IndexError: If the line has fewer than ten fields.
        ValueError: If a numeric field cannot be converted.
    """
    # One converter per field position; str() leaves the text fields unchanged.
    casts = (float, str, str, str, float, float, int, float, float, float)
    fields = row.split('\t')
    return [cast(fields[idx]) for idx, cast in enumerate(casts)]

# Drop any line containing the text 'carat' (this removes the header row),
# then parse each remaining line into a typed record.
data_lines = diamonds_raw.filter(lambda line : 'carat' not in line)
diamonds = data_lines.map(process_row)

# Show the first five parsed records.
parsed_preview = diamonds.take(5)
for row in parsed_preview:
    print(row)

## Problem 6: Grouped Means

In [0]:
# Key each record by its cut (index 1) with a (carat, price, 1) triple, then
# add the triples per cut to get the carat sum, price sum and row count.
cut_totals = (
    diamonds
    .map(lambda rec : (rec[1], (rec[0], rec[6], 1)))
    .reduceByKey(lambda a, b : (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
)

# Turn the totals into (cut, count, mean carat, mean price), with both means
# rounded to two decimal places, and bring the result to the driver.
cut_summary = cut_totals.map(
    lambda kv : (kv[0], kv[1][2], round(kv[1][0]/kv[1][2], 2), round(kv[1][1]/kv[1][2], 2))
).collect()

cut_df = pd.DataFrame(cut_summary, columns = ['Cut', 'Count', 'Mean_Carat', 'Mean_Price'])
cut_df

Unnamed: 0,Cut,Count,Mean_Carat,Mean_Price
0,Premium,13791,0.89,4584.26
1,Good,4906,0.85,3928.86
2,Very Good,12082,0.81,3981.76
3,Fair,1610,1.05,4358.76
4,Ideal,21551,0.7,3457.54
