Taller 1: Jorge Octavio Florez Caro

# PySpark Part of Speech (POS) analysis
Text taken from [Reuters](https://www.reuters.com/business/finance/banks-beware-outsiders-are-cracking-code-finance-2021-09-17/).

In [1]:
# Import nltk for POS tagging
import nltk

# Import re to work with regular expression patterns
import re

# Import SparkContext
from pyspark import SparkContext

In [2]:
# Download the "punkt" package (Punkt sentence tokenizer)
nltk.download("punkt")

# Download the "averaged_perceptron_tagger" package (POS tagger)
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SKYNET\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SKYNET\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Create the SparkContext, the entry point for working with RDDs
sc = SparkContext(appName="pyspark-pos-analysis")

In [4]:
# Load the text file into an RDD
rdd_reuters = sc.textFile("./data/reuters.txt")

# Show the loaded rows
rdd_reuters.collect()

['Banks beware, Amazon and Walmart are cracking the code for finance',
 '',
 'LONDON, Sept 17 (Reuters) - Anyone can be a banker these days, you just need the right code.',
 '',
 'Global brands from Mercedes and Amazon (AMZN.O) to IKEA and Walmart (WMT.N) are cutting out the traditional financial middleman and plugging in software from tech startups to offer customers everything from banking and credit to insurance.',
 '',
 '',
 'So-called embedded finance - a fancy term for companies integrating software to offer financial services - means Amazon can let customers "buy now pay later" when they check out and Mercedes drivers can get their cars to pay for their fuel.',
 '',
 'To be sure, banks are still behind most of the transactions but investors and analysts say the risk for traditional lenders is that they will get pushed further away from the front end of the finance chain.',
 '',
 "And that means they'll be further away from the mountains of data others are hoovering up about the 

In [5]:
# Delete every character that is not an ASCII letter, a digit or whitespace,
# then convert the remaining text to lower case.
# Note: this step was added because punctuation characters remained when tokenizing.
rdd_removedPunct = rdd_reuters.map(
    lambda line: re.sub(r'[^a-zA-Z0-9\s]', "", line).lower()
)

# Display the cleaned rows
rdd_removedPunct.collect()

['banks beware amazon and walmart are cracking the code for finance',
 '',
 'london sept 17 reuters  anyone can be a banker these days you just need the right code',
 '',
 'global brands from mercedes and amazon amzno to ikea and walmart wmtn are cutting out the traditional financial middleman and plugging in software from tech startups to offer customers everything from banking and credit to insurance',
 '',
 '',
 'socalled embedded finance  a fancy term for companies integrating software to offer financial services  means amazon can let customers buy now pay later when they check out and mercedes drivers can get their cars to pay for their fuel',
 '',
 'to be sure banks are still behind most of the transactions but investors and analysts say the risk for traditional lenders is that they will get pushed further away from the front end of the finance chain',
 '',
 'and that means theyll be further away from the mountains of data others are hoovering up about the preferences and behavio

In [6]:
# Break every row into words on single spaces and flatten into one RDD of strings
rdd_splitSpace = rdd_removedPunct.flatMap(lambda line: line.split(" "))

# Display the resulting words
rdd_splitSpace.collect()

['banks',
 'beware',
 'amazon',
 'and',
 'walmart',
 'are',
 'cracking',
 'the',
 'code',
 'for',
 'finance',
 '',
 'london',
 'sept',
 '17',
 'reuters',
 '',
 'anyone',
 'can',
 'be',
 'a',
 'banker',
 'these',
 'days',
 'you',
 'just',
 'need',
 'the',
 'right',
 'code',
 '',
 'global',
 'brands',
 'from',
 'mercedes',
 'and',
 'amazon',
 'amzno',
 'to',
 'ikea',
 'and',
 'walmart',
 'wmtn',
 'are',
 'cutting',
 'out',
 'the',
 'traditional',
 'financial',
 'middleman',
 'and',
 'plugging',
 'in',
 'software',
 'from',
 'tech',
 'startups',
 'to',
 'offer',
 'customers',
 'everything',
 'from',
 'banking',
 'and',
 'credit',
 'to',
 'insurance',
 '',
 'for',
 'established',
 'financial',
 'institutions',
 'the',
 'signs',
 'are',
 'flashing',
 '',
 'socalled',
 'embedded',
 'finance',
 '',
 'a',
 'fancy',
 'term',
 'for',
 'companies',
 'integrating',
 'software',
 'to',
 'offer',
 'financial',
 'services',
 '',
 'means',
 'amazon',
 'can',
 'let',
 'customers',
 'buy',
 'now',
 'pay

In [7]:
# Keep only the non-empty strings
rdd_removedEmpty = rdd_splitSpace.filter(lambda word: len(word) > 0)

# Display the remaining words
rdd_removedEmpty.collect()

['banks',
 'beware',
 'amazon',
 'and',
 'walmart',
 'are',
 'cracking',
 'the',
 'code',
 'for',
 'finance',
 'london',
 'sept',
 '17',
 'reuters',
 'anyone',
 'can',
 'be',
 'a',
 'banker',
 'these',
 'days',
 'you',
 'just',
 'need',
 'the',
 'right',
 'code',
 'global',
 'brands',
 'from',
 'mercedes',
 'and',
 'amazon',
 'amzno',
 'to',
 'ikea',
 'and',
 'walmart',
 'wmtn',
 'are',
 'cutting',
 'out',
 'the',
 'traditional',
 'financial',
 'middleman',
 'and',
 'plugging',
 'in',
 'software',
 'from',
 'tech',
 'startups',
 'to',
 'offer',
 'customers',
 'everything',
 'from',
 'banking',
 'and',
 'credit',
 'to',
 'insurance',
 'for',
 'established',
 'financial',
 'institutions',
 'the',
 'signs',
 'are',
 'flashing',
 'socalled',
 'embedded',
 'finance',
 'a',
 'fancy',
 'term',
 'for',
 'companies',
 'integrating',
 'software',
 'to',
 'offer',
 'financial',
 'services',
 'means',
 'amazon',
 'can',
 'let',
 'customers',
 'buy',
 'now',
 'pay',
 'later',
 'when',
 'they',
 'ch

In [8]:
# Tokenize every row with NLTK; each row becomes a list of tokens
rdd_tokenText = rdd_removedEmpty.map(nltk.word_tokenize)

# Display the tokenized rows
rdd_tokenText.collect()

[['banks'],
 ['beware'],
 ['amazon'],
 ['and'],
 ['walmart'],
 ['are'],
 ['cracking'],
 ['the'],
 ['code'],
 ['for'],
 ['finance'],
 ['london'],
 ['sept'],
 ['17'],
 ['reuters'],
 ['anyone'],
 ['can'],
 ['be'],
 ['a'],
 ['banker'],
 ['these'],
 ['days'],
 ['you'],
 ['just'],
 ['need'],
 ['the'],
 ['right'],
 ['code'],
 ['global'],
 ['brands'],
 ['from'],
 ['mercedes'],
 ['and'],
 ['amazon'],
 ['amzno'],
 ['to'],
 ['ikea'],
 ['and'],
 ['walmart'],
 ['wmtn'],
 ['are'],
 ['cutting'],
 ['out'],
 ['the'],
 ['traditional'],
 ['financial'],
 ['middleman'],
 ['and'],
 ['plugging'],
 ['in'],
 ['software'],
 ['from'],
 ['tech'],
 ['startups'],
 ['to'],
 ['offer'],
 ['customers'],
 ['everything'],
 ['from'],
 ['banking'],
 ['and'],
 ['credit'],
 ['to'],
 ['insurance'],
 ['for'],
 ['established'],
 ['financial'],
 ['institutions'],
 ['the'],
 ['signs'],
 ['are'],
 ['flashing'],
 ['socalled'],
 ['embedded'],
 ['finance'],
 ['a'],
 ['fancy'],
 ['term'],
 ['for'],
 ['companies'],
 ['integrating'],
 [

In [9]:
# Count the rows of the tokenized RDD
rdd_tokenText.count()

1229

In [10]:
# Tag every token list with its part of speech (POS).
# NOTE(review): the tokenized rows hold one word each, so the tagger is given
# no neighbouring words -- confirm that per-word tagging is intended.
rdd_posText = rdd_tokenText.map(nltk.pos_tag)

# Display the tagged rows
rdd_posText.collect()

[[('banks', 'NNS')],
 [('beware', 'NN')],
 [('amazon', 'NN')],
 [('and', 'CC')],
 [('walmart', 'NN')],
 [('are', 'VBP')],
 [('cracking', 'VBG')],
 [('the', 'DT')],
 [('code', 'NN')],
 [('for', 'IN')],
 [('finance', 'NN')],
 [('london', 'NN')],
 [('sept', 'NN')],
 [('17', 'CD')],
 [('reuters', 'NNS')],
 [('anyone', 'NN')],
 [('can', 'MD')],
 [('be', 'VB')],
 [('a', 'DT')],
 [('banker', 'NN')],
 [('these', 'DT')],
 [('days', 'NNS')],
 [('you', 'PRP')],
 [('just', 'RB')],
 [('need', 'NN')],
 [('the', 'DT')],
 [('right', 'NN')],
 [('code', 'NN')],
 [('global', 'JJ')],
 [('brands', 'NNS')],
 [('from', 'IN')],
 [('mercedes', 'NNS')],
 [('and', 'CC')],
 [('amazon', 'NN')],
 [('amzno', 'NN')],
 [('to', 'TO')],
 [('ikea', 'NN')],
 [('and', 'CC')],
 [('walmart', 'NN')],
 [('wmtn', 'NN')],
 [('are', 'VBP')],
 [('cutting', 'VBG')],
 [('out', 'IN')],
 [('the', 'DT')],
 [('traditional', 'JJ')],
 [('financial', 'JJ')],
 [('middleman', 'NN')],
 [('and', 'CC')],
 [('plugging', 'VBG')],
 [('in', 'IN')],

In [11]:
# Count how many tokens received each POS tag.
# flatMap emits one (tag, 1) pair for every (token, tag) tuple in a row, so a
# row holding several tokens is counted in full and an empty row simply
# contributes nothing (indexing only the first element would skip the extra
# tokens and fail on an empty row).
rdd_countPos = (
    rdd_posText
    .flatMap(lambda row: [(tag, 1) for _, tag in row])
    .reduceByKey(lambda x, y: x + y)
)

# Showing the result of process
rdd_countPos.collect()

[('NNS', 154),
 ('CC', 43),
 ('VBP', 12),
 ('CD', 35),
 ('PRP', 39),
 ('RB', 46),
 ('TO', 43),
 ('WRB', 8),
 ('PRP$', 17),
 ('JJS', 4),
 ('VBZ', 25),
 ('VBD', 27),
 ('WP', 1),
 ('NN', 349),
 ('VBG', 29),
 ('DT', 78),
 ('IN', 159),
 ('MD', 18),
 ('VB', 41),
 ('JJ', 59),
 ('VBN', 32),
 ('WDT', 4),
 ('RBR', 6)]

In [12]:
# Stop the SparkContext now that the analysis is finished
sc.stop()