# Q1. Form Tokenisation

#### 1. Sentence Segmentation

In [30]:
# Import sentence tokenizer from nltk
from nltk.tokenize import sent_tokenize

In [31]:
# Load the raw text of Data_1.txt.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding.
with open('Data_1.txt', encoding='utf-8') as f:
    data_1 = f.read()

In [32]:
# Break the raw text into individual sentences with the nltk sentence tokenizer
sentences = sent_tokenize(data_1)

# Show the sentences separated by a blank line (2 new line characters between each)
print("\n\n".join(sentences))

Sentiment analysis is "contextual mining of text which identifies and extracts subjective information" in source material, and helping a business to understand the social sentiment of their brand, product or service while monitoring online conversations.

However, analysis of social media streams is usually restricted to just basic sentiment analysis and count based metrics.

This is akin to just scratching the surface and missing out on those high value insights that are waiting to be discovered.

So what should a brand do to capture that low hanging fruit?


#### 2. Word Tokenisation

In [33]:
# Load the raw text of Data_1.txt.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding.
with open('Data_1.txt', encoding='utf-8') as f:
    data_1 = f.read()

#### 2a. Split Function

In [34]:
# Naive tokenisation: cut the text wherever whitespace occurs.
# Punctuation stays attached to the neighbouring word.
tokens = data_1.split(None)
print(tokens)

['Sentiment', 'analysis', 'is', '"contextual', 'mining', 'of', 'text', 'which', 'identifies', 'and', 'extracts', 'subjective', 'information"', 'in', 'source', 'material,', 'and', 'helping', 'a', 'business', 'to', 'understand', 'the', 'social', 'sentiment', 'of', 'their', 'brand,', 'product', 'or', 'service', 'while', 'monitoring', 'online', 'conversations.', 'However,', 'analysis', 'of', 'social', 'media', 'streams', 'is', 'usually', 'restricted', 'to', 'just', 'basic', 'sentiment', 'analysis', 'and', 'count', 'based', 'metrics.', 'This', 'is', 'akin', 'to', 'just', 'scratching', 'the', 'surface', 'and', 'missing', 'out', 'on', 'those', 'high', 'value', 'insights', 'that', 'are', 'waiting', 'to', 'be', 'discovered.', 'So', 'what', 'should', 'a', 'brand', 'do', 'to', 'capture', 'that', 'low', 'hanging', 'fruit?']


#### 2b. Regular Expression

In [35]:
# Import regular expression module
import re

# Find every maximal run of word characters in the text; punctuation and
# whitespace are dropped.
# The pattern is a raw string so that "\w" reaches the regex engine as written
# instead of being read as an (invalid) Python string escape, which raises a
# warning on recent Python versions.
tokens = re.findall(r"\w+", data_1)
print(tokens)

['Sentiment', 'analysis', 'is', 'contextual', 'mining', 'of', 'text', 'which', 'identifies', 'and', 'extracts', 'subjective', 'information', 'in', 'source', 'material', 'and', 'helping', 'a', 'business', 'to', 'understand', 'the', 'social', 'sentiment', 'of', 'their', 'brand', 'product', 'or', 'service', 'while', 'monitoring', 'online', 'conversations', 'However', 'analysis', 'of', 'social', 'media', 'streams', 'is', 'usually', 'restricted', 'to', 'just', 'basic', 'sentiment', 'analysis', 'and', 'count', 'based', 'metrics', 'This', 'is', 'akin', 'to', 'just', 'scratching', 'the', 'surface', 'and', 'missing', 'out', 'on', 'those', 'high', 'value', 'insights', 'that', 'are', 'waiting', 'to', 'be', 'discovered', 'So', 'what', 'should', 'a', 'brand', 'do', 'to', 'capture', 'that', 'low', 'hanging', 'fruit']


#### 2c. NLTK

In [36]:
# Import word tokenizer from nltk
from nltk.tokenize import word_tokenize

# Segment text into words using nltk word tokenizer.
# Unlike the whitespace split above, punctuation marks come out as separate tokens.
words = word_tokenize(data_1)
print(words)

['Sentiment', 'analysis', 'is', '``', 'contextual', 'mining', 'of', 'text', 'which', 'identifies', 'and', 'extracts', 'subjective', 'information', "''", 'in', 'source', 'material', ',', 'and', 'helping', 'a', 'business', 'to', 'understand', 'the', 'social', 'sentiment', 'of', 'their', 'brand', ',', 'product', 'or', 'service', 'while', 'monitoring', 'online', 'conversations', '.', 'However', ',', 'analysis', 'of', 'social', 'media', 'streams', 'is', 'usually', 'restricted', 'to', 'just', 'basic', 'sentiment', 'analysis', 'and', 'count', 'based', 'metrics', '.', 'This', 'is', 'akin', 'to', 'just', 'scratching', 'the', 'surface', 'and', 'missing', 'out', 'on', 'those', 'high', 'value', 'insights', 'that', 'are', 'waiting', 'to', 'be', 'discovered', '.', 'So', 'what', 'should', 'a', 'brand', 'do', 'to', 'capture', 'that', 'low', 'hanging', 'fruit', '?']


# Q2. Form Word Stemming

In [37]:
# Load the raw text of Data_1.txt
with open('Data_1.txt') as f:
    data_1 = f.read()

# Stemmer implementations provided by nltk
from nltk.stem import PorterStemmer, RegexpStemmer, LancasterStemmer, SnowballStemmer
# Word-level tokenizer provided by nltk
from nltk.tokenize import word_tokenize

# Tokenise the text once; every stemmer below is applied to the same token list
words = word_tokenize(data_1)
print("Before stemming:\n", words)
print()

# Build the four stemmers up front.
# The regex stemmer is configured with the suffix alternatives to strip and min=4.
regexp = RegexpStemmer('ing$|s$|e$|able$|ed$|ly$|al$|ive$|ations?$', min=4)
porter = PorterStemmer()
snowball = SnowballStemmer("english")
lancaster = LancasterStemmer()

# Stem every token with each stemmer in turn and print the result under its heading
for heading, stemmer in (
    ("After Regex stemming:\n", regexp),
    ("After Porter stemming:\n", porter),
    ("After Snowball stemming:\n", snowball),
    ("After Lancaster stemming:\n", lancaster),
):
    print(heading, [stemmer.stem(w) for w in words])
    print()


Before stemming:
 ['Sentiment', 'analysis', 'is', '``', 'contextual', 'mining', 'of', 'text', 'which', 'identifies', 'and', 'extracts', 'subjective', 'information', "''", 'in', 'source', 'material', ',', 'and', 'helping', 'a', 'business', 'to', 'understand', 'the', 'social', 'sentiment', 'of', 'their', 'brand', ',', 'product', 'or', 'service', 'while', 'monitoring', 'online', 'conversations', '.', 'However', ',', 'analysis', 'of', 'social', 'media', 'streams', 'is', 'usually', 'restricted', 'to', 'just', 'basic', 'sentiment', 'analysis', 'and', 'count', 'based', 'metrics', '.', 'This', 'is', 'akin', 'to', 'just', 'scratching', 'the', 'surface', 'and', 'missing', 'out', 'on', 'those', 'high', 'value', 'insights', 'that', 'are', 'waiting', 'to', 'be', 'discovered', '.', 'So', 'what', 'should', 'a', 'brand', 'do', 'to', 'capture', 'that', 'low', 'hanging', 'fruit', '?']

After Regex stemming:
 ['Sentiment', 'analysi', 'is', '``', 'contextu', 'min', 'of', 'text', 'which', 'identifie', 'and

# Q3. Filter stop words and punctuation

#### 1. Remove stopwords and punctuation

In [38]:
# Text from Data_1.txt (explicit encoding so the read does not depend on the
# platform's default locale encoding)
with open('Data_1.txt', encoding='utf-8') as f:
    data_1 = f.read()

# Import stopwords from nltk
from nltk.corpus import stopwords
# Import word tokenizer from nltk
from nltk.tokenize import word_tokenize

# Remove punctuation: the translation table deletes every character
# listed in string.punctuation
import string
data_1 = data_1.translate(str.maketrans("", "", string.punctuation))

# Segment text into words using nltk word tokenizer
tokens = word_tokenize(data_1)

# English stopwords (kept as a list because the next cell references stopwords_list)
stopwords_list = stopwords.words('english')
# Set copy of the same words for constant-time membership tests in the filter below
stopwords_set = set(stopwords_list)

# Remove stopwords; the comparison uses the lower-cased token so capitalised
# stopwords are filtered out as well, while kept tokens retain their original case
content = [word for word in tokens if word.lower() not in stopwords_set]
print("Filtered text corpus:")
print(content)

Filtered text corpus:
['Sentiment', 'analysis', 'contextual', 'mining', 'text', 'identifies', 'extracts', 'subjective', 'information', 'source', 'material', 'helping', 'business', 'understand', 'social', 'sentiment', 'brand', 'product', 'service', 'monitoring', 'online', 'conversations', 'However', 'analysis', 'social', 'media', 'streams', 'usually', 'restricted', 'basic', 'sentiment', 'analysis', 'count', 'based', 'metrics', 'akin', 'scratching', 'surface', 'missing', 'high', 'value', 'insights', 'waiting', 'discovered', 'brand', 'capture', 'low', 'hanging', 'fruit']


#### 2. Stopwords found

In [39]:
# Collect, in order of appearance, every token whose lower-cased form is a stopword
stopwords_found = [
    token
    for token in tokens
    if token.lower() in stopwords_list
]
print("Found stopwords: ")
print(stopwords_found)

Found stopwords: 
['is', 'of', 'which', 'and', 'in', 'and', 'a', 'to', 'the', 'of', 'their', 'or', 'while', 'of', 'is', 'to', 'just', 'and', 'This', 'is', 'to', 'just', 'the', 'and', 'out', 'on', 'those', 'that', 'are', 'to', 'be', 'So', 'what', 'should', 'a', 'do', 'to', 'that']


# Q4. Form Parts of Speech (POS) taggers & Syntactic Analysers

#### 1. POS Tagging

In [40]:
# Load the raw text of Data_2.txt and show it.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding.
with open('Data_2.txt', encoding='utf-8') as f:
    data_2 = f.read()
print(data_2)

A videogame or computergame is an electronic-game that involves interaction with a user interface or input device


In [41]:
# Modules used for cleaning, tokenising and POS tagging
import re
import nltk
from nltk import word_tokenize, pos_tag

# Fetch the nltk resources requested by this notebook
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PAVILION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PAVILION\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [42]:
# Replace every character that is not an ASCII letter with a space
# (digits and hyphens are replaced too, not only punctuation)
letters_only = re.sub("[^a-zA-Z]", " ", data_2)
# Collapse the resulting runs of whitespace into single spaces
clean_words = " ".join(letters_only.split())
# Tokenise the cleaned text
tokens = word_tokenize(clean_words)
print(tokens)

['A', 'videogame', 'or', 'computergame', 'is', 'an', 'electronic', 'game', 'that', 'involves', 'interaction', 'with', 'a', 'user', 'interface', 'or', 'input', 'device']


In [43]:
# Conduct POS tagging using NLTK POS tagger.
# tagged_tokens is reused further down for the tagger comparison and for chunking.
tagged_tokens = pos_tag(tokens) 
print(tagged_tokens)

[('A', 'DT'), ('videogame', 'NN'), ('or', 'CC'), ('computergame', 'NN'), ('is', 'VBZ'), ('an', 'DT'), ('electronic', 'JJ'), ('game', 'NN'), ('that', 'WDT'), ('involves', 'VBZ'), ('interaction', 'NN'), ('with', 'IN'), ('a', 'DT'), ('user', 'JJ'), ('interface', 'NN'), ('or', 'CC'), ('input', 'NN'), ('device', 'NN')]


In [44]:
# POS regular expression patterns for nltk.RegexpTagger.
# Patterns are tried from top to bottom and the first match wins, so the
# specific rules must stay above the catch-all noun rule at the end.
# Matching is anchored at the start of the token only, so every rule carries
# its own trailing `$` to force a whole-token match.
patterns = [
    (r'(The|the|A|a|An|an)$', 'DT'),                # determiner
    (r'(That|that|Which|which)$', 'WDT'),           # wh-determiner
    (r'(And|and|But|but|Or|or)$', 'CC'),            # coordinating conjunction
    (r'(At|at|In|in|On|on|With|with)$', 'IN'),      # preposition
    (r'.*ic$', 'JJ'),                               # adjectives
    (r'.*ing$', 'VBG'),                             # gerunds
    (r'.*ed$', 'VBD'),                              # simple past
    # `$` applies to all three alternatives; without it any token that merely
    # starts with "is"/"Is" (e.g. "island") would be tagged as a verb
    (r'(.*es|Is|is)$', 'VBZ'),                      # 3rd singular present
    (r'.*en$', 'VBN'),                              # past participle
    (r'.*ould$', 'MD'),                             # modals
    (r'.*\'s$', 'NN$'),                             # possessive nouns
    (r'.*s$', 'NNS'),                               # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),               # cardinal numbers
    (r'.*', 'NN')                                   # nouns (default)
]

In [45]:
# Conduct POS tagging using Regular Expression tagger
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tokens = regexp_tagger.tag(tokens)

# Compare NLTK POS Tagger and Regular Expression tagger results side by side.
# The header and the rows share one format string so that each column heading
# sits exactly above its column.
row_format = '{:20}{:20}{:20}'
print(row_format.format('', 'NLTK POS', 'Regexp POS'))
for (word, nltk_tag), regexp_result in zip(tagged_tokens, regexp_tokens):
    # Both taggers ran over the same token list, so the word is taken from
    # the first result and only the tag from the second
    print(row_format.format(word, nltk_tag, regexp_result[1]))

                    NLTK POS           Regexp POS
A                   DT                  DT                  
videogame           NN                  NN                  
or                  CC                  CC                  
computergame        NN                  NN                  
is                  VBZ                 VBZ                 
an                  DT                  DT                  
electronic          JJ                  JJ                  
game                NN                  NN                  
that                WDT                 WDT                 
involves            VBZ                 VBZ                 
interaction         NN                  NN                  
with                IN                  IN                  
a                   DT                  DT                  
user                JJ                  NN                  
interface           NN                  NN                  
or                  CC             

#### 4. Parse Tree

In [46]:
# Construct parse trees
# Chunk grammar over POS tags. The rules are written bottom-up: simple noun
# phrases, coordinated noun phrases, prepositions and verbs first, then the
# PP and VP rules that are built from those chunks.
# NOTE(review): rule order appears to matter (later rules use earlier chunk
# labels) — confirm against the nltk RegexpParser docs before reordering.
chunker = nltk.RegexpParser("""
NP: {<DT>?<JJ>*<NN>+} # Extract noun phrases
NP: {<NP> <CC> <NP>} # Extract conjuncted noun
P: {<IN>}            # Extract prepositions
V: {<V.*>}           # Extract verbs
PP: {<P> <NP>}       # Extract prepositional phrases
VP: {<V> <NP|PP>*}   # Extract verb phrases
""")

In [47]:
# Parse the tagged tokens (the NLTK pos_tag output) with the chunk grammar
output = chunker.parse(tagged_tokens)

# Print the resulting tree in bracketed text form
print(output)

(S
  (NP (NP A/DT videogame/NN) or/CC (NP computergame/NN))
  (VP (V is/VBZ) (NP an/DT electronic/JJ game/NN))
  that/WDT
  (VP
    (V involves/VBZ)
    (NP interaction/NN)
    (PP
      (P with/IN)
      (NP
        (NP a/DT user/JJ interface/NN)
        or/CC
        (NP input/NN device/NN)))))


In [48]:
# Visualise the parse tree produced by the chunker.
# NOTE(review): draw() presumably opens a separate GUI window rather than
# rendering inline — confirm it works in the target environment.
output.draw()