# NLP

### Imported text (feel free to use your own)

In [42]:
# Sample passage used throughout the notebook (swap in any text you like).
# Adjacent string literals are joined by Python, so this is still one single string.
text = (
    "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. "
    "The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. "
    "Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. "
    "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."
)
text

"Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

# Segmentation

In [43]:
import nltk
from nltk.tokenize import sent_tokenize

# Break the passage into a list of sentence strings (English sentence tokenizer)
sentences = sent_tokenize(text, language="english")
sentences

['Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry.',
 'The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066.',
 'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace.',
 "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."]

In [44]:
# Show the first sentence on its own
sentences[0]

'Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry.'

# Tokenization

In [48]:
import re

# Replace every character that is not a letter or digit with a single space.
# [\W_] is the Unicode-aware equivalent of the ASCII-only class [^a-zA-Z0-9]:
# the result is identical for ASCII input, but accented and non-Latin letters
# (e.g. the "é" in "café") are kept instead of being blanked out.
# NOTE: this rebinds `text` from the full passage to the cleaned first sentence.
text = re.sub(r"[\W_]", " ", sentences[0])
text

'Millions of people across the UK and beyond have celebrated the coronation of King Charles III   a symbolic ceremony combining a religious service and pageantry '

In [9]:
from nltk.tokenize import word_tokenize

# Split the current `text` into word tokens and print the resulting list
words = word_tokenize(text, language="english")
print(words)

['Millions', 'of', 'people', 'across', 'the', 'UK', 'and', 'beyond', 'have', 'celebrated', 'the', 'coronation', 'of', 'King', 'Charles', 'III', 'a', 'symbolic', 'ceremony', 'combining', 'a', 'religious', 'service', 'and', 'pageantry']


# Removal of stop words

In [10]:
from nltk.corpus import stopwords

# Build the stop-word lookup once. Calling stopwords.words() inside the
# comprehension would produce the whole list again for every token and scan it
# linearly; a set gives constant-time membership tests.
stop_words = set(stopwords.words("english"))

# NLTK's English stop-word list is all lower-case, so compare on the lower-cased
# token; otherwise capitalised stop words such as "The" or "Here" slip through.
# The original casing of the tokens that are kept is preserved.
words = [w for w in words if w.lower() not in stop_words]
print(words)

['Millions', 'people', 'across', 'UK', 'beyond', 'celebrated', 'coronation', 'King', 'Charles', 'III', 'symbolic', 'ceremony', 'combining', 'religious', 'service', 'pageantry']


In [11]:
# Print NLTK's English stop-word list for reference
english_stopwords = stopwords.words("english")
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Stemming and Lemmatization

In [12]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems.
# One stemmer is created up front and reused, rather than constructing a new
# PorterStemmer for every word.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['million', 'peopl', 'across', 'uk', 'beyond', 'celebr', 'coron', 'king', 'charl', 'iii', 'symbol', 'ceremoni', 'combin', 'religi', 'servic', 'pageantri']


In [13]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form.
# A single lemmatizer is reused for all words instead of building one per word.
# lemmatize() is called without a part of speech, so NLTK falls back to its
# default (noun); verb forms such as "celebrated" are therefore left unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words]
print(lemmatized)

['Millions', 'people', 'across', 'UK', 'beyond', 'celebrated', 'coronation', 'King', 'Charles', 'III', 'symbolic', 'ceremony', 'combining', 'religious', 'service', 'pageantry']


In [14]:
# Better representation of stemming and lemmatization example
words2 = ['wait','waiting','studies','studying','computers']

# Stemming: reduce words to their stems.
# The stemmer is created once and reused for every word.
# NOTE: this rebinds `stemmed` to the result for words2.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words2]
print("Stemming output: {}".format(stemmed))

# Lemmatization: reduce words to their root forms.
# The lemmatizer is likewise created once and reused.
# NOTE: this rebinds `lemmatized` to the result for words2.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words2]
print("Lemmatization output: {}".format(lemmatized))

Stemming output: ['wait', 'wait', 'studi', 'studi', 'comput']
Lemmatization output: ['wait', 'waiting', 'study', 'studying', 'computer']


# Part-of-speech tagging

In [15]:
from nltk import pos_tag

# Label every token in `words` with its part-of-speech tag and display the pairs.
# Note: `words` has already had stop words removed by this point.
tagged_words = pos_tag(words)
tagged_words

[('Millions', 'NNS'),
 ('people', 'NNS'),
 ('across', 'IN'),
 ('UK', 'NNP'),
 ('beyond', 'IN'),
 ('celebrated', 'VBN'),
 ('coronation', 'NN'),
 ('King', 'NNP'),
 ('Charles', 'NNP'),
 ('III', 'NNP'),
 ('symbolic', 'JJ'),
 ('ceremony', 'NN'),
 ('combining', 'VBG'),
 ('religious', 'JJ'),
 ('service', 'NN'),
 ('pageantry', 'NN')]

### All tags for words: