# Question 1

Extract the contents of C-DAC wikipedia page.

Tokenize using NLTK and SpaCy (both word and sentences)

Display word and sentences counts in both cases.

In [1]:
import urllib3
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Centre_for_Development_of_Advanced_Computing"

http = urllib3.PoolManager()

response = http.request('GET', url)

soup = BeautifulSoup(response.data, 'html.parser')

text = soup.get_text()

with open('cdac_article.txt', 'w', encoding='utf-8') as f:
    f.write(text)

print("Article saved to cdac_article.txt")


Article saved to cdac_article.txt


In [2]:
# Tokenize using NLTK
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [3]:
with open('cdac_article.txt', encoding='utf-8') as f:
    text = f.read()


tokens_nltk = nltk.word_tokenize(text)

sentences_nltk = nltk.sent_tokenize(text)

print("Number of tokens: ", len(tokens_nltk))
print("Number of sentences: ", len(sentences_nltk))

Number of tokens:  3555
Number of sentences:  248


In [4]:
# Tokenize using spaCy

import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

nlp.add_pipe('sentencizer')

with open('cdac_article.txt', encoding='utf-8') as f:
    text = f.read()

doc = nlp(text)

tokens_spacy = [token.text for token in doc]

sentences_spacy = [sent.text for sent in doc.sents]

print("Number of tokens: ", len(tokens_spacy))
print("Number of sentences: ", len(sentences_spacy))

Number of tokens:  3840
Number of sentences:  222


# Question 2

Use text from https://en.wikipedia.org/wiki/Geoffrey_Hinton

Perform text cleaning. All five steps.

Display final text.

Use both NLTK and spaCy.

In [5]:
import urllib3
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Geoffrey_Hinton"

http = urllib3.PoolManager()

response = http.request('GET', url)

soup = BeautifulSoup(response.data, 'html.parser')

text = soup.get_text()

with open('Geoffrey_Hinton_article.txt', 'w', encoding='utf-8') as f:
    f.write(text)

print("Article saved to Geoffrey_Hinton_article.txt")

Article saved to Geoffrey_Hinton_article.txt


In [6]:
# Cleaning using NLTK

import string
import nltk
nltk.download('punkt')


with open('cdac_article.txt', encoding='utf-8') as f:
    text = f.read()

text= text.replace("\n", " ")

for punct in string.punctuation:

    text = text.replace(punct, " ")

print("Text without Punctuation  : \n")
print(text)

Text without Punctuation  : 

    Centre for Development of Advanced Computing   Wikipedia                                    Jump to content        Main menu      Main menu move to sidebar hide    		Navigation 	   Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us      		Contribute 	   HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages                    Search            Search                       Appearance                 Donate  Create account  Log in         Personal tools      Donate Create account Log in      		Pages for logged out editors learn more    ContributionsTalk                             Contents move to sidebar hide      Top       1 History         2 Research activities         3 Centres         4 Education and training         5 Commercialization         6 Products and developments         7 Notable researchers and alumnus         8 Notable awards and accolades         9 Other projects         10 See also         11 Refere

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
text = text.lower()

print("Text in Lower Case : \n")
print(text)

Text in Lower Case : 

    centre for development of advanced computing   wikipedia                                    jump to content        main menu      main menu move to sidebar hide    		navigation 	   main pagecontentscurrent eventsrandom articleabout wikipediacontact us      		contribute 	   helplearn to editcommunity portalrecent changesupload filespecial pages                    search            search                       appearance                 donate  create account  log in         personal tools      donate create account log in      		pages for logged out editors learn more    contributionstalk                             contents move to sidebar hide      top       1 history         2 research activities         3 centres         4 education and training         5 commercialization         6 products and developments         7 notable researchers and alumnus         8 notable awards and accolades         9 other projects         10 see also         11 references   

In [8]:
tokenized_text = word_tokenize(text)

print("Tokenized Text : \n")
print(tokenized_text)

Tokenized Text : 

['centre', 'for', 'development', 'of', 'advanced', 'computing', 'wikipedia', 'jump', 'to', 'content', 'main', 'menu', 'main', 'menu', 'move', 'to', 'sidebar', 'hide', 'navigation', 'main', 'pagecontentscurrent', 'eventsrandom', 'articleabout', 'wikipediacontact', 'us', 'contribute', 'helplearn', 'to', 'editcommunity', 'portalrecent', 'changesupload', 'filespecial', 'pages', 'search', 'search', 'appearance', 'donate', 'create', 'account', 'log', 'in', 'personal', 'tools', 'donate', 'create', 'account', 'log', 'in', 'pages', 'for', 'logged', 'out', 'editors', 'learn', 'more', 'contributionstalk', 'contents', 'move', 'to', 'sidebar', 'hide', 'top', '1', 'history', '2', 'research', 'activities', '3', 'centres', '4', 'education', 'and', 'training', '5', 'commercialization', '6', 'products', 'and', 'developments', '7', 'notable', 'researchers', 'and', 'alumnus', '8', 'notable', 'awards', 'and', 'accolades', '9', 'other', 'projects', '10', 'see', 'also', '11', 'references',

In [9]:
print("\nFirst 10 tokens : \n")

print(tokenized_text[:10])


First 10 tokens : 

['centre', 'for', 'development', 'of', 'advanced', 'computing', 'wikipedia', 'jump', 'to', 'content']


In [10]:
from pprint import pprint

from nltk.probability import FreqDist

fdist = FreqDist(tokenized_text)

top_20_tokens = fdist.most_common(20)

pprint(top_20_tokens)


[('the', 92),
 ('of', 85),
 ('and', 57),
 ('c', 52),
 ('in', 49),
 ('dac', 47),
 ('india', 44),
 ('for', 36),
 ('a', 30),
 ('retrieved', 28),
 ('computing', 27),
 ('to', 26),
 ('on', 22),
 ('centre', 20),
 ('from', 20),
 ('september', 18),
 ('s', 17),
 ('development', 16),
 ('technology', 14),
 ('e', 14)]


In [11]:
# Cleaning using spaCy

import spacy
import re

nlp = spacy.load("en_core_web_sm")

with open('Geoffrey_Hinton_article.txt', encoding='utf-8') as f:
    text = f.read()

text = text.replace("\n", " ")

text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', text)

text = text.lower()

doc = nlp(text)

tokenized_text = [token.text for token in doc if not token.is_stop and token.text.strip()]

print("Processed Tokens:", tokenized_text)

Processed Tokens: ['geoffrey', 'hinton', 'wikipedia', 'jump', 'content', 'main', 'menu', 'main', 'menu', 'sidebar', 'hide', 'navigation', 'main', 'pagecontentscurrent', 'eventsrandom', 'articleabout', 'wikipediacontact', 'contribute', 'helplearn', 'editcommunity', 'portalrecent', 'changesupload', 'filespecial', 'pages', 'search', 'search', 'appearance', 'donate', 'create', 'account', 'log', 'personal', 'tools', 'donate', 'create', 'account', 'log', 'pages', 'logged', 'editors', 'learn', 'contributionstalk', 'contents', 'sidebar', 'hide', '1', 'education', '2', 'career', 'research', 'toggle', 'career', 'research', 'subsection', '2', '1', 'honours', 'awards', '3', 'views', 'toggle', 'views', 'subsection', '3', '1', 'risks', 'artificial', 'intelligence', '3', '1', '1', 'existential', 'risk', 'agi', '3', '1', '2', 'catastrophic', 'misuse', '3', '1', '3', 'economic', 'impacts', '3', '2', 'politics', '4', 'personal', 'life', '5', 'references', '6', 'reading', 'toggle', 'table', 'contents', '

In [12]:
word_freq = {}
for token in tokenized_text:
    word_freq[token] = word_freq.get(token, 0) + 1


print("\nWord Frequencies:", word_freq)


Word Frequencies: {'geoffrey': 70, 'hinton': 130, 'wikipedia': 6, 'jump': 1, 'content': 1, 'main': 3, 'menu': 2, 'sidebar': 4, 'hide': 4, 'navigation': 1, 'pagecontentscurrent': 1, 'eventsrandom': 1, 'articleabout': 1, 'wikipediacontact': 1, 'contribute': 1, 'helplearn': 1, 'editcommunity': 1, 'portalrecent': 1, 'changesupload': 2, 'filespecial': 1, 'pages': 4, 'search': 4, 'appearance': 2, 'donate': 2, 'create': 3, 'account': 2, 'log': 2, 'personal': 3, 'tools': 3, 'logged': 1, 'editors': 1, 'learn': 7, 'contributionstalk': 1, 'contents': 3, '1': 26, 'education': 3, '2': 20, 'career': 3, 'research': 27, 'toggle': 4, 'subsection': 2, 'honours': 3, 'awards': 7, '3': 19, 'views': 3, 'risks': 6, 'artificial': 30, 'intelligence': 28, 'existential': 3, 'risk': 4, 'agi': 5, 'catastrophic': 2, 'misuse': 4, 'economic': 5, 'impacts': 4, 'politics': 3, '4': 7, 'life': 7, '5': 9, 'references': 2, '6': 10, 'reading': 2, 'table': 2, '54': 3, 'languages': 2, 'afrikaansالعربيةঅসমীয়াazərbaycancaتۆرک

# Question 3

For the same url, perform:

Stemming using Porter stemmer, Snowball stemmer.

Lemmatization using both NLTK and spaCy.

Display the results.


In [15]:
import nltk
from nltk.stem import PorterStemmer
nltk.download('punkt')

porter_stemmer = PorterStemmer()

with open('cdac_article.txt', encoding='utf-8') as f:
    text = f.read()

text = word_tokenize(text)

for word in text:

    print(f"Original : {word} --------> Stemmed : {porter_stemmer.stem(word)}")

Original : Centre --------> Stemmed : centr
Original : for --------> Stemmed : for
Original : Development --------> Stemmed : develop
Original : of --------> Stemmed : of
Original : Advanced --------> Stemmed : advanc
Original : Computing --------> Stemmed : comput
Original : - --------> Stemmed : -
Original : Wikipedia --------> Stemmed : wikipedia
Original : Jump --------> Stemmed : jump
Original : to --------> Stemmed : to
Original : content --------> Stemmed : content
Original : Main --------> Stemmed : main
Original : menu --------> Stemmed : menu
Original : Main --------> Stemmed : main
Original : menu --------> Stemmed : menu
Original : move --------> Stemmed : move
Original : to --------> Stemmed : to
Original : sidebar --------> Stemmed : sidebar
Original : hide --------> Stemmed : hide
Original : Navigation --------> Stemmed : navig
Original : Main --------> Stemmed : main
Original : pageContentsCurrent --------> Stemmed : pagecontentscurr
Original : eventsRandom --------> St

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Snowball Stemmer

from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer(language='english')

with open('cdac_article.txt', encoding='utf-8') as f:
    text = f.read()

words = word_tokenize(text)

for word in words:

    print(f"Original : {word} --------> Stemmed : {snowball_stemmer.stem(word)}")

Original : Centre --------> Stemmed : centr
Original : for --------> Stemmed : for
Original : Development --------> Stemmed : develop
Original : of --------> Stemmed : of
Original : Advanced --------> Stemmed : advanc
Original : Computing --------> Stemmed : comput
Original : - --------> Stemmed : -
Original : Wikipedia --------> Stemmed : wikipedia
Original : Jump --------> Stemmed : jump
Original : to --------> Stemmed : to
Original : content --------> Stemmed : content
Original : Main --------> Stemmed : main
Original : menu --------> Stemmed : menu
Original : Main --------> Stemmed : main
Original : menu --------> Stemmed : menu
Original : move --------> Stemmed : move
Original : to --------> Stemmed : to
Original : sidebar --------> Stemmed : sidebar
Original : hide --------> Stemmed : hide
Original : Navigation --------> Stemmed : navig
Original : Main --------> Stemmed : main
Original : pageContentsCurrent --------> Stemmed : pagecontentscurr
Original : eventsRandom --------> St

In [17]:
# NLTK lemma

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

with open('cdac_article.txt', encoding='utf-8') as f:
    text = f.read()

text = word_tokenize(text)

for word in text:

    print(f"Original : {word} --------> Lemmatized : {lemmatizer.lemmatize(word)}")

Original : Centre --------> Lemmatized : Centre
Original : for --------> Lemmatized : for
Original : Development --------> Lemmatized : Development
Original : of --------> Lemmatized : of
Original : Advanced --------> Lemmatized : Advanced
Original : Computing --------> Lemmatized : Computing
Original : - --------> Lemmatized : -
Original : Wikipedia --------> Lemmatized : Wikipedia
Original : Jump --------> Lemmatized : Jump
Original : to --------> Lemmatized : to
Original : content --------> Lemmatized : content
Original : Main --------> Lemmatized : Main
Original : menu --------> Lemmatized : menu
Original : Main --------> Lemmatized : Main
Original : menu --------> Lemmatized : menu
Original : move --------> Lemmatized : move
Original : to --------> Lemmatized : to
Original : sidebar --------> Lemmatized : sidebar
Original : hide --------> Lemmatized : hide
Original : Navigation --------> Lemmatized : Navigation
Original : Main --------> Lemmatized : Main
Original : pageContentsCur

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
for word in text:

    print(f"Original : {word} --------> Lemmatized : {lemmatizer.lemmatize(word, pos='n')}")

Original : Centre --------> Lemmatized : Centre
Original : for --------> Lemmatized : for
Original : Development --------> Lemmatized : Development
Original : of --------> Lemmatized : of
Original : Advanced --------> Lemmatized : Advanced
Original : Computing --------> Lemmatized : Computing
Original : - --------> Lemmatized : -
Original : Wikipedia --------> Lemmatized : Wikipedia
Original : Jump --------> Lemmatized : Jump
Original : to --------> Lemmatized : to
Original : content --------> Lemmatized : content
Original : Main --------> Lemmatized : Main
Original : menu --------> Lemmatized : menu
Original : Main --------> Lemmatized : Main
Original : menu --------> Lemmatized : menu
Original : move --------> Lemmatized : move
Original : to --------> Lemmatized : to
Original : sidebar --------> Lemmatized : sidebar
Original : hide --------> Lemmatized : hide
Original : Navigation --------> Lemmatized : Navigation
Original : Main --------> Lemmatized : Main
Original : pageContentsCur

In [19]:
for word in text:

    print(f"Original : {word} --------> Lemmatized : {lemmatizer.lemmatize(word, pos='v')}")

Original : Centre --------> Lemmatized : Centre
Original : for --------> Lemmatized : for
Original : Development --------> Lemmatized : Development
Original : of --------> Lemmatized : of
Original : Advanced --------> Lemmatized : Advanced
Original : Computing --------> Lemmatized : Computing
Original : - --------> Lemmatized : -
Original : Wikipedia --------> Lemmatized : Wikipedia
Original : Jump --------> Lemmatized : Jump
Original : to --------> Lemmatized : to
Original : content --------> Lemmatized : content
Original : Main --------> Lemmatized : Main
Original : menu --------> Lemmatized : menu
Original : Main --------> Lemmatized : Main
Original : menu --------> Lemmatized : menu
Original : move --------> Lemmatized : move
Original : to --------> Lemmatized : to
Original : sidebar --------> Lemmatized : sidebar
Original : hide --------> Lemmatized : hide
Original : Navigation --------> Lemmatized : Navigation
Original : Main --------> Lemmatized : Main
Original : pageContentsCur

# Question 4

perform stop word removal on a non-english wikipedia page content

In [21]:
!pip install wikipedia

import wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=aa08fc09a4607ebb977ec143e41e5317ff9f04c82ea5b09671abc7258ba5bea6
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [26]:
wikipedia.set_lang("fr")

text = wikipedia.page("Intelligence_artificielle").content

with open('intelligence_artificielle.txt', 'w', encoding='utf-8') as f:
    f.write(text)

print("Article saved to intelligence_artificielle.txt")

Article saved to intelligence_artificielle.txt


In [27]:
# NLTK

from nltk.corpus import stopwords
nltk.download('stopwords')

with open('intelligence_artificielle.txt', encoding='utf-8') as f:
    text = f.read()

sentences = sent_tokenize(text, language='french')

french_stop_words = set(stopwords.words('french'))

stemmer = nltk.stem.snowball.FrenchStemmer()

for i in range(len(sentences)):

    words = nltk.word_tokenize(sentences[i])

    filtered_words = [word for word in words if word.lower() not in french_stop_words]

    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    print(f"Original Sentence : {sentences[i]}")
    print(f"Filtered Words : {filtered_words}")
    print(f"Stemmed Words : {stemmed_words}")
    print()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Original Sentence : L'intelligence artificielle (IA) est la capacité des machines à effectuer des tâches typiquement associées à l'intelligence humaine, comme l'apprentissage, le raisonnement, la résolution de problème, la perception ou la prise de décision.
Filtered Words : ["L'intelligence", 'artificielle', '(', 'IA', ')', 'capacité', 'machines', 'effectuer', 'tâches', 'typiquement', 'associées', "l'intelligence", 'humaine', ',', 'comme', "l'apprentissage", ',', 'raisonnement', ',', 'résolution', 'problème', ',', 'perception', 'prise', 'décision', '.']
Stemmed Words : ["l'intelligent", 'artificiel', '(', 'ia', ')', 'capac', 'machin', 'effectu', 'tâch', 'typiqu', 'associ', "l'intelligent", 'humain', ',', 'comm', "l'apprentissag", ',', 'raison', ',', 'résolu', 'problem', ',', 'percept', 'pris', 'décis', '.']

Original Sentence : L'intelligence artificielle est également le champ de recherche visant à développer de telles machines ainsi que les systèmes informatiques qui en résultent.
F

In [29]:
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [31]:
# spaCy

import spacy

nlp = spacy.load('fr_core_news_sm')

with open('intelligence_artificielle.txt', encoding='utf-8') as f:
    sentences = f.read()



doc = nlp(sentences)

for token in doc:

    print(f"Word : {token.text} \t Is Stopword : {token.is_stop}")

Word : L' 	 Is Stopword : True
Word : intelligence 	 Is Stopword : False
Word : artificielle 	 Is Stopword : False
Word : ( 	 Is Stopword : False
Word : IA 	 Is Stopword : False
Word : ) 	 Is Stopword : False
Word : est 	 Is Stopword : True
Word : la 	 Is Stopword : True
Word : capacité 	 Is Stopword : False
Word : des 	 Is Stopword : True
Word : machines 	 Is Stopword : False
Word : à 	 Is Stopword : True
Word : effectuer 	 Is Stopword : False
Word : des 	 Is Stopword : True
Word : tâches 	 Is Stopword : False
Word : typiquement 	 Is Stopword : False
Word : associées 	 Is Stopword : False
Word : à 	 Is Stopword : True
Word : l' 	 Is Stopword : True
Word : intelligence 	 Is Stopword : False
Word : humaine 	 Is Stopword : False
Word : , 	 Is Stopword : False
Word : comme 	 Is Stopword : True
Word : l' 	 Is Stopword : True
Word : apprentissage 	 Is Stopword : False
Word : , 	 Is Stopword : False
Word : le 	 Is Stopword : True
Word : raisonnement 	 Is Stopword : False
Word : , 	 Is Stopwo