In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

corpus = 'India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea in the south-west, and the Bay of Bengal in the south-east, it shares land borders with Pakistan to the west; China, Nepal and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar and Indonesia.'

# <font color = 'red'>1. <u>Text Encoding Decoding</u> | <u>without StopWords</u></font>

## Stopwords and special characters Removal

In [62]:
words = []

for word in word_tokenize(corpus):
    if word.lower() not in stopwords.words('english') and len(word) > 1:
        words.append(word.lower())
        
vocab = list(set(words))
len(vocab)

45

## Creating encoders and decoders

In [16]:
word_to_num = {}
num_to_word = {}

num = 1

for word in vocab:
    word_to_num[word] = num
    num_to_word[num] = word
    num += 1

## Encoding sentences

In [63]:
data = []

for sent in sent_tokenize(corpus):                                            # Iterating each sentence
    
    temp = []
    
    for word in word_tokenize(sent):                                          # Iterating each word in a sentence
        
        if word.lower() not in stopwords.words('english') and len(word) > 1:  # Stopwords and special characters removal
            temp.append(word_to_num[word.lower()])                            # Appending each word in temp list
            print(word.lower(), end = ' ')
    
    print('\n', temp, '\n')
    
    data.append(temp)                                                         # Appending each sentence in data list

india officially republic india country south asia 
 [18, 44, 26, 18, 36, 25, 19] 

seventh-largest country area second-most populous country populous democracy world 
 [8, 36, 31, 20, 23, 36, 23, 22, 17] 

bounded indian ocean south arabian sea south-west bay bengal south-east shares land borders pakistan west china nepal bhutan north bangladesh myanmar east 
 [1, 37, 35, 25, 28, 21, 38, 7, 43, 2, 40, 33, 14, 32, 15, 27, 13, 4, 29, 30, 10, 16] 

indian ocean india vicinity sri lanka maldives andaman nicobar islands share maritime border thailand myanmar indonesia 
 [37, 35, 18, 24, 11, 3, 9, 41, 45, 12, 42, 5, 39, 34, 10, 6] 



## Decoding the sentences

In [64]:
for sent in data:                                    # Iterating through each sentence
    for word in sent:                                # Iterating through each word in a sentence
        print(num_to_word[word], end = ' ')          # decoding using num_to_words dictionary
    print()

india officially republic india country south asia 
seventh-largest country area second-most populous country populous democracy world 
bounded indian ocean south arabian sea south-west bay bengal south-east shares land borders pakistan west china nepal bhutan north bangladesh myanmar east 
indian ocean india vicinity sri lanka maldives andaman nicobar islands share maritime border thailand myanmar indonesia 


# <font color = 'red'>2. <u>Text Encoding Decoding</u> | <u>With Stopwords</u></font>

## Special Characters Removal

In [139]:
words = []

for word in word_tokenize(corpus):
    if len(word) == 1:
        if ((ord(word) >= 97 and ord(word) <= 122) or (ord(word) >= 65 and ord(word) <= 90)):    # checking if word is not an alphabet
            words.append(word.lower())
    else:
        words.append(word.lower())
    
vocab = list(set(words))                                 # Creating Vocab
len(vocab)

58

## Creating encoders and decoders

In [140]:
word_to_num = {}
num_to_word = {}

num = 1

for word in vocab:
    word_to_num[word] = num
    num_to_word[num] = word
    num += 1

## Encoding sentence

In [141]:
data = []

for sent in sent_tokenize(corpus):
    
    temp = []
    
    for word in word_tokenize(sent):
        
        if len(word) == 1:
            if ((ord(word) >= 97 and ord(word) <= 122) or (ord(word) >= 65 and ord(word) <= 90)):
                temp.append(word_to_num[word.lower()])
                print(word.lower(), end = ' ')
        else:
            temp.append(word_to_num[word.lower()])
            print(word.lower(), end = ' ')
    
    print('\n', temp, '\n')
    data.append(temp)

india officially the republic of india is a country in south asia 
 [21, 6, 44, 34, 23, 21, 17, 55, 57, 5, 33, 26] 

it is the seventh-largest country by area the second-most populous country and the most populous democracy in the world 
 [52, 17, 44, 10, 57, 25, 40, 44, 27, 30, 57, 22, 44, 39, 30, 29, 5, 44, 20] 

bounded by the indian ocean on the south the arabian sea in the south-west and the bay of bengal in the south-east it shares land borders with pakistan to the west china nepal and bhutan to the north and bangladesh and myanmar to the east 
 [1, 25, 44, 48, 47, 32, 44, 33, 44, 37, 28, 5, 44, 49, 22, 44, 9, 23, 56, 5, 44, 2, 52, 51, 45, 16, 35, 41, 24, 44, 18, 36, 15, 22, 4, 24, 44, 38, 22, 42, 22, 12, 24, 44, 19] 

in the indian ocean india is in the vicinity of sri lanka and the maldives its andaman and nicobar islands share a maritime border with thailand myanmar and indonesia 
 [5, 44, 48, 47, 21, 17, 5, 44, 31, 23, 13, 3, 22, 44, 11, 43, 53, 22, 58, 14, 54, 55, 7, 50, 35,

## Decoding the Sentences

In [143]:
for sent in data:
    for word in sent:
        print(num_to_word[word], end = ' ')
    print('\n')

india officially the republic of india is a country in south asia 

it is the seventh-largest country by area the second-most populous country and the most populous democracy in the world 

bounded by the indian ocean on the south the arabian sea in the south-west and the bay of bengal in the south-east it shares land borders with pakistan to the west china nepal and bhutan to the north and bangladesh and myanmar to the east 

in the indian ocean india is in the vicinity of sri lanka and the maldives its andaman and nicobar islands share a maritime border with thailand myanmar and indonesia 

