---   
 <img align="left" width="75" height="75"  src="https://upload.wikimedia.org/wikipedia/en/c/c8/University_of_the_Punjab_logo.png"> 

<h1 align="center">Department of Data Science</h1>
<h1 align="center">Natural Language Processing</h1>

---
<h3><div align="right">Instructor: Dr. Khurram Shahzad (Tenured)</div></h3>

<h1 align="center">Topic 03 - Text Normalization</h1>

# Learning agenda of this notebook 

## Tokenization 
### Types of Tokenization


![image.png](attachment:image.png)


## Subtypes of Tokenization

![image.png](attachment:image.png)

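## White space tokenization

> **Note:** the cell below calls NLTK's `word_tokenize`, which does more than split on white space — it also separates punctuation, which is why `'.'` appears as its own token in the output. Splitting purely on spaces with `str.split` is demonstrated later in this notebook.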

In [10]:
from nltk.tokenize import word_tokenize
# Sentence used to demonstrate NLTK's word-level tokenizer.
text = "This is an example sentence for tokenization."

# word_tokenize returns the tokens as a list; the recorded output shows the
# trailing full stop split off into a token of its own.
tokens = word_tokenize(text)

print(tokens)

['This', 'is', 'an', 'example', 'sentence', 'for', 'tokenization', '.']


## Dictionary based tokenization

### Example 1:
- Tokenization without splitting the sentence

In [1]:
import nltk
import pandas as pd

# Sentence to tokenize against the dictionary.
text = "If it were not for Allah checking some people by means of others"

# Word list produced elsewhere in this notebook; candidates are read from
# its "Word" column.
dictionary = pd.read_excel("dictionary.xlsx")

# `in` on a str is a substring test, so an entry is kept whenever it occurs
# anywhere in the raw sentence -- including inside a longer word (for example
# "king" inside "checking"). Result order follows the dictionary, not the
# sentence.
tokens = [word for word in dictionary["Word"] if word in text]

print(tokens)


['a', 'an', 'b', 'by', 'checking', 'e', 'f', 'for', 'g', 'he', 'her', 'i', 'in', 'it', 'ki', 'king', 'la', 'le', 'me', 'means', 'n', 'no', 'not', 'o', 'of', 'or', 'other', 'others', 'people', 're', 's', 'so', 'some', 'the', 'we', 'were', 'y']


### Example 2:
- Tokenization by splitting the sentence

In [2]:
import nltk
import pandas as pd

# Sentence to tokenize against the dictionary.
text = "If it were not for Allah checking some people by means of others"

# Candidate tokens are read from the "Word" column of the dictionary file.
dictionary = pd.read_excel("dictionary.xlsx")

# Split the sentence once, outside the loop, and keep the pieces in a set so
# each dictionary lookup is a constant-time whole-word membership test
# (previously the sentence was re-split and scanned for every entry).
sentence_words = set(text.split())

# Keep only dictionary entries that match a whole whitespace-separated word.
# The comparison is case-sensitive, and the result follows dictionary order.
tokens = [word for word in dictionary.Word if word in sentence_words]

print(tokens)


['by', 'checking', 'for', 'it', 'means', 'not', 'of', 'others', 'people', 'some', 'were']


## Regular Expression Tokenizer

In [17]:
import nltk
from nltk.tokenize import regexp_tokenize

# Sentence to split with a regular-expression tokenizer.
text = "Hello, how are you doing today?"

# \w+ matches each maximal run of word characters, so punctuation such as
# "," and "?" is never part of a token.
pattern = r'\w+'

# Every match of the pattern becomes one token.
tokens = regexp_tokenize(text, pattern)

print(tokens)


['Hello', 'how', 'are', 'you', 'doing', 'today']


## Penn TreeBank Tokenization

In [27]:
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

# Multi-line sample; the \n escapes are real newlines in the resulting string.
text = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''

# Build the tokenizer / detokenizer pair used for the round trip.
tokenizer = TreebankWordTokenizer()
detokenizer = TreebankWordDetokenizer()

# Tokenize first, then join the tokens back into a single string.
toks = tokenizer.tokenize(text)

# Left as a bare last expression so the notebook displays the rebuilt text.
detokenizer.detokenize(toks)

'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

## Space-Based Tokenization

In [40]:
# Sentence to split on single space characters.
text = "Hello, how are you doing today?"

# str.split(' ') cuts only at spaces, so punctuation stays attached to the
# neighbouring word ("Hello," and "today?").
tokens = text.split(' ')

# Print the token list (the original referenced an undefined name `toke`,
# which raised NameError).
print(tokens)

['Hello,', 'how', 'are', 'you', 'doing', 'today?']


## Punctuation-based tokenizer

In [46]:
import nltk
from nltk.tokenize import WordPunctTokenizer

# Sentence to split into alphanumeric and punctuation tokens.
text = "Hello, how are you doing today?"

# Instantiate the tokenizer, then apply it; the recorded output shows "," and
# "?" emitted as separate tokens.
word_punct_tokenizer = WordPunctTokenizer()
tokens = word_punct_tokenizer.tokenize(text)

print(tokens)


['Hello', ',', 'how', 'are', 'you', 'doing', 'today', '?']


## Subword Tokenization

In [49]:
from tokenizers import ByteLevelBPETokenizer

# Text file(s) the BPE vocabulary is learned from.
corpus_files = ["textfile.txt"]

# Start from an untrained byte-level BPE tokenizer and fit it on the corpus.
# The vocabulary is capped at 1000 entries, a minimum frequency of 2 is
# required, and four special tokens are registered.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=corpus_files,
    vocab_size=1000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
)

# Encode one sentence with the trained tokenizer.
sentence = "This is an example sentence."
encoded = tokenizer.encode(sentence)

# `tokens` holds the subword strings of the encoding. In the recorded output,
# pieces that start a new word carry a "Ġ" prefix.
tokens = encoded.tokens
print(tokens)





['T', 'h', 'is', 'Ġis', 'Ġan', 'Ġex', 'am', 'ple', 'Ġs', 'ent', 'en', 'ce', '.']


## How to generate vocabulary

In [None]:
import pandas as pd
import re

# Workbook whose cell contents are the source text for the vocabulary.
excel_file = 'Fold_1.xlsx'
df = pd.read_excel(excel_file)

# Flatten every cell of the sheet into one space-separated string.
# Empty cells are read as NaN; they are skipped here because str(NaN) is
# "nan", which would otherwise end up in the vocabulary as a spurious word.
text = ' '.join(
    str(item)
    for row in df.values.tolist()
    for item in row
    if pd.notna(item)
)

# Collapse each run of non-word characters into one space, then lower-case.
processed_text = re.sub(r'\W+', ' ', text).lower()
words = processed_text.split()

# Unique words only.
vocabulary = set(words)

# Sort before writing: set iteration order for strings can differ between
# interpreter runs, so a plain list(vocabulary) would not give a reproducible
# dictionary file.
vocabulary_list = sorted(vocabulary)
df_vocabulary = pd.DataFrame(vocabulary_list, columns=['Word'])  # single 'Word' column
df_vocabulary.to_excel("dictionary.xlsx", index=False, engine='xlsxwriter')