# Natural Language Processing

In [None]:
# importing libraries
import nltk

## Text preprocessing and cleaning

In [None]:
text = "Tokenization is the task of splitting a text into meaningful segments, called tokens."

## Tokenization

#### Word tokenization

In [None]:
# Needs NLTK's 'punkt' tokenizer data; NLTK raises LookupError if the
# resource has not been downloaded with nltk.download('punkt') yet.
nltk.word_tokenize(text)

In [None]:
nltk.download('punkt')
nltk.word_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Tokenization',
 'is',
 'the',
 'task',
 'of',
 'splitting',
 'a',
 'text',
 'into',
 'meaningful',
 'segments',
 ',',
 'called',
 'tokens',
 '.']

In [None]:
words = nltk.word_tokenize(text)
words

['Tokenization',
 'is',
 'the',
 'task',
 'of',
 'splitting',
 'a',
 'text',
 'into',
 'meaningful',
 'segments',
 ',',
 'called',
 'tokens',
 '.']

#### Sentence tokenization

In [None]:
paragraph ="He is a good boy. she is a good girl."

In [None]:
nltk.sent_tokenize(paragraph)

['He is a good boy.', 'she is a good girl.']

In [None]:
para = nltk.sent_tokenize(paragraph)
para

['He is a good boy.', 'she is a good girl.']

### Stopwords and stopword removal
*  Words like "is", "the", "of", and "a" carry little meaning on their own, so we can remove them from a sentence using a stopword list.

In [None]:
# Stopwords for the English language. Needs the NLTK 'stopwords' corpus; NLTK
# raises LookupError if it has not been downloaded yet.
stopword = nltk.corpus.stopwords.words('english')
stopword

In [None]:
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')
stopword

In [None]:
# Stopwords for the Nepali language
stopword = nltk.corpus.stopwords.words('nepali')
stopword

#### Stopwords removal

In [None]:
sent = "He is a good boy"

In [None]:
words = nltk.word_tokenize(sent)
words

['He', 'is', 'a', 'good', 'boy']

In [None]:
stopwords = nltk.corpus.stopwords.words("english")
stopwords

In [None]:
# Drop every token whose lowercase form appears in the stopword list, then
# glue the surviving tokens back together with single spaces.
clean_text = list(filter(lambda token: token.lower() not in stopwords, words))
" ".join(clean_text)

'good boy'

## Stemming: reduces words to their root (base) form, called the stem. The stem may not be a meaningful word, but stemming is very fast.

In [None]:
words = ['change', 'changes', "changing"]


In [None]:
# import libraries
from nltk.stem import PorterStemmer

In [None]:
stemmer = PorterStemmer()
[stemmer.stem(word) for word in words]

['chang', 'chang', 'chang']

##### "chang" has no dictionary meaning

### Lemmatization: Lemmatization reduces a word to its canonical dictionary form. The root word is called a "lemma" and the method is called lemmatization. It is slower than stemming.

In [None]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
words = ['change', 'changes','changing','changed']
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# No part-of-speech tag is passed, so lemmatize() uses its default pos="n" and
# treats every word as a noun; the verb forms 'changing' and 'changed' are
# therefore returned unchanged. Pass pos="v" to reduce them to 'change'.
[lemmatizer.lemmatize(word) for word in words]

['change', 'change', 'changing', 'changed']

## Removing HTML from text

In [None]:
text = "<p><b>Pangolins</b> risk extinction in Khotang</p>"

In [None]:
text

'<p><b>Pangolins</b> risk extinction in Khotang</p>'

In [None]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(text, "html.parser").text

In [None]:
soup

'Pangolins risk extinction in Khotang'

# Regex or Regular Expression
A regular expression, sometimes referred to as rational expression, is a sequence of characters that specifies a match pattern in text. Usually such patterns are used by string-searching algorithms for "find" or "find and replace" operations on strings, or for input validation.

In [None]:
import re

In [None]:
pattern = "a....s"
text = "abacus"
re.match(pattern, text)

<re.Match object; span=(0, 6), match='abacus'>

In [None]:
# Counter-example: the pattern needs 'a', then any four characters, then 's'
# (six characters in total), so the four-character "abas" cannot match and
# re.match returns None.
pattern = "a....s"
text = "abas"
re.match(pattern, text)

# [ ]
* specifies a set of characters you wish to match

In [None]:
import re
pattern = '[abc]'
string = 'aklbac'
re.match(pattern, string)

<re.Match object; span=(0, 1), match='a'>

In [None]:
import re
pattern = '[abc]'
string = 'aklbac'
re.findall(pattern, string)

['a', 'b', 'a', 'c']

In [None]:
import re
pattern = "[0-9]"
string = "Hello world I am naresh dhimal. Currently, i am 24 years old."
re.findall(pattern, string)

['2', '4']

In [None]:
import re
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning from Python 3.12).
# \d+ matches each maximal run of digits, so "24" is found as one token.
pattern = r"\d+"
string = "Hello world I am naresh dhimal. Currently, i am 24 years old."
re.findall(pattern, string)

['24']

# +
* matches one or more occurrences of the preceding pattern

In [None]:
import re
pattern = "[ab]+"
string = "abibas"
re.findall(pattern, string)

['ab', 'ba']

In [None]:
pattern= r'\bfoo'
string = 'football, foot , foo '
re.findall(pattern, string)

['foo', 'foo', 'foo']

In [None]:
email= """
naresh@gmail.com
info@gmail.com
ram.sigh@gmail.com
"""

In [None]:
# Escape the dot so "gmail.com" is matched literally (an unescaped "." would
# also accept e.g. "gmailxcom"); "@" needs no backslash.
# NOTE: the local part allows only letters and digits, so a dotted address
# such as "ram.sigh@gmail.com" is matched only from "sigh" onwards.
pattern = r'\b[0-9a-zA-Z]+@gmail\.com\b'

In [None]:
re.findall(pattern, email)

['naresh@gmail.com', 'info@gmail.com', 'sigh@gmail.com']

# Remove html using regex

In [None]:
text = "<p><b>Pangolins</b> risk extinction in Khotang</p>"

In [None]:
import  re
# Matches any HTML/XML tag non-greedily. DOTALL lets "." cross newlines, so
# tags whose attributes are wrapped over several lines are removed as well.
CLEANER = re.compile(r'<.*?>', re.DOTALL)


def clean_html(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag removed.

    Only the markup is stripped; the text between tags is returned unchanged
    (HTML entities such as ``&amp;`` are not decoded).
    """
    return CLEANER.sub("", raw_html)

In [None]:
clean_html(text)

'Pangolins risk extinction in Khotang'

# Converting text to vector
* Bag of words/ CountVectorizer()
* Tfidf-vectorizer

## Bag of words

In [None]:
# Example
corpus =["He is a lazy boy. She is also lazy.","Neeraj is lazy person"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer(stop_words='english')
X = vector.fit_transform(corpus)
X


<2x4 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [None]:
vocab = vector.get_feature_names_out()
vocab

array(['boy', 'lazy', 'neeraj', 'person'], dtype=object)

In [None]:
X.toarray()

array([[1, 2, 0, 0],
       [0, 1, 1, 1]])

In [None]:
import pandas as pd
df = pd.DataFrame(X.toarray(), columns=vocab)

In [None]:
df

Unnamed: 0,boy,lazy,neeraj,person
0,1,2,0,0
1,0,1,1,1


# Disadvantage
#### CountVectorizer gives more priority to frequently occurring words, so rarely occurring words get lower importance. To alleviate this problem we use the TF-IDF vectorizer.

# TF-IDF (Term Frequency - Inverse Document Frequency) vectorizer
$$
tf = \frac{\text{No. of repetitions of the word in the sentence}}{\text{No. of words in the sentence}}
\\
\\
idf = \log\left(\frac{\text{No. of sentences}}{\text{No. of sentences containing the word}}\right)
$$

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF weights on the corpus, dropping English stopwords.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(corpus)
# Column labels must come from this vectorizer's own vocabulary so they always
# line up with the columns of X, rather than from a previously fitted one.
vocab = vectorizer.get_feature_names_out()
pd.DataFrame(X.toarray(), columns=vocab)

Unnamed: 0,boy,lazy,neeraj,person
0,0.574962,0.81818,0.0,0.0
1,0.0,0.449436,0.631667,0.631667
