<a href="https://colab.research.google.com/github/Hariharan8426/NLP-obs/blob/main/Copy_of_NLP__11239A034.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**LEMMATIZATION**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data (and its multilingual companion) before lemmatizing.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

lemmatizer = WordNetLemmatizer()
words = ["cats", "running", "better", "studies"]

# lemmatize() is called without a part-of-speech argument, so each word is
# treated as a noun; that is why "running" and "better" come back unchanged.
lemmas = [lemmatizer.lemmatize(word) for word in words]
for word, lemma in zip(words, lemmas):
    print(word, "→", lemma)


cats → cat
running → running
better → better
studies → study


**NORMALIZATION**

In [None]:
import re

# Anything that is neither a word character nor whitespace is treated as
# punctuation and removed.
punctuation = re.compile(r'[^\w\s]')

text = "This is an Example: Normalizing Text in NLP!!!"
text = punctuation.sub('', text.lower())   # lowercase first, then strip punctuation
print(text)

this is an example normalizing text in nlp


**TOKENIZATION**

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Punkt tokenizer data, fetched before tokenizing.
nltk.download('punkt_tab', quiet=True)

text = "I love learning NLP with ChatGPT!"
tokens = word_tokenize(text)   # punctuation becomes its own token
print(tokens)

['I', 'love', 'learning', 'NLP', 'with', 'ChatGPT', '!']


**STEMMING**

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "flies", "easily", "fairly"]

# Pair every word with its Porter stem, then report the pairs. Stems are
# truncated forms and need not be dictionary words (e.g. "flies" -> "fli").
stems = map(stemmer.stem, words)
for original, stem in zip(words, stems):
    print(original, "→", stem)


running → run
flies → fli
easily → easili
fairly → fairli


**MORPHOLOGY**

In [None]:
import warnings

import spacy

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

nlp = spacy.load("en_core_web_sm")
text = "The cats are running quickly"
doc = nlp(text)

# One line per token: surface form, lemma, coarse POS tag, morphological features.
for token in doc:
    fields = (token.text, "→", token.lemma_, "|", token.pos_, "|", token.morph)
    print(*fields)


The → the | DET | Definite=Def|PronType=Art
cats → cat | NOUN | Number=Plur
are → be | AUX | Mood=Ind|Tense=Pres|VerbForm=Fin
running → run | VERB | Aspect=Prog|Tense=Pres|VerbForm=Part
quickly → quickly | ADV | 


**SPELLING CORRECTION**

In [None]:
!pip install textblob -q
from textblob import TextBlob
text = "I lik to lern naturall langauge procesing"
blob = TextBlob(text)
print(blob.correct())


I like to learn natural language processing


**DEDUCTION**

In [None]:
import nltk
from nltk.inference import ResolutionProver
from nltk.sem import Expression

read_expr = Expression.fromstring

# Knowledge base: one fact and one universally quantified rule.
premises = (
    'man(Socrates)',
    'all x (man(x) -> mortal(x))',
)
kb = [read_expr(premise) for premise in premises]

# Statement to be derived from the knowledge base.
goal = read_expr('mortal(Socrates)')

prover = ResolutionProver()
print(prover.prove(goal, kb))


True


**UNIGRAM**

In [None]:
text = "I love natural language processing and I love coding"

words = text.split()

# Seed every distinct word with a zero count (first-seen order is kept),
# then tally one occurrence per token.
freq = dict.fromkeys(words, 0)
for token in words:
    freq[token] += 1

print(freq)


{'I': 2, 'love': 2, 'natural': 1, 'language': 1, 'processing': 1, 'and': 1, 'coding': 1}


**BIGRAM**

In [None]:
text = "I love natural language processing and I love coding"

words = text.split()

# Pair each word with its immediate successor.
bigrams = list(zip(words, words[1:]))

# Seed every distinct pair with zero (first-seen order is kept), then tally.
freq = dict.fromkeys(bigrams, 0)
for pair in bigrams:
    freq[pair] += 1

print(freq)


{('I', 'love'): 2, ('love', 'natural'): 1, ('natural', 'language'): 1, ('language', 'processing'): 1, ('processing', 'and'): 1, ('and', 'I'): 1, ('love', 'coding'): 1}


**TRIGRAM**

In [None]:
text = "I love natural language processing and I love coding"

words = text.split()

# Slide a window of three consecutive words over the sentence.
trigrams = list(zip(words, words[1:], words[2:]))

# Seed every distinct triple with zero (first-seen order is kept), then tally.
freq = dict.fromkeys(trigrams, 0)
for triple in trigrams:
    freq[triple] += 1

print(freq)


{('I', 'love', 'natural'): 1, ('love', 'natural', 'language'): 1, ('natural', 'language', 'processing'): 1, ('language', 'processing', 'and'): 1, ('processing', 'and', 'I'): 1, ('and', 'I', 'love'): 1, ('I', 'love', 'coding'): 1}


**N-GRAM SMOOTHING**

In [None]:
from collections import Counter

text = "I love NLP I love machine learning"
words = text.split()
V = len(set(words))  # Vocabulary size
unigrams = Counter(words)
bigrams = Counter(zip(words, words[1:]))  # counts of adjacent word pairs


def laplace_prob(w1, w2, k=1):
    """Return the add-k smoothed bigram probability P(w2 | w1).

    Computes (count(w1, w2) + k) / (count(w1) + k * V) from the module-level
    `bigrams` and `unigrams` counters and the vocabulary size `V`.

    Args:
        w1: The preceding (context) word.
        w2: The word whose conditional probability is estimated.
        k: Pseudo-count added to every bigram. The default of 1 gives
            Laplace (add-one) smoothing.

    Returns:
        The smoothed probability as a float. Pairs that never occur in the
        text still receive a non-zero probability when k > 0.

    Raises:
        ZeroDivisionError: If k is 0 and w1 does not occur in the text.
    """
    return (bigrams[(w1, w2)] + k) / (unigrams[w1] + k * V)


print("P(love | I) =", laplace_prob("I", "love"))
print("P(NLP | love) =", laplace_prob("love", "NLP"))
# "NLP machine" never occurs in the text, so this value is pure smoothing mass.
print("P(machine | NLP) =", laplace_prob("NLP", "machine"))
print("P(learning | machine) =", laplace_prob("machine", "learning"))
print("P(unknown | NLP) =", laplace_prob("NLP", "unknown"))  # unseen word

P(love | I) = 0.42857142857142855
P(NLP | love) = 0.2857142857142857
P(machine | NLP) = 0.16666666666666666
P(learning | machine) = 0.3333333333333333
P(unknown | NLP) = 0.16666666666666666


**POS TAGGING**

In [None]:
import nltk

# This cell tokenizes as well as tags, so it fetches the tokenizer data in
# addition to the tagger model instead of relying on an earlier cell having
# downloaded 'punkt_tab'.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

text = "I love learning NLP"
words = nltk.word_tokenize(text)
# pos_tag returns a list of (token, tag) pairs.
print(nltk.pos_tag(words))

[('I', 'PRP'), ('love', 'VBP'), ('learning', 'VBG'), ('NLP', 'NNP')]


**HMM TAGGING**

In [None]:
import warnings

import nltk
from nltk.tag import hmm

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Two hand-tagged training sentences, each a list of (word, tag) pairs.
train_data = [
    [('I', 'PRON'), ('love', 'VERB'), ('dogs', 'NOUN')],
    [('You', 'PRON'), ('love', 'VERB'), ('cats', 'NOUN')],
]

# Estimate the HMM parameters from the tagged sentences.
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)

sentence = ['I', 'love', 'cats']
tagged = tagger.tag(sentence)
print(tagged)


[('I', 'PRON'), ('love', 'VERB'), ('cats', 'NOUN')]


**BRILL POS TAGGER**

In [None]:
import nltk
from nltk.tag import brill, brill_trainer, UnigramTagger

# Corpus and tagset mapping needed for the universal-tagset treebank sentences.
for corpus_name in ('treebank', 'universal_tagset'):
    nltk.download(corpus_name, quiet=True)

# First 3000 tagged sentences serve as training data for both stages.
data = nltk.corpus.treebank.tagged_sents(tagset='universal')[:3000]

# Baseline tagger. It is built without a backoff tagger, so a word absent from
# the training sentences is tagged None unless a learned rule changes it.
uni = UnigramTagger(data)

# Learn transformation rules on top of the baseline using the fntbl37 templates.
rule_trainer = brill_trainer.BrillTaggerTrainer(uni, brill.fntbl37())
tagger = rule_trainer.train(data)

print(tagger.tag("I love learning NLP".split()))

[('I', 'PRON'), ('love', None), ('learning', 'NOUN'), ('NLP', None)]
