# VK23-2 Qualification NLP Case 2

In [45]:
# Language modelling with n-grams
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class NGramLanguageModel:
    """Word n-gram count model backed by a scikit-learn ``CountVectorizer``.

    The vectorizer is configured with ``ngram_range=(n, n)``, so only
    n-grams of exactly ``n`` words are counted.
    """

    def __init__(self, n):
        """Store ``n`` and build the underlying word-level vectorizer."""
        self.n = n
        ngram_bounds = (n, n)
        self.vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_bounds)

    def fit_transform(self, corpus):
        """Fit the vectorizer on ``corpus`` and return the resulting matrix."""
        counts = self.vectorizer.fit_transform(corpus)
        return counts

    def transform(self, corpus):
        """Vectorize ``corpus`` with the vectorizer as it currently stands."""
        counts = self.vectorizer.transform(corpus)
        return counts

def calculate_cosine_similarity_matrix(matrix, query_v):
    """Return scikit-learn's pairwise cosine similarity of ``matrix`` vs ``query_v``.

    Thin wrapper: the value produced by ``cosine_similarity`` is returned
    unchanged.
    """
    return cosine_similarity(matrix, query_v)

# Read the corpus as a list of lines (each line keeps its trailing newline).
# If the file is missing, report it and continue with an empty corpus.
corpus = []
try:
    with open("pos.txt", 'r') as file:
        corpus = list(file)
except FileNotFoundError:
    print("File not found")

In [46]:
def similarity_n_gram(query, n=1):
    """Print the cosine similarity between ``query`` and every corpus document.

    The module-level ``corpus`` is vectorized into word n-gram counts, the
    query is vectorized with the same fitted vectorizer, and one similarity
    score per corpus document is printed.

    Args:
        query: Text to compare against the corpus.
        n: Size of the word n-grams to count. Defaults to 1 (unigrams),
            which is what the original hard-coded.

    Returns:
        None. The count matrix, the query vector, the query itself and the
        similarity table are printed.
    """
    if not corpus:
        # The loading cell leaves ``corpus`` empty when the file is missing;
        # fitting a vectorizer on zero documents would only raise.
        print("Corpus is empty; nothing to compare against")
        return

    ngram_model = NGramLanguageModel(n)

    matrix = ngram_model.fit_transform(corpus)
    query_v = ngram_model.transform([query])

    print(f'{n}-Gram Model:')

    # ``toarray()`` replaces the ``.A`` shorthand, which recent SciPy
    # releases no longer provide on sparse matrices.
    feature_names = ngram_model.vectorizer.get_feature_names_out()
    print(pd.DataFrame(matrix.toarray(), columns=feature_names))
    print(query_v.toarray())

    similarities = calculate_cosine_similarity_matrix(matrix, query_v)
    df = pd.DataFrame(similarities, columns=['Similarity'])

    print(query)
    print()
    print(df)

In [47]:
# Word Embedding
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it
# up under the name 'punkt_tab', older ones under 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

def word_embedding(query):
    """Print Word2Vec-based similarity between ``query`` and sampled corpus lines.

    A small Word2Vec model is trained on a handful of sentences taken from
    the module-level ``corpus``; the query is then compared with each of
    those sentences through ``model.wv.n_similarity``.

    Args:
        query: Text to compare against the sampled sentences.

    Returns:
        None. A table with one similarity per sampled sentence is printed,
        or a short message when no comparison is possible.
    """
    # Sampling rule kept from the original: the first 20 corpus lines that
    # tokenize (lower-cased) to exactly 7 tokens.
    token_count = 7
    sample_size = 20

    sampled = []
    for sentence in corpus:
        tokens = word_tokenize(sentence.lower())
        if len(tokens) == token_count:
            sampled.append((sentence, tokens))
            if len(sampled) == sample_size:
                # Enough sentences collected; no need to tokenize the rest.
                break

    # NOTE(review): the second sampled sentence is dropped, exactly as the
    # original did; the reason is not visible here - confirm it is intended.
    sampled = sampled[:1] + sampled[2:]

    if not sampled:
        # Word2Vec cannot build a vocabulary from zero sentences.
        print("No corpus sentences available for training")
        return

    sentences = [sentence for sentence, _ in sampled]
    tokenized_corpus = [tokens for _, tokens in sampled]

    model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

    # The vocabulary only holds words from the sampled sentences. Unknown
    # query words are dropped so the lookup cannot fail on them, and an
    # empty remainder is reported instead of raising.
    tokenized_query = [token for token in word_tokenize(query.lower()) if token in model.wv]
    if not tokenized_query:
        print("None of the query words appear in the model vocabulary")
        return

    similarities = [
        model.wv.n_similarity(tokenized_query, comment)
        for comment in tokenized_corpus
    ]

    df = pd.DataFrame({'Document': sentences, 'Similarity': similarities})
    print()
    print(df)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jonathanmaverick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
# Grammar Parsing with NLTK
from nltk import CFG, ChartParser

import nltk
from nltk import CFG
from nltk.parse import ChartParser

def demonstrate_nlp_parsing(sentence, grammar):
    """Print every parse tree that ``grammar`` yields for ``sentence``.

    Args:
        sentence: Raw sentence; tokenized with ``nltk.word_tokenize``.
        grammar: NLTK ``CFG`` handed to ``ChartParser``.

    Returns:
        None. Each tree is printed in bracketed form followed by its
        ``pretty_print`` rendering; if there is none, a
        "No parse tree found" message is printed instead.
    """
    words = nltk.word_tokenize(sentence)

    # The chart parser raises ValueError when a token is not covered by the
    # grammar. Check up front so such a sentence is reported as unparseable
    # instead of aborting the caller's loop over sentences.
    try:
        grammar.check_coverage(words)
    except ValueError as error:
        print(f"No parse tree found ({error})")
        return

    parser = ChartParser(grammar)
    parses = list(parser.parse(words))
    if not parses:
        print("No parse tree found")
        return

    for tree in parses:
        print(tree, "\n")
        tree.pretty_print()
        
# Toy context-free grammar used by the grammar-parsing demo.
# Rules are written in NLTK's CFG string notation: `LHS -> RHS | RHS`, with
# quoted strings as terminal words.
# NOTE(review): "well" is listed under Adj although it acts as an adverb in
# "she dances well at parties"; renaming the symbol would change the printed
# trees, so it is left as is.
nlp_grammar = CFG.fromstring('''
    S -> NP VP | NP VP PP
    VP -> V NP | V PP | V N | V Adj
    PP -> P VP | P NP | P N
    V -> "eat" | "sleeps" | "dances" | "reads" | "love"
    NP -> "I" | "the" N | "she" | "he"
    N -> "cat" | "pizza" | "sofa" | "parties" | "books" | "library"
    P -> "on" | "in" | "at" | "to"
    Adj -> "well"
''')
        
# Demo sentences that nlp_parsing() feeds, in order, to the grammar parser.
nlp_sentences = [
    "I love to eat pizza",
    "the cat sleeps on the sofa",
    "she dances well at parties",
    "he reads books in the library"
]

def nlp_parsing():
    """Run the grammar-parsing demo over every entry of ``nlp_sentences``.

    For each sentence a numbered header is printed, then the output of
    ``demonstrate_nlp_parsing`` with ``nlp_grammar``, then a blank separator.
    """
    for number, text in enumerate(nlp_sentences, start=1):
        print(f"Sentence {number}: {text}")
        demonstrate_nlp_parsing(text, nlp_grammar)
        print("\n")


In [77]:
# Dependency Parsing with SpaCy
import spacy 

# Loaded spaCy pipelines keyed by model name, so each model is read only once.
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Return the spaCy pipeline ``name``, loading it on first use only."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def extract_named_entities_and_parse_tree(sentence):
    """Print the dependency relation of every token in ``sentence``.

    Despite the function name, only the dependency parse is printed; no
    named entities are extracted here.

    Args:
        sentence: Text to analyse with the ``en_core_web_sm`` pipeline.

    Returns:
        None. One line per token is printed in the form
        ``text --dep-- head (pos)``.
    """
    # Reuse the cached pipeline instead of reloading the model on every call.
    doc = _get_spacy_model()(sentence)

    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} --{token.dep_}-- {token.head.text} ({token.pos_})")

In [81]:
# Named Entity Recognition
import spacy

# Module-level spaCy pipeline, loaded once; categorized_nar() reads this global.
nlp = spacy.load("en_core_web_sm")

def categorized_nar(sentence):
    """Print the named entities of ``sentence`` grouped by entity label.

    The module-level spaCy pipeline ``nlp`` does the analysis. Labels are
    printed in order of first appearance, each followed by its entity texts
    joined with ", ".
    """
    grouped = {}
    for entity in nlp(sentence).ents:
        grouped.setdefault(entity.label_, []).append(entity.text)

    print("Categorized Named Entities:")
    for label in grouped:
        print(f"{label}: {', '.join(grouped[label])}")

In [82]:
# Interactive entry point: show the menu, read one command, run that demo.
print('Natural Language Processing Case 2')
print('1. Text Similarity N-Gram')
print('2. Text Similarity word embedding')
print('3. Simulate grammar parsing')
print('4. Dependency Parsing')
print('5. Named Entity Recognition')
command = input('Enter command: ').strip()

# Demos that take a sentence typed by the user, keyed by menu command.
sentence_demos = {
    '1': similarity_n_gram,
    '2': word_embedding,
    '4': extract_named_entities_and_parse_tree,
    '5': categorized_nar,
}

if command in sentence_demos:
    sentence = input('Enter sentence: ')
    sentence_demos[command](sentence)
elif command == '3':
    # The grammar demo runs on its own built-in sentences.
    nlp_parsing()
else:
    # Previously an unrecognised command ended the cell without any feedback.
    print(f'Unknown command: {command!r}')

Natural Language Processing Case 2
1. Text Similartity N-Gram
2. Text Similarity word embedding
3. Simulate grammar parsing
4. Dependency Parsing
5. Named Entity Recognition
0. Exit
Categorized Named Entities:
PERSON: Barack Obama
GPE: Hawaii
DATE: 2008
