In [None]:
import nltk

# Fetch the NLTK data used by later cells: the Punkt models back
# nltk.word_tokenize and the stopwords corpus backs stopwords.words('english').
# The former nltk.download('stop') call was dropped because 'stop' is not a
# package id in the NLTK data index, so it only produced a "not found" error.
# NOTE(review): recent NLTK releases may additionally require the 'punkt_tab'
# resource for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import spacy

# Load spaCy's small English pipeline once; later cells reuse `nlp`.
# The bare name on the last line makes the notebook display the loaded object.
nlp = spacy.load("en_core_web_sm")
nlp

In [None]:
# 3. How to tokenize a given text?
text = " I wat gay"

# Tokenization with NLTK: word_tokenize returns the tokens as strings.
tokens = nltk.word_tokenize(text)
for nltk_token in tokens:
    print(nltk_token)

# Tokenization with spaCy: calling the pipeline yields a Doc whose items are
# Token objects; `.text` is the token's string.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
for spacy_token in doc:
    print(spacy_token.text)

In [None]:
# Remove stopwords with NLTK
from nltk.corpus import stopwords

# A set gives constant-time membership tests while filtering.
my_stopwords = set(stopwords.words('english'))

# Tokenization using word_tokenize()
all_tokens = nltk.word_tokenize(text)

# NLTK's English stopword list is lowercase, so compare on the lowercased
# token; otherwise capitalised stopwords such as "I" or "The" slip through.
# Tokens that are kept retain their original casing.
new_tokens = [token for token in all_tokens if token.lower() not in my_stopwords]

# Last expression: the notebook displays the text with stopwords removed.
" ".join(new_tokens)


In [None]:
# How to do spell correction in a given text?
from textblob import TextBlob

# Wrap the string under its own name instead of rebinding `text`: later cells
# pass `text` to nlp(...) and TextBlob(...), both of which need a plain string,
# and rebinding would also make this cell fail when it is run a second time.
text_blob = TextBlob(text)

# correct() returns a new blob with the spelling corrections applied.
print(text_blob.correct())
#> He is a great person. He believes in god

In [None]:
# How to extract all the nouns in a text?
# Convert the text into a spaCy Doc
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Print every token whose coarse part-of-speech tag (pos_) marks it as a
# common noun or a proper noun.
for tok in doc:
    if tok.pos_ in ('NOUN', 'PROPN'):
        print(tok.text)

In [None]:
# How to extract all the pronouns in a text?
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Skip everything that is not tagged as a pronoun; print the rest.
for tok in doc:
    if tok.pos_ != 'PRON':
        continue
    print(tok.text)

In [None]:
# How to find cosine similarity from a document-term matrix
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text1 = 'Taj Mahal is a tourist place in India'
text2 = 'Great Wall of China is a tourist place in china'
documents = [text1, text2]

# Count-based vector representation of the two documents.
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(documents)

# Densify the document-word matrix and wrap it in a DataFrame.
doc_term_matrix = matrix.todense()
df = pd.DataFrame(doc_term_matrix)

# Pairwise cosine similarity of the document rows against themselves.
print(cosine_similarity(df, df))

In [None]:
# How to compute the Word Mover's Distance between two sentences?
import gensim
import gensim.downloader as api

from gensim.models import Word2Vec

# Word Mover's Distance needs trained word vectors; a freshly constructed
# Word2Vec() has an empty vocabulary, and gensim 4 no longer exposes
# wmdistance on the Word2Vec model itself. Load pretrained vectors instead —
# api.load returns KeyedVectors, which provide wmdistance directly.
# NOTE(review): the first call downloads the vectors; swap in another
# gensim-data model name if a different embedding is preferred.
model = api.load('glove-wiki-gigaword-50')

sentence_orange = 'Oranges are my favorite fruit'
sent = "apples are not my favorite"

# wmdistance expects each document as a list of tokens; a raw string would be
# iterated character by character. Lowercase to match the vectors' vocabulary.
# NOTE(review): gensim relies on an optional optimal-transport package for
# this call — confirm it is installed in the environment.
distance = model.wmdistance(sent.lower().split(), sentence_orange.lower().split())
print(distance)

# NOTE(review): the originally recorded output was 5.378; the exact value
# depends on the embedding model used.

In [None]:
# How to extract topic keywords using LSA?
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Defining the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, max_df=0.5, smooth_idf=True)

# NOTE(review): `texts` is not defined in the cells visible here — confirm the
# collection of documents is created before this cell runs.
# Transforming the documents into matrix form through .fit_transform()
matrix = vectorizer.fit_transform(texts)

# SVD represents documents and terms as vectors in a reduced space
SVD_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=100, random_state=122)
SVD_model.fit(matrix)

# Getting the terms. get_feature_names() was removed from recent scikit-learn
# releases in favour of get_feature_names_out(); fall back to the old name so
# the cell also runs on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Iterating through each topic (one row of components_ per topic)
for i, comp in enumerate(SVD_model.components_):
    # Pair each term with its weight and keep the 7 highest-weighted terms
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    # printing the terms of a topic on one line
    for t in sorted_terms:
        print(t[0], end=' ')
    print(' ')

#> Topic 0: 
#> learn new life travelling country feel  
#> Topic 1: 
#> life cherish diaries let share experience  
#> Topic 2: 
#> feel know time people just regions  
#> Topic 3: 
#> time especially cherish diaries let share  

In [None]:

# Sentiment analysis with TextBlob
from textblob import TextBlob
blob = TextBlob(text)

# The sentiment attribute exposes polarity and subjectivity
print(blob.sentiment)

# Label every outcome, not just the positive one, so negative or neutral text
# does not end silently without a verdict.
polarity = blob.sentiment.polarity
if polarity > 0:
    print("Positive")
elif polarity < 0:
    print("Negative")
else:
    print("Neutral")

#> Sentiment(polarity=0.9533333333333333, subjectivity=1.0)
#> Positive

In [None]:
# Creating bigrams and trigrams
from nltk import ngrams

# NOTE(review): `Sentences` is not defined in the cells visible here — confirm
# the input string is created before this cell runs.
# Lowercase and whitespace-split once, then reuse the word list for both sizes.
lowered_words = Sentences.lower().split()
bigram = list(ngrams(lowered_words, 2))
trigram = list(ngrams(lowered_words, 3))

print(" Bigrams are", bigram)
print(" Trigrams are", trigram)

In [None]:
# How to detect the language of entered text ?
# Install spacy's languagedetect library
import spacy
!pip install spacy_langdetect
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')

# Add the language detector to the processing pipeline
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

doc = nlp(text)
# document level language detection. Think of it like average language of the document!
print(doc._.language)
# sentence level language detection
for sent in doc.sents:
   print(sent, sent._.language)

#> {'language': 'es', 'score': 0.9999963653206719}
#> El agente imprime su pase de abordaje. {'language': 'es', 'score': 0.9999969081229643}
#> Los oficiales de seguridad del aeropuerto pasan junto a él con un perro grande. {'language': 'es', 'score': 0.9999951631258189}
#> El perro está olfateando alrededor del equipaje de las personas tratando de detectar drogas o explosivos. {'language': 'es', 'score': 0.9999938903880353}

In [None]:
# How to extract first name and last names present in the document?
from spacy.matcher import Matcher

text = "Sherlock Holmes and Clint Thomas were good friends. I am a fan of John Mark"

# Initialize spaCy's matcher on the pipeline's vocabulary and parse the text
matcher = Matcher(nlp.vocab)
doc = nlp(text)


def extract_matches(doc):
    """Print every pair of consecutive proper nouns found in ``doc``.

    Uses the module-level ``matcher``; the 'FULL_NAME' pattern is registered
    on it the first time the function is called.

    Args:
        doc: the spaCy Doc to search.
    """
    # Register the pattern only once so repeated calls do not keep adding
    # another copy of it to the matcher.
    if 'FULL_NAME' not in matcher:
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
        # Patterns are passed as a list of patterns; the older
        # add(key, None, pattern) form is rejected by current spaCy.
        matcher.add('FULL_NAME', [pattern])

    # Each match is (match_id, start, end) with token offsets into doc
    for match_id, start, end in matcher(doc):
        print(doc[start:end].text)


extract_matches(doc)


#> Sherlock Holmes
#> Clint Thomas
#> John Mark

In [None]:
# How to identify named entities in the given text
text = " Walter works at Google. He lives in London."

# Load the spaCy model and run it over the text
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# doc.ents holds the recognised entity spans; label_ is the entity type
for ent in doc.ents:
    print(ent.text, ent.label_)

#> Walter PERSON
#> Google ORG
#> London GPE

In [None]:
# How to identify all the names of Organizations present in the text with NER?
doc = nlp(text)

# Collect the text of every entity labelled as an organization
list_of_org = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

print(list_of_org)
#> ['Google', 'Amazon', 'Apple', 'Flipkart']

In [None]:
# How to replace all names of people in the text with 'UNKNOWN'

# NOTE(review): `news` is not defined in the cells visible here — confirm the
# input text is created before this cell runs.
doc = nlp(news)

# Decide per token using its entity annotation. Comparing token text against
# whole entity strings would miss multi-token names (e.g. a first name plus a
# surname), because no single token equals the full name.
updated_text = []
for token in doc:
    if token.ent_type_ != 'PERSON':
        updated_text.append(token.text)
    elif token.ent_iob_ == 'B':
        # First token of a PERSON entity: emit one placeholder for the name
        updated_text.append("UNKNOWN")
    # Remaining tokens of the same PERSON entity are dropped, so each name
    # collapses to a single "UNKNOWN".

" ".join(updated_text)
    


In [None]:
# How to visualize the named entities using spaCy
from spacy import displacy

# Parse the text, then render it with the entity visualizer (style="ent")
# inline in the notebook.
doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# How to implement dependency parsing?
# The dep_ attribute of a spaCy token holds its dependency label in the sentence.
doc = nlp(text)

for tok in doc:
    print(tok.text, tok.dep_)


#> Mark nsubj
#> plays ROOT
#> volleyball dobj
#> every det
#> evening npadvmod
#> . punct

In [None]:
# How to find the head word of each word in a sentence?
# The head attribute of a token is the token it attaches to in the dependency
# parse; print each token next to its head.
text = "Mark plays volleyball. Sam is not into sports, he paints a lot"
doc = nlp(text)
for tok in doc:
    print(tok.text, tok.head.text)

#> Mark plays
#> plays plays
#> volleyball plays
#> . plays
#> Sam is
#> is paints
#> not is
#> into is
#> sports into
#> , paints
#> he paints
#> paints paints
#> a lot
#> lot paints

In [None]:
# How to build a text classifier with TextBlob?
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# NOTE(review): `train` and `test` are not defined in the cells visible here —
# confirm the labelled datasets are created before this cell runs.
# Training
cl = NaiveBayesClassifier(train)

# Classify some text
for sample in ("My favorite food is spring rolls", "It was a cold place for picnic"):
    print(cl.classify(sample))

# Printing accuracy of the classifier on the held-out data
print(f"Accuracy: {cl.accuracy(test)}")

In [None]:
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# Sample text
text = "He would also attend the opening ceremony for the construction of the US. Embassy complex in Cau Giay District, as well as meeting students, teachers and scientists at the Hanoi University of Science and Technology"

# Tokenize the text
words = word_tokenize(text)

# Keep only tokens of at least 4 characters, lowercased
filtered_words = [w.lower() for w in words if not len(w) < 4]

# Frequency distribution over the kept words
fdist = FreqDist(filtered_words)

# (word, count) pairs ordered from most to least frequent
sorted_words = fdist.most_common()

# Display the words in decreasing order of frequency
word_frequency_list = list(sorted_words)
print(word_frequency_list)