In [41]:
!pip install nltk



In [42]:
import nltk

<code>
Tokenization
</code>

In [43]:
sentence = "Hello! How are you doing today? I hope you're doing fine."
sentence_token = nltk.sent_tokenize(sentence)
word_token = nltk.word_tokenize(sentence)

sentence_token , word_token

(['Hello!', 'How are you doing today?', "I hope you're doing fine."],
 ['Hello',
  '!',
  'How',
  'are',
  'you',
  'doing',
  'today',
  '?',
  'I',
  'hope',
  'you',
  "'re",
  'doing',
  'fine',
  '.'])

<code>stopwords</code>

In [None]:
from nltk.corpus import stopwords

english_stopwords = stopwords.words("english")
filtered_sentence = [ word for word in word_token if word not in english_stopwords]
filtered_sentence

['Hello', '!', 'How', 'today', '?', 'I', 'hope', "'re", 'fine', '.']

<code>stemming</code>

In [45]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

words = ["Running" , "jumped"]

for word in words : 
    stemmed_word = ps.stem(word)
    print(f"{word} -> {stemmed_word}")

Running -> run
jumped -> jump


<code>Lemmatization vs Stemming</code>

In [46]:
from nltk.stem import WordNetLemmatizer , PorterStemmer
from nltk import word_tokenize

lemmatizer = WordNetLemmatizer()

sentence = "I've used the computer to get informations"



filtered_sentence = [ lemmatizer.lemmatize(word) for word in word_tokenize(sentence)]
print(filtered_sentence)

filtered_sentence = [ ps.stem(word) for word in word_tokenize(sentence)]
filtered_sentence

['I', "'ve", 'used', 'the', 'computer', 'to', 'get', 'information']


['i', "'ve", 'use', 'the', 'comput', 'to', 'get', 'inform']

<code>Regular Expression</code>

In [47]:
import re


txt = """
Elon musk's phone number is 9991116666, call him if you have any questions on dodgecoin
Tesla's revenue is 40 billion
Tesla's CFO number (999)-333-7777
"""
pattern = r'\(\d{3}\)-\d{3}-\d{4}|\d{10}'
cleaned = re.findall(pattern , txt)



In [48]:
txt = """

Note 1 - Overview
Note 2 - Summary of Significant Accounting Policies

"""

pattern = r'Note \d - [^\n]*'
cleaned = re.findall(pattern , txt)

cleaned

['Note 1 - Overview', 'Note 2 - Summary of Significant Accounting Policies']

In [49]:
txt =""" 
    The gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion
    In previous quarter i.e. fy2020 Q4 it was $3 billion fy2030 q1
"""

pattern = r'fy\d{4} Q[1-4]'
cleaned = re.findall(pattern , txt , flags=re.IGNORECASE)

cleaned


['FY2021 Q1', 'fy2020 Q4', 'fy2030 q1']

In [50]:
import re
text = '''
Follow our leader Elon musk on twitter here: https://twitter.com/elonmusk
, more information 
on Tesla's products can be found at https://www.tesla.com/. Also here are leading influencers 
for tesla related news,
https://twitter.com/teslarati
https://twitter.com/dummy_tesla
https://twitter.com/dummy_2_tesla
'''
z = r'https:\/\/twitter.com\/\w[^\n]*' # todo: type your regex here

re.findall(z, text)

['https://twitter.com/elonmusk',
 'https://twitter.com/teslarati',
 'https://twitter.com/dummy_tesla',
 'https://twitter.com/dummy_2_tesla']

In [53]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

<code>POS - part of speech</code>

In [None]:
from nltk import pos_tag

txt = word_tokenize("Mashary loves coding!")

result = pos_tag(txt)
result

[('Mashary', 'JJ'), ('loves', 'NNS'), ('coding', 'VBG'), ('!', '.')]

In [None]:
!python -m spacy download en_core_web_sm

In [94]:
import pandas as pd
import spacy
from tabulate import tabulate
# Load your spaCy model
nlp = spacy.load("en_core_web_sm")

# Assuming 'result' is a spaCy Doc object
data = []
for token in result:
    data.append({
        'token': token.text,
        'pos': token.pos_,
        'pos_explanation': spacy.explain(token.pos_)
    })

# Create a single DataFrame
df = pd.DataFrame(data)
print(df)

     token    pos pos_explanation
0  Mashary  PROPN     proper noun
1    Loves   VERB            verb
2   Coding  PROPN     proper noun
