In [28]:
# Installations (Run this cell first)
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install nltk scikit-learn textblob



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [29]:
import nltk
nltk.download('punkt')  # Download the tokenizer model


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Importing libraries
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import spacy
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree

In [31]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [32]:
# Initialize Spacy English model
nlp = spacy.load('en_core_web_sm')



In [34]:
# Function to tokenize, remove stopwords, and lemmatize text
def preprocess_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])

In [35]:
# Example text
default_text = "I love him"

In [36]:
# Preprocess text
processed_text = preprocess_text(default_text)
print(f"\nProcessed Text: {processed_text}")


Processed Text: love


In [37]:
# Bag of Words (BoW)
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform([processed_text]).toarray()
print("\nBag of Words (BoW):", bow)


Bag of Words (BoW): [[1]]


In [38]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform([processed_text]).toarray()
print("\nTF-IDF:", tfidf)


TF-IDF: [[1.]]


In [39]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [40]:
import nltk
nltk.download('punkt_tab')
tokens = word_tokenize(processed_text)   # Tokenizes the text into words
import nltk
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = pos_tag(tokens)               # Tags each word with its Part-of-Speech

print("\nPart-of-Speech Tags:", pos_tags)




Part-of-Speech Tags: [('love', 'NN')]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [41]:
import nltk
nltk.download('maxent_ne_chunker_tab')
# Named Entity Recognition (NER)
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

ner = get_continuous_chunks(default_text)
print("\nNamed Entities:", ner)

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!



Named Entities: []


In [42]:
# Subjectivity Analysis
blob = TextBlob(default_text)
print("\nSubjectivity:", blob.sentiment.subjectivity)


Subjectivity: 0.6


In [43]:
# Polarity Analysis
print("Polarity:", blob.sentiment.polarity)

Polarity: 0.5
