In [None]:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy

# Load the small English pipeline (tokenizer, tagger, parser and NER)
nlp = spacy.load("en_core_web_sm")

# Sample passage, kept as short fragments and joined into a single string
text = "".join([
    "When Sebastian Thrun started working on self-driving cars at ",
    "Google in 2007, few people outside of the company took him ",
    "seriously. “I can tell you very senior CEOs of major American ",
    "car companies would shake my hand and turn away because I wasn’t ",
    "worth talking to,” said Thrun, in an interview with Recode earlier ",
    "this week.",
])

# Run the pipeline over the whole passage; the cells below read from `doc`
doc = nlp(text)



In [None]:
# Show every token next to its coarse-grained part-of-speech tag
for tok in doc:
    print(tok.text, tok.pos_)

When SCONJ
Sebastian PROPN
Thrun PROPN
started VERB
working VERB
on ADP
self NOUN
- PUNCT
driving VERB
cars NOUN
at ADP
Google PROPN
in ADP
2007 NUM
, PUNCT
few ADJ
people NOUN
outside ADP
of ADP
the DET
company NOUN
took VERB
him PRON
seriously ADV
. PUNCT
“ PUNCT
I PRON
can AUX
tell VERB
you PRON
very ADV
senior ADJ
CEOs NOUN
of ADP
major ADJ
American ADJ
car NOUN
companies NOUN
would AUX
shake VERB
my PRON
hand NOUN
and CCONJ
turn VERB
away ADV
because SCONJ
I PRON
was AUX
n’t PART
worth ADJ
talking VERB
to ADP
, PUNCT
” PUNCT
said VERB
Thrun PROPN
, PUNCT
in ADP
an DET
interview NOUN
with ADP
Recode PROPN
earlier ADV
this DET
week NOUN
. PUNCT


In [None]:
# Helper: list the named entities of a processed document
def show_ents(doc):
    """Print each entity in ``doc.ents`` as "<text> <label> <label description>".

    The description comes from ``spacy.explain``. When ``doc.ents`` is empty,
    a single "No entities found." line is printed instead.
    """
    if not doc.ents:
        print("No entities found.")
        return

    for entity in doc.ents:
        label = entity.label_
        print(entity.text, label, spacy.explain(label))

# List the entities recognized in the sample passage
show_ents(doc)

Sebastian Thrun PERSON People, including fictional
Google ORG Companies, agencies, institutions, etc.
2007 DATE Absolute or relative dates or periods
American NORP Nationalities or religious or political groups
Thrun GPE Countries, cities, states
Recode ORG Companies, agencies, institutions, etc.
earlier this week DATE Absolute or relative dates or periods


In [None]:
# Tabulate every token: text, coarse POS, fine-grained tag and the tag's description
tag_rows = (
    f"{token.text:<12} {token.pos_:<10} {token.tag_:<10} {spacy.explain(token.tag_)}"
    for token in doc
)
for row in tag_rows:
    print(row)

When         SCONJ      WRB        wh-adverb
Sebastian    PROPN      NNP        noun, proper singular
Thrun        PROPN      NNP        noun, proper singular
started      VERB       VBD        verb, past tense
working      VERB       VBG        verb, gerund or present participle
on           ADP        IN         conjunction, subordinating or preposition
self         NOUN       NN         noun, singular or mass
-            PUNCT      HYPH       punctuation mark, hyphen
driving      VERB       VBG        verb, gerund or present participle
cars         NOUN       NNS        noun, plural
at           ADP        IN         conjunction, subordinating or preposition
Google       PROPN      NNP        noun, proper singular
in           ADP        IN         conjunction, subordinating or preposition
2007         NUM        CD         cardinal number
,            PUNCT      ,          punctuation mark, comma
few          ADJ        JJ         adjective (English), other noun-modifier (Chinese)

NER

In [None]:
# Reuse the show_ents helper defined earlier instead of repeating its loop;
# it prints "<text> <label> <description>" per entity and reports when the
# document has no entities at all.
show_ents(doc)


Sebastian Thrun PERSON People, including fictional
Google ORG Companies, agencies, institutions, etc.
2007 DATE Absolute or relative dates or periods
American NORP Nationalities or religious or political groups
Thrun GPE Countries, cities, states
Recode ORG Companies, agencies, institutions, etc.
earlier this week DATE Absolute or relative dates or periods


In [None]:
from spacy import displacy
# Draw the entity highlights inline in the notebook
displacy.render(doc, style="ent", jupyter=True)

Sentence segmentation

In [None]:
# Print each detected sentence on its own line
for sentence in doc.sents:
    print(sentence.text)

When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.
“I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.


Bag of Words (BoW)

In [None]:
from collections import Counter

In [None]:
# Count lower-cased token texts, skipping stop words and punctuation
bow = Counter(
    token.text.lower()
    for token in doc
    if not (token.is_stop or token.is_punct)
)
print(bow)


Counter({'thrun': 2, 'sebastian': 1, 'started': 1, 'working': 1, 'self': 1, 'driving': 1, 'cars': 1, 'google': 1, '2007': 1, 'people': 1, 'outside': 1, 'company': 1, 'took': 1, 'seriously': 1, 'tell': 1, 'senior': 1, 'ceos': 1, 'major': 1, 'american': 1, 'car': 1, 'companies': 1, 'shake': 1, 'hand': 1, 'turn': 1, 'away': 1, 'worth': 1, 'talking': 1, 'said': 1, 'interview': 1, 'recode': 1, 'earlier': 1, 'week': 1})


In [None]:
# Vocabulary: the distinct words, in the order the counter first saw them
vocab = list(bow)

# Count vector, aligned position-by-position with `vocab`
vector = list(bow.values())

print("Vocabulary:", vocab)
print("Vector:", vector)


Vocabulary: ['sebastian', 'thrun', 'started', 'working', 'self', 'driving', 'cars', 'google', '2007', 'people', 'outside', 'company', 'took', 'seriously', 'tell', 'senior', 'ceos', 'major', 'american', 'car', 'companies', 'shake', 'hand', 'turn', 'away', 'worth', 'talking', 'said', 'interview', 'recode', 'earlier', 'week']
Vector: [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


TF-IDF

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy
# NOTE(review): `nlp` was already created from the same model in the first
# cell, so on a top-to-bottom run this reload is redundant; it only matters
# when this section is executed on its own.
nlp = spacy.load("en_core_web_sm")

# Example corpus: three one-sentence documents
corpus = [
    "Apple is looking at buying a U.K. startup.",
    "Apple will invest $1 billion in the startup.",
    "Google is launching a new product next year."
]

# Custom tokenizer using spaCy
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the module-level ``nlp`` pipeline.

    Stop words, punctuation and whitespace-only tokens are dropped. Whitespace
    tokens (e.g. from double spaces or newlines) are neither stop words nor
    punctuation, so without the explicit ``is_space`` check they would end up
    as vocabulary entries.

    Args:
        sentence: Raw text of one document.

    Returns:
        list[str]: Lower-cased lemma of every remaining token, in document order.
    """
    return [
        token.lemma_.lower()
        for token in nlp(sentence)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

# Create TF-IDF vectorizer
# - token_pattern=None: the default regex is never used once a custom
#   tokenizer is supplied; passing None avoids scikit-learn's
#   "token_pattern will not be used" warning.
# - lowercase=False: by default the vectorizer lower-cases the raw text
#   *before* calling the tokenizer, so spaCy would only ever see lower-cased
#   input. spacy_tokenizer already lower-cases the lemmas it returns, so the
#   vocabulary stays case-insensitive.
# Re-run this cell after the change: features that depend on the original
# casing may differ from output produced with the default settings.
vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    token_pattern=None,
    lowercase=False,
)

# Learn the vocabulary and IDF weights, then encode the corpus (one row per document)
X = vectorizer.fit_transform(corpus)

print("Feature names (vocabulary):")
print(vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(X.toarray())


Feature names (vocabulary):
['$' '1' 'apple' 'billion' 'buy' 'google' 'invest' 'launch' 'look' 'new'
 'product' 'startup' 'u.k' 'year']

TF-IDF Matrix:
[[0.         0.         0.37302199 0.         0.49047908 0.
  0.         0.         0.49047908 0.         0.         0.37302199
  0.49047908 0.        ]
 [0.44036207 0.44036207 0.3349067  0.44036207 0.         0.
  0.44036207 0.         0.         0.         0.         0.3349067
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.4472136
  0.         0.4472136  0.         0.4472136  0.4472136  0.
  0.         0.4472136 ]]




**Class 3**

**EMBEDDINGS**

1️⃣ Word Embeddings range (–1 to 1?)

In models like Word2Vec, GloVe, FastText, each word is mapped to a fixed-length vector (say 100, 200, or 300 dimensions).

These numbers are not restricted to –1 to 1. They can be any real numbers (e.g., –5.2, 0.34, 2.7).

But many implementations L2-normalize each vector to unit length so that cosine similarity reduces to a plain dot product. After that normalization, every component is guaranteed to lie between –1 and 1.

2️⃣ Sentence Embeddings range

A sentence embedding is just a vector representation of a full sentence (instead of a single word).

Like word embeddings, sentence embeddings also contain real-valued numbers, not strictly bounded.

Depending on the model (e.g., Sentence-BERT), values can be any real number; if the embedding is normalized to unit length, every value lies between –1 and 1.

3️⃣ What are the “columns” in an embedding?

Think of an embedding as a matrix:

Each row = a word (or a sentence/document).

Each column (dimension) = a feature learned by the model.

**Example of sentence embeddings**

🔎 Step 1: Input Example Sentences

Let’s take two pairs of sentences:

Pair 1 (similar meaning):

S1: “I love playing football.”

S2: “I enjoy playing soccer.”

Pair 2 (different meaning):

S3: “The sun is shining today.”

S4: “I cooked pasta for dinner.”

---

🔎 Step 2: How embeddings are generated
Using a transformer model (e.g., Sentence-BERT):

Tokenization:
Each sentence is split into tokens (words or subwords).
Example: “I love playing football” → [CLS], I, love, playing, football, [SEP]

Model Encoding:
Tokens pass through a transformer (BERT-like). Each token gets a contextual vector (768D for BERT-base).

Pooling:
To create a single sentence embedding, we pool token embeddings:

Often by taking the [CLS] token representation, or

By averaging all token embeddings.

👉 Result: A fixed-length vector (e.g., 768 numbers) for the entire sentence.

Example (numbers are illustrative, not real):

"I love playing football" → [0.12, -0.34, 0.87, ..., 0.45]   (768D)
"I enjoy playing soccer"  → [0.10, -0.30, 0.85, ..., 0.50]   (768D)

---

🔎 Step 3: How similarity is checked

The most common way = Cosine Similarity:

Dot product of two vectors, divided by product of their magnitudes.

Range = –1 to 1.

1 → perfectly similar (same direction)

0 → unrelated (orthogonal)

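–1 → opposite direction (the vectors point exactly opposite ways; rarely seen in practice, even for sentences with opposite meanings)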

---

🔎 Step 4: Example Results

Suppose we compute cosine similarity:

S1 vs S2 (football vs soccer) → 0.92 ✅ (high similarity)

S1 vs S3 (football vs sun) → 0.20 ❌ (low similarity)

S3 vs S4 (sun vs pasta) → 0.05 ❌ (almost unrelated)

**Step 2 (How embeddings are generated)**

🔹 1. Tokenization

Transformer models (like BERT) can’t directly take raw text.

They split text into tokens (words or subwords).

Special tokens are added:

[CLS] = signals start of sentence (used for classification / embedding).

[SEP] = signals end of sentence.

Example:

"I love playing football"
→ [CLS], "I", "love", "playing", "football", [SEP]


---


🔹 2. Model Encoding

Each token is converted into a vector embedding (initially from a lookup table).

These embeddings go through multiple transformer layers (self-attention, feed-forward).

At the end, each token has a contextual vector (e.g., 768 dimensions for BERT-base).

Example (shortened vectors, just for illustration):

"I"        → [0.12, -0.34, ..., 0.22]  
"love"     → [0.67,  0.10, ..., -0.45]  
"playing"  → [0.21,  0.98, ...,  0.11]  
"football" → [0.55, -0.29, ...,  0.90]  


---


🔹 3. Pooling (combine into 1 sentence vector)

Now we need one embedding for the entire sentence. Common methods:

[CLS] token:
Take the vector for [CLS] (trained to summarize the sentence).

Mean pooling:
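Average all token embeddings (padding tokens are ignored; whether [CLS] and [SEP] are included depends on the implementation — Sentence-Transformers includes them).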

Example (mean pooling, simplified to 3D just to see numbers):

"I"        → [0.1, -0.3, 0.2]  
"love"     → [0.6,  0.1, -0.4]  
"playing"  → [0.2,  1.0,  0.1]  
"football" → [0.5, -0.2, 0.9]  

Mean = [(0.1+0.6+0.2+0.5)/4, (-0.3+0.1+1.0-0.2)/4, (0.2-0.4+0.1+0.9)/4]  
     = [0.35, 0.15, 0.2]


So the sentence embedding = [0.35, 0.15, 0.2] (in reality → 768D or 1536D).


---


🔹 4. Final Result

Each sentence becomes a fixed-length vector (e.g., 768 numbers for BERT-base).

Different sentences with similar meaning → embeddings that are close together in vector space.

✅ So the pipeline is:
Sentence → Tokens → Contextual embeddings for each token → Pooling → Sentence embedding (vector).

                                 ----
                                 

**Sentiment Analysis**


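For sentiment analysis we use NLTK rather than spaCy: NLTK ships the ready-made, rule-based VADER analyzer, whereas spaCy's standard pipelines have no sentiment component.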

In [None]:
!pip install nltk



In [None]:
import nltk
# Fetch the VADER lexicon resource used by the sentiment analyzer below
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Build the VADER-based analyzer
sia = SentimentIntensityAnalyzer()

# One clearly positive and one clearly negative example
text1 = "I love this product! It's amazing and works perfectly."
text2 = "This is the worst experience I've ever had."

# Print the neg/neu/pos/compound scores for each example
for sample in (text1, text2):
    print(sia.polarity_scores(sample))


{'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.9259}
{'neg': 0.369, 'neu': 0.631, 'pos': 0.0, 'compound': -0.6249}


neg → Negative sentiment score

neu → Neutral sentiment score

pos → Positive sentiment score

compound → Normalized, overall sentiment score (ranges from -1 to +1)

📌 How to interpret compound score

compound >= 0.05 → Positive

compound <= -0.05 → Negative

Otherwise → Neutral

🚀 Let’s break down how **VADER (Valence Aware Dictionary and sEntiment Reasoner)** works internally. It’s a **lexicon + rule–based sentiment analyzer** specially tuned for social media, reviews, and informal text.

---

## ⚙️ How VADER Works Internally

### 1. **Sentiment Lexicon**

* At its core, VADER has a **dictionary of \~7,500 words/phrases**.
* Each entry has a **valence score** ranging from **-4 (most negative)** to **+4 (most positive)**.
* Examples (approximate scores; the exact values are in the lexicon file):

  * "excellent" → +3.1
  * "horrible" → -2.5
  * "meh" → -0.2
  * "love" → +3.2
  * "hate" → -3.0

---

### 2. **Text Preprocessing**

* Input text is tokenized (split into words, emoticons, punctuation).
* VADER keeps emoticons, emojis, punctuation, and capitalization — because they carry sentiment (unlike normal text cleaning).

Example:

* "I LOVE it!!! 😍😍" → tokens: \[I, LOVE, it, !!!, 😍, 😍]

---

### 3. **Valence Scoring per Token**

* Each token is checked against the lexicon.
* If found, its score is retrieved.
* If not found, score = 0.

---

### 4. **Heuristic Adjustments (Rules)**

VADER doesn’t just sum up words — it applies **rules** to capture sentiment intensity:

#### 🔹 (a) **Degree modifiers (intensifiers & dampeners)**

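Words like "very", "extremely", "slightly" adjust sentiment strength. VADER does this by adding or subtracting a fixed increment (≈0.293) rather than multiplying the score.

* "very good" → "good" (+2) + 0.293 ≈ +2.3
* "slightly bad" → "bad" (-2) + 0.293 ≈ -1.7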

#### 🔹 (b) **Negation handling**

Words like "not", "never", "isn't" flip/soften the sentiment.

* "good" → +2
* "not good" → \~ -1.5

#### 🔹 (c) **Punctuation**

* Exclamation marks **boost intensity** (but capped).

  * "good" → +2
  * "good!!" → +2.5
* Question marks sometimes increase emphasis.

#### 🔹 (d) **Capitalization**

* UPPERCASE words are more intense.

  * "good" → +2
  * "GOOD" → +2.5

#### 🔹 (e) **Conjunction “but”**

* Text after "but" gets more weight.

  * "The movie was boring, BUT the acting was great."
    → "boring" effect reduced, "great" boosted.

#### 🔹 (f) **Emoticons & emojis**

* 😊 (+2), 😡 (-2.5), ❤️ (+3), etc.

---

### 5. **Aggregating Scores**

* After adjusting, VADER sums the valence of all tokens.
* It normalizes the score to a range between **-1 and +1** using this formula:

$$
\text{compound} = \frac{x}{\sqrt{x^2 + \alpha}}, \qquad x = \text{sum of the adjusted valence scores}
$$

where **α = 15**, a normalization constant to keep values in (-1,1).

---

### 6. **Final Output**

VADER returns a dictionary with **4 scores**:

```python
{'neg': 0.xx, 'neu': 0.xx, 'pos': 0.xx, 'compound': -0.6789}
```

* **pos/neu/neg** = proportion of text that falls into each sentiment.
* **compound** = single normalized score in \[-1,1].

  * ≥ 0.05 → Positive
  * ≤ -0.05 → Negative
  * Otherwise → Neutral