# General Concepts in SpaCy


In [None]:
# Import spaCy
import spacy

# Build the processing pipeline from the small English package.
nlp = spacy.load("en_core_web_sm")

# Two short sentences that serve as the running example.
text = (
    "This is an example sentence. "
    "SpaCy is great for NLP tasks."
)

# Run the pipeline over the text to obtain the annotated document.
doc = nlp(text)

# Tokenization
Tokenization is the process of splitting a text into individual words or tokens. When you process text with spaCy, tokenization happens automatically.

In [None]:
# Tokenization: report each token's text, part-of-speech tag and
# dependency label, one token per line.
for token in doc:
    tok_text, tok_pos, tok_dep = token.text, token.pos_, token.dep_
    print(f"Token: {tok_text}, POS: {tok_pos}, Dependency: {tok_dep}")

- **Token**: Each token represents an individual word or unit in the text.
- **POS (Part-of-Speech)**: It indicates the grammatical category of the token (e.g., noun, verb, adjective).
- **Dependency**: It describes the grammatical relationship between tokens (e.g., subject, object).

# Named Entity Recognition (NER)

In [None]:
# Named Entity Recognition (NER): list every entity span found in the
# document together with its label.
for ent in doc.ents:
    ent_text, ent_label = ent.text, ent.label_
    print(f"Entity: {ent_text}, Label: {ent_label}")

- **Entity**: This represents the recognized named entity (e.g., "SpaCy").
- **Label**: It categorizes the entity into predefined categories (e.g., `ORG` for organizations, `PERSON` for people).

# Sentence Detection

In [None]:
# Print the text of each sentence span exposed by the document.
sentence_spans = doc.sents
for sentence in sentence_spans:
    print(f"Sentence: {sentence.text}")

- **Sentence**: Each sentence is identified and printed separately.

# Stop Words

In [None]:
# Print only the tokens flagged as stop words; skip everything else.
for token in doc:
    if not token.is_stop:
        continue
    print(f"Stop Word: {token.text}")

- **Stop Word**: This indicates tokens that are identified as stop words.
- You can check if a token is a stop word using the `is_stop` attribute.

# Lemmatization

In [None]:
# Show each token's surface form next to its lemma.
for token in doc:
    surface_form, lemma_form = token.text, token.lemma_
    print(f"Token: {surface_form}, Lemma: {lemma_form}")

- **Token**: This represents the original word or token.
- **Lemma**: It represents the lemmatized form of the token.
- spaCy provides lemmatized forms of tokens with the `lemma_` attribute.

# Dependency Parsing

In [None]:
# For every token, print its dependency label and the text of the head
# token it attaches to.
for token in doc:
    head_token = token.head
    print(f"Token: {token.text}, Dependency: {token.dep_}, Head: {head_token.text}")

- **Token**: Each token in the document is processed.
- **Dependency**: This describes the grammatical relationship between the token and its head (e.g., subject, object).
- **Head**: It identifies the head token on which the current token depends (its syntactic parent).

# Custom NER

In [None]:
from spacy.matcher import Matcher

# Rule-based matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Single-token pattern: matches any token whose lowercase form is "spacy".
pattern = [{"LOWER": "spacy"}]
patterns = [pattern]
matcher.add("SpaCy", patterns)

# Each match is a (match_id, start, end) triple; start/end are token
# indices used to slice the matched span out of the document.
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Custom NER: {span.text}")

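- **Custom NER**: Phrases or terms matching the custom pattern are identified. Note that this uses spaCy's rule-based `Matcher` rather than the statistical NER component, so the matches are not added to `doc.ents`.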

# Word Vectors

In [None]:
# Run the pipeline on a one-word text. The result of calling `nlp(...)` is
# assigned to `word`, so `word.vector` below is read from that object
# (a Doc, as with `doc = nlp(text)` earlier), not from an individual token.
word = nlp("example")
# NOTE(review): `en_core_web_sm` reportedly ships without static word
# vectors, so this value may not be a true word embedding — confirm, or
# switch to a medium/large model if real vectors are required.
vector = word.vector
print(f"Word Vector: {vector}")

- **Word Vector**: This is a numerical representation of a word's meaning.
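- You can access the word vectors for tokens in a document using the `vector` attribute; a whole `Doc` (as in the example above) or a `Span` also exposes a `vector` attribute. Note that the small model (`en_core_web_sm`) does not include static word vectors; use a larger model such as `en_core_web_md` or `en_core_web_lg` for true word vectors.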

# Disable Components

In [None]:
# Turn off the "tagger" and "parser" components on the shared `nlp` pipeline.
# NOTE(review): spaCy v3 deprecates `disable_pipes` in favour of
# `nlp.select_pipes(disable=[...])` — confirm the targeted spaCy version.
nlp.disable_pipes("tagger", "parser")

- You can disable specific spaCy pipeline components, such as the tagger or parser, to improve processing speed or customize the pipeline for your specific needs.

# Adding Custom Components

In [None]:
from spacy.language import Language


# Register the function under a string name so the pipeline can look it up.
@Language.component("custom_component")
def custom_component(doc):
    """Pipeline component placeholder that returns the Doc unchanged.

    Args:
        doc: The document passed along the pipeline.

    Returns:
        The same document, so later components can keep processing it.
    """
    # Custom processing logic here
    return doc


# spaCy v3 requires the registered component name (a string) here; passing
# the callable itself raises an error. The component's pipeline name
# defaults to the registered name, so it is still "custom_component".
nlp.add_pipe("custom_component", last=True)

- You can add custom processing components to the spaCy pipeline to perform specialized tasks.
- Custom components allow you to extend spaCy's functionality for your unique requirements.