# Natural Language Processing Tutorials

This notebook walks through basic NLP tasks step by step.

In [None]:
!pip install nltk scikit-learn transformers spacy

In [None]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from transformers import pipeline
import spacy
from spacy.cli import download

## 0. Tokenize text with NLTK

Tokenization splits raw text into individual tokens, such as words and punctuation marks.

In [None]:
# Recent NLTK releases load the 'punkt_tab' tables for word_tokenize, while older
# releases use the pickled 'punkt' model. Fetch both so the cell runs on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

text = 'Natural language processing with Python is fun!'
tokens = nltk.word_tokenize(text)
print(tokens)

## 1. Bag-of-words representation

Count how often each word appears in each document of a small collection.

In [None]:
docs = ['I love coding in Python', 'Python can be used for NLP']
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary from docs and returns the per-document counts.
bag = vectorizer.fit_transform(docs)
print('Vocabulary:', vectorizer.vocabulary_)
# The line break has to be the \n escape: a raw newline inside a single-quoted
# string literal is a syntax error and stops the whole cell from running.
print('Bag-of-words matrix:\n', bag.toarray())

## 2. Train a text classifier

Train a logistic regression model on two newsgroup categories.

In [None]:
# Two-class subset of 20 Newsgroups; headers, footers and quoted replies are
# removed from every post in both splits.
categories = ['rec.sport.baseball', 'sci.space']
fetch_kwargs = dict(categories=categories, remove=('headers', 'footers', 'quotes'))
train = fetch_20newsgroups(subset='train', **fetch_kwargs)
test = fetch_20newsgroups(subset='test', **fetch_kwargs)

# Bag-of-words counts feeding a logistic regression, wrapped in a single pipeline.
clf = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
preds = clf.fit(train.data, train.target).predict(test.data)

# Per-class precision / recall / F1 on the test split.
report = classification_report(test.target, preds, target_names=test.target_names)
print(report)

## 3. Sentiment analysis with transformers

Use a pretrained model from Hugging Face to classify the sentiment of a sentence.

In [None]:
# Name the checkpoint explicitly. A bare pipeline('sentiment-analysis') falls back
# to a library-chosen default and warns about it, so results could change between
# transformers releases. This id is the checkpoint the library currently documents
# as the default for the task, so the output stays the same.
SENTIMENT_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment = pipeline('sentiment-analysis', model=SENTIMENT_MODEL)

# The pipeline returns one dict per input sentence; take the only one.
result = sentiment('I love using transformers for NLP!')[0]
print(f"Label: {result['label']}, score: {result['score']:.3f}")

## 4. Named entity recognition with spaCy

Recognize named entities such as people, organizations, and locations in text.

In [None]:
NER_MODEL = 'en_core_web_sm'
try:
    nlp = spacy.load(NER_MODEL)
except OSError:
    # Loading failed with OSError: download the model package, then load again.
    download(NER_MODEL)
    nlp = spacy.load(NER_MODEL)

# Run the pipeline on one sentence and list each detected entity with its label.
doc = nlp('Apple was founded by Steve Jobs in California.')
for entity in doc.ents:
    print(entity.text, entity.label_)

## 5. Word embeddings and similarity

Word vectors allow us to compare semantic similarity between words.

In [None]:
EMBEDDING_MODEL = 'en_core_web_md'
try:
    nlp_md = spacy.load(EMBEDDING_MODEL)
except OSError:
    # Loading failed with OSError: download the model package, then load again.
    download(EMBEDDING_MODEL)
    nlp_md = spacy.load(EMBEDDING_MODEL)

tokens = nlp_md('dog cat banana')

# Every ordered pair of tokens, including each token paired with itself.
pairs = [(left, right) for left in tokens for right in tokens]
for left, right in pairs:
    score = left.similarity(right)
    print(f'Similarity({left.text}, {right.text}) = {score:.3f}')