In [1]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install scikit-learn gensim nltk pandas


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
"""
NLP Techniques Demo:
- Bag of Words
- TF-IDF
- Naive Bayes classifier
- Word2Vec using Gensim

Collaboration-friendly structure:
- Functions separated by task
- Clear comments
- Easy dataset swap
"""

import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec

# Download the NLTK tokenizer data packages; nltk.word_tokenize is called
# later in train_word2vec.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# -------------------------------------------------
# Sample dataset (replace with real dataset anytime)
# -------------------------------------------------
def load_sample_data():
    """Build the small in-memory demo corpus.

    Returns:
        pandas.DataFrame: six rows with two columns, ``text`` (one short
        sentence per row) and ``label`` (``"pos"`` or ``"neg"``).
    """
    # Each sentence sits next to its label so the pairing is easy to audit.
    samples = [
        ("I love machine learning", "pos"),
        ("AI is amazing", "pos"),
        ("I hate bugs in code", "neg"),
        ("Debugging is frustrating", "neg"),
        ("Machine learning is fun", "pos"),
        ("AI will shape future", "pos"),
    ]
    return pd.DataFrame(samples, columns=["text", "label"])


# -------------------------------------------------
# BAG OF WORDS
# -------------------------------------------------
def bag_of_words(corpus):
    """Print the bag-of-words vocabulary and count matrix for ``corpus``.

    Args:
        corpus: Iterable of raw text documents, passed unchanged to
            ``CountVectorizer.fit_transform``.

    Returns:
        None. The vocabulary and matrix are written to stdout only.
    """
    print("\n===== BAG OF WORDS =====")

    bow = CountVectorizer()
    counts = bow.fit_transform(corpus)

    # sep="\n" puts each heading on its own line, directly above its payload.
    print("Vocabulary:", bow.get_feature_names_out(), sep="\n")
    print("\nBoW Matrix:", counts.toarray(), sep="\n")


# -------------------------------------------------
# TF-IDF
# -------------------------------------------------
def tfidf_features(corpus):
    """Print the TF-IDF vocabulary and weight matrix for ``corpus``.

    Args:
        corpus: Iterable of raw text documents, passed unchanged to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        None. The vocabulary and matrix are written to stdout only.
    """
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(corpus)
    vocabulary = tfidf.get_feature_names_out()

    # Fitting prints nothing itself, so all output is emitted together here.
    print("\n===== TF-IDF =====")
    print("Vocabulary:")
    print(vocabulary)
    print("\nTF-IDF Matrix:")
    print(weights.toarray())


# -------------------------------------------------
# NAIVE BAYES CLASSIFIER
# -------------------------------------------------
def naive_bayes_classifier(texts, labels, test_sentences=None):
    """Fit a TF-IDF + Multinomial Naive Bayes pipeline and print predictions.

    Args:
        texts: Training documents (iterable of strings).
        labels: Class labels, one per entry in ``texts``.
        test_sentences: Optional iterable of sentences to classify once the
            model is fitted. When omitted, two built-in demo sentences are
            used, so existing two-argument calls behave as before.

    Returns:
        None. Each ``sentence -> predicted label`` pair is printed to stdout.
    """
    print("\n===== NAIVE BAYES TEXT CLASSIFIER =====")

    if test_sentences is None:
        test_sentences = [
            "AI is powerful",
            "I hate errors"
        ]
    else:
        # Materialise once so generators can be both predicted on and printed.
        test_sentences = list(test_sentences)

    # Pipeline combines TF-IDF + Naive Bayes, so raw strings can be passed to
    # both fit() and predict().
    model = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("nb", MultinomialNB())
    ])

    model.fit(texts, labels)

    # Nothing to classify: skip predict(), which rejects an empty batch.
    if not test_sentences:
        return

    predictions = model.predict(test_sentences)

    for sent, pred in zip(test_sentences, predictions):
        print(f"{sent} -> {pred}")


# -------------------------------------------------
# WORD2VEC USING GENSIM
# -------------------------------------------------
def train_word2vec(corpus, query_word="ai", similar_to="machine", seed=42):
    """Train a small Word2Vec model on ``corpus`` and print two lookups.

    Args:
        corpus: Iterable of raw text strings. Each one is lower-cased and
            tokenised with ``nltk.word_tokenize``.
        query_word: Word whose embedding vector is printed.
        similar_to: Word whose nearest neighbours are printed.
        seed: Random seed passed to ``Word2Vec`` so repeated runs print the
            same numbers.

    Returns:
        None. Results are printed to stdout. A word that is missing from the
        trained vocabulary produces a message instead of a ``KeyError``.
    """
    print("\n===== WORD2VEC (GENSIM) =====")

    tokenized = [nltk.word_tokenize(text.lower()) for text in corpus]

    # A fixed seed plus a single worker thread keeps training reproducible;
    # with several workers, thread scheduling changes the update order.
    # NOTE(review): gensim documents that reproducibility across interpreter
    # launches may additionally need PYTHONHASHSEED to be set.
    model = Word2Vec(
        sentences=tokenized,
        vector_size=50,
        window=3,
        min_count=1,
        seed=seed,
        workers=1
    )

    # Training tokens were lower-cased, so the lookup keys must be too.
    query_word = query_word.lower()
    similar_to = similar_to.lower()
    vocab = model.wv.key_to_index

    print(f"Vector for '{query_word}':")
    if query_word in vocab:
        print(model.wv[query_word])
    else:
        print(f"'{query_word}' is not in the trained vocabulary")

    print(f"\nMost similar words to '{similar_to}':")
    if similar_to in vocab:
        print(model.wv.most_similar(similar_to))
    else:
        print(f"'{similar_to}' is not in the trained vocabulary")


# -------------------------------------------------
# MAIN EXECUTION
# -------------------------------------------------
if __name__ == "__main__":
    # Load the demo frame and split it into the text and label columns.
    df = load_sample_data()
    corpus, labels = df["text"], df["label"]

    # Feature-extraction demos: these only need the raw text.
    bag_of_words(corpus)
    tfidf_features(corpus)

    # Supervised demo: needs the labels as well.
    naive_bayes_classifier(corpus, labels)

    # Embedding demo: trains on the same raw text.
    train_word2vec(corpus)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



===== BAG OF WORDS =====
Vocabulary:
['ai' 'amazing' 'bugs' 'code' 'debugging' 'frustrating' 'fun' 'future'
 'hate' 'in' 'is' 'learning' 'love' 'machine' 'shape' 'will']

BoW Matrix:
[[0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0]
 [1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0]
 [1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1]]

===== TF-IDF =====
Vocabulary:
['ai' 'amazing' 'bugs' 'code' 'debugging' 'frustrating' 'fun' 'future'
 'hate' 'in' 'is' 'learning' 'love' 'machine' 'shape' 'will']

TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.5355058
  0.65304446 0.5355058  0.         0.        ]
 [0.55902156 0.68172171 0.         0.         0.         0.
  0.         0.         0.         0.         0.47196441 0.
  0.         0.         0.         0.        ]
 [0.         0.         0.5        0.5        0.         0.
  0.         0.        