<a href="https://colab.research.google.com/github/MOHAMMEDASHRAF310/Courser-Test/blob/main/Assign_2_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Sample documents
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Data science involves analyzing data to extract insights.",
    "Blockchain technology is revolutionizing finance and beyond."
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [None]:
# Convert documents to TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

In [None]:
from sklearn.preprocessing import normalize

# Compute L2 normalization
tfidf_matrix_normalized = normalize(tfidf_matrix, norm='l2')


In [None]:
# Display unique words
unique_words = tfidf_vectorizer.get_feature_names_out()
print("Unique words:", unique_words)

# Display TF-IDF scores
print("TF-IDF scores:")

Unique words: ['.' 'analyzing' 'artificial' 'beyond' 'blockchain' 'data' 'extract'
 'finance' 'insight' 'intelligence' 'involves' 'learning' 'machine'
 'revolutionizing' 'science' 'subset' 'technology']
TF-IDF scores:


In [None]:
for i, doc in enumerate(documents):
    print("Document", i+1, ":")
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    for w, s in [(unique_words[i], s) for (i, s) in tfidf_scores]:
        print(w, ":", s)
    print()


Document 1 :
. : 0.25537359879528915
intelligence : 0.4323850887896905
artificial : 0.4323850887896905
subset : 0.4323850887896905
learning : 0.4323850887896905
machine : 0.4323850887896905

Document 2 :
insight : 0.32705547923242223
extract : 0.32705547923242223
analyzing : 0.32705547923242223
involves : 0.32705547923242223
science : 0.32705547923242223
data : 0.6541109584648445
. : 0.19316423462032448

Document 3 :
beyond : 0.4323850887896905
finance : 0.4323850887896905
revolutionizing : 0.4323850887896905
technology : 0.4323850887896905
blockchain : 0.4323850887896905
. : 0.25537359879528915



In [None]:
# Display normalized TF-IDF scores
print("Normalized TF-IDF scores:")
for i, doc in enumerate(documents):
    print("Document", i+1, ":")
    feature_index = tfidf_matrix_normalized[i,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix_normalized[i, x] for x in feature_index])
    for w, s in [(unique_words[i], s) for (i, s) in tfidf_scores]:
        print(w, ":", s)
    print()

Normalized TF-IDF scores:
Document 1 :
. : 0.2553735987952892
intelligence : 0.43238508878969056
artificial : 0.43238508878969056
subset : 0.43238508878969056
learning : 0.43238508878969056
machine : 0.43238508878969056

Document 2 :
insight : 0.32705547923242223
extract : 0.32705547923242223
analyzing : 0.32705547923242223
involves : 0.32705547923242223
science : 0.32705547923242223
data : 0.6541109584648445
. : 0.19316423462032448

Document 3 :
beyond : 0.4323850887896905
finance : 0.4323850887896905
revolutionizing : 0.4323850887896905
technology : 0.4323850887896905
blockchain : 0.4323850887896905
. : 0.25537359879528915

