In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import spacy


In [8]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [23]:

# Sample text
text = " I live in New York, America"

# Tokenization with NLTK
tokens = word_tokenize(text)
print("Tokens:", tokens)




Tokens: ['I', 'live', 'in', 'New', 'York', ',', 'America']


In [24]:
# Lemmatization with NLTK
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatized:", lemmatized)


Lemmatized: ['I', 'live', 'in', 'New', 'York', ',', 'America']


In [25]:
# Stemming with NLTK
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokens]
print("Stemmed:", stemmed)


Stemmed: ['i', 'live', 'in', 'new', 'york', ',', 'america']


In [26]:
# Removing stopwords
nltk.download('stopwords'),
filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
print("Without Stopwords:", filtered_tokens)



Without Stopwords: ['I', 'live', 'New', 'York', ',', 'America']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Named Entity Recognition with spaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print("Named Entities:", [(ent.text, ent.label_) for ent in doc.ents])

Named Entities: [('New York', 'GPE'), ('America', 'GPE')]


In [14]:
#Vectorization: Bag of Words and TF-IDF


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

texts = [
    "Natural language processing enables computers to understand human language.",
    "Machine learning is a core part of NLP.",
    "NLP is evolving with deep learning models."
]



# [Natural, Language, core ,Machine ....., models]

# sentence 1 : [1,2,0 ...... ]
# sentence 2 : [0,, 0, 1, 1 ]
# sentence 3 : [0, ]

# Bag of Words
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(texts)
print("Bag of Words:", bow_matrix.toarray())


Bag of Words: [[1 0 0 1 0 1 0 2 0 0 0 1 0 0 0 1 1 1 0]
 [0 1 0 0 0 0 1 0 1 1 0 0 1 1 1 0 0 0 0]
 [0 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1]]


In [15]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
print("TF-IDF:", tfidf_matrix.toarray())

TF-IDF: [[0.30151134 0.         0.         0.30151134 0.         0.30151134
  0.         0.60302269 0.         0.         0.         0.30151134
  0.         0.         0.         0.30151134 0.30151134 0.30151134
  0.        ]
 [0.         0.41756662 0.         0.         0.         0.
  0.31757018 0.         0.31757018 0.41756662 0.         0.
  0.31757018 0.41756662 0.41756662 0.         0.         0.
  0.        ]
 [0.         0.         0.41756662 0.         0.41756662 0.
  0.31757018 0.         0.31757018 0.         0.41756662 0.
  0.31757018 0.         0.         0.         0.         0.
  0.41756662]]


In [28]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import pipeline

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Sample text for sentiment analysis
texts = ["I love learning about NLP!", "This is a challenging task."]

# Convert to BERT format
tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="tf")

# Make predictions
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
results = nlp(texts)

for text, result in zip(texts, results):
    print(f"Text: {text}, Sentiment: {result['label']}, Score: {result['score']:.2f}")


#1 Model Scratch Train (GPU, Data )
#2 MOdel FineTune (Specific Data Configure)
#3 Model Use


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: I love learning about NLP!, Sentiment: LABEL_0, Score: 0.57
Text: This is a challenging task., Sentiment: LABEL_0, Score: 0.57
