In [1]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer package before tokenizing.
nltk.download('punkt')

# Break one example sentence into word and punctuation tokens.
text = "This is a sample sentence for tokenization."
tokens = word_tokenize(text)
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['This', 'is', 'a', 'sample', 'sentence', 'for', 'tokenization', '.']


In [2]:
from nltk.corpus import stopwords

# Fetch the stop-word lists used below.
nltk.download('stopwords')

# A set gives constant-time membership checks during filtering.
stop_words = set(stopwords.words('english'))

# Keep every token whose lower-cased form is not an English stop word.
# The original casing of the surviving tokens is preserved.
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print(filtered_tokens)

['sample', 'sentence', 'tokenization', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from nltk.stem import PorterStemmer

# Reduce each token to its Porter stem (applied to the unfiltered tokens).
porter = PorterStemmer()
stemmed_tokens = list(map(porter.stem, tokens))
print(stemmed_tokens)

['thi', 'is', 'a', 'sampl', 'sentenc', 'for', 'token', '.']


In [5]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data before lemmatizing.
nltk.download('wordnet')

# Lemmatize every token with the lemmatizer's default settings
# (no part-of-speech tag is supplied).
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print(lemmatized_tokens)

[nltk_data] Downloading package wordnet to /root/nltk_data...


['This', 'is', 'a', 'sample', 'sentence', 'for', 'tokenization', '.']


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short example documents.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)

# Convert the result to a dense array so the counts are visible when printed.
print(bow_matrix.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same three example documents; note this rebinds `corpus` and `vectorizer`.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Learn the vocabulary and compute the TF-IDF weighted matrix in one step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Convert the result to a dense array so the weights are visible when printed.
print(tfidf_matrix.toarray())

[[0.         0.46941728 0.61722732 0.3645444  0.         0.
  0.3645444  0.         0.3645444 ]
 [0.         0.7284449  0.         0.28285122 0.         0.47890875
  0.28285122 0.         0.28285122]
 [0.49711994 0.         0.         0.29360705 0.49711994 0.
  0.29360705 0.49711994 0.29360705]]


In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.utils import to_categorical
import urllib.request

# Download pre-trained GloVe embeddings (for demonstration purposes).
#
# The archive is large, so the work is guarded: re-running this cell on a
# machine that already has the data does no network traffic and rewrites no
# files. The set of files present afterwards is the same as with an
# unconditional download followed by extractall().
import os
import zipfile

glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
glove_zip_path = 'glove.6B.zip'
glove_txt_path = 'glove.6B.100d.txt'

# is_zipfile() is False both for a missing file and for a truncated archive,
# so a leftover from an interrupted download is fetched again, not trusted.
if not zipfile.is_zipfile(glove_zip_path):
    # Download under a temporary name and rename once complete, so a partial
    # transfer can never sit at the final path.
    partial_zip_path = glove_zip_path + '.part'
    urllib.request.urlretrieve(glove_url, partial_zip_path)
    os.replace(partial_zip_path, glove_zip_path)

# Unzip GloVe file, skipping members that are already fully extracted.
with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
    for member in zip_ref.infolist():
        # Comparing sizes catches files left behind by an interrupted extraction.
        already_extracted = (
            os.path.isfile(member.filename)
            and os.path.getsize(member.filename) == member.file_size
        )
        if not already_extracted:
            zip_ref.extract(member)

# Read the GloVe text file into an in-memory {token: vector} dictionary.
# Each line is split on whitespace: the first field is the token and the
# remaining fields are its coefficients, stored as a float32 array.
word_embeddings = {}
with open(glove_txt_path, 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        word_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Length every sequence is padded (or truncated) to.
maxlen = 10

# Toy corpus of four short documents.
texts = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

# Fit a tokenizer on the corpus, then encode each document as integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring all encoded documents to the same length so they form one array.
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# Build the (num_words x embedding_dim) matrix of pre-trained vectors.
word_index = tokenizer.word_index
# One extra row beyond the vocabulary size; presumably row 0 is the padding
# index, which the tokenizer never assigns to a word -- TODO confirm.
num_words = len(word_index) + 1
embedding_dim = 100  # must equal the width of the loaded GloVe vectors
embedding_matrix = np.zeros((num_words, embedding_dim))

# Copy in the GloVe vector for every vocabulary word that has one; rows for
# words missing from GloVe keep their initial zeros.
for token, row in word_index.items():
    if token in word_embeddings:
        embedding_matrix[row] = word_embeddings[token]

# Assemble the network: frozen pre-trained embeddings, flattened, followed by
# a small dense layer and a single sigmoid output unit.
model = Sequential([
    Embedding(
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, tracking accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Dummy binary labels, one per document in `texts`.
labels = np.array([1, 0, 1, 0])

# Fit on the padded sequences for ten epochs.
model.fit(x=padded_sequences, y=labels, batch_size=32, epochs=10)

# Score the model on the same data it was trained on and report the result.
loss, accuracy = model.evaluate(x=padded_sequences, y=labels)
print(f'Loss: {loss}, Accuracy: {accuracy}')