In [None]:
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook.git

Cloning into 'Python-Natural-Language-Processing-Cookbook'...
remote: Enumerating objects: 308, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 308 (delta 36), reused 39 (delta 12), pack-reused 224 (from 1)[K
Receiving objects: 100% (308/308), 658.34 MiB | 19.84 MiB/s, done.
Resolving deltas: 100% (128/128), done.
Updating files: 100% (93/93), done.


In [None]:
# Switch the working directory to the root of the cloned repository.
# NOTE(review): presumably required so that the `Chapter01` / `Chapter03` packages imported
# by a later cell resolve from the current directory — confirm.
# The path is relative, so running this cell a second time from inside the repository will
# fail to find the directory; restart the runtime (or cd back) before re-running.
%cd Python-Natural-Language-Processing-Cookbook

/content/Python-Natural-Language-Processing-Cookbook


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Download the Punkt tokenizer data *before* importing the chapter helpers.
# NOTE(review): the helper modules presumably need it for sentence splitting — confirm.
nltk.download('punkt_tab')

# Helper modules from the cloned repository. Only get_sentences is used in this cell;
# the remaining names are left imported in case other cells rely on them.
from Chapter01.dividing_into_sentences import read_text_file, preprocess_text, divide_into_sentences_nltk
from Chapter03.bag_of_words import get_sentences, get_new_sentence_vector


def print_sentence_vector(vectorizer, sentence):
    """Vectorize a single sentence with a fitted vectorizer and print the result.

    Prints the sparse representation followed by the dense one.

    Args:
        vectorizer: A fitted vectorizer exposing ``transform``.
        sentence: The text to vectorize.

    Returns:
        The sparse row vector produced by ``vectorizer.transform([sentence])``.
    """
    vector = vectorizer.transform([sentence])
    print(vector)
    # toarray() gives a plain ndarray; todense() would return numpy.matrix, which
    # NumPy no longer recommends. The printed form is the same.
    print(vector.toarray())
    return vector


# Step 1: Get sentences from the text file
sentences = get_sentences("/content/Python-Natural-Language-Processing-Cookbook/Chapter01/sherlock_holmes_1.txt")

# Step 2: Create the vectorizer.
# ngram_range=(1, 2) extracts unigrams AND bigrams, despite the variable name.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))

# Step 3: Fit and transform sentences into a sparse matrix
X = bigram_vectorizer.fit_transform(sentences)

# Step 4: Print sparse matrix representation
print(X)

# Convert to dense format (plain ndarray rather than numpy.matrix)
denseX = X.toarray()
print(denseX)

# Step 5: Print vocabulary used by the vectorizer
print(bigram_vectorizer.get_feature_names_out())

# Step 6: Represent a new sentence using the fitted vocabulary
new_sentence = "I had seen little of Holmes lately."
new_sentence_vector = print_sentence_vector(bigram_vectorizer, new_sentence)

# Step 7: Compare with another sentence from the original text
new_sentence1 = "And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory."
new_sentence_vector1 = print_sentence_vector(bigram_vectorizer, new_sentence1)


  (0, 269)	1
  (0, 229)	1
  (0, 118)	1
  (0, 226)	1
  (0, 136)	1
  (0, 20)	1
  (0, 0)	1
  (0, 299)	1
  (0, 275)	1
  (0, 230)	1
  (0, 119)	1
  (0, 228)	1
  (0, 137)	1
  (0, 21)	1
  (0, 1)	1
  (1, 93)	1
  (1, 221)	1
  (1, 101)	1
  (1, 108)	1
  (1, 156)	1
  (1, 103)	1
  (1, 278)	1
  (1, 31)	1
  (1, 190)	1
  (1, 167)	1
  :	:
  (10, 307)	1
  (10, 261)	1
  (10, 141)	1
  (10, 60)	1
  (10, 210)	1
  (10, 151)	1
  (10, 30)	1
  (10, 308)	1
  (10, 262)	1
  (10, 285)	1
  (10, 45)	1
  (10, 187)	1
  (10, 300)	1
  (10, 271)	1
  (10, 109)	1
  (10, 251)	1
  (10, 301)	1
  (10, 288)	1
  (10, 253)	1
  (10, 142)	1
  (10, 8)	1
  (10, 180)	1
  (10, 61)	1
  (10, 27)	1
  (10, 211)	1
[[1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 1]]
['_the_' '_the_ woman' 'abhorrent' 'abhorrent to' 'actions' 'adjusted'
 'adjusted temperament' 'adler' 'adler of' 'admirable' 'admirable things'
 'admirably' 'admirably balanced' 'admit' 'admit such' 'akin' 'akin t

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Execute the helper notebook inside this notebook's namespace (-i), so whatever it defines
# becomes available to the cells that follow.
# NOTE(review): the path points into Google Drive, but no cell visible here mounts Drive —
# confirm Drive is mounted before this runs on a fresh runtime.
# NOTE(review): presumably this defines load_train_test_dataset_pd, create_train_test_data,
# train_classifier and test_classifier, which later cells call without importing — confirm.
%run -i "/content/drive/MyDrive/DeepLearning/util_simple_classifier.ipynb"

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Load the train/test DataFrames via the helper made available by the utility notebook.
train_df, test_df = load_train_test_dataset_pd()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Rebuild the vectorizer for the review dataset (this replaces the earlier bigram_vectorizer / X).
# ngram_range=(1, 3) covers unigrams through trigrams; an integer max_df is an absolute
# document count, so terms occurring in more than 300 training texts are dropped.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_df=300,
)
X = bigram_vectorizer.fit_transform(train_df["text"])

In [None]:
# Fetch the learned vocabulary once, then show it and its size.
feature_names = bigram_vectorizer.get_feature_names_out()
print(feature_names)
print(len(feature_names))

['10' '10 inch' '10 inch television' ... 'ótimo esforço do' 'últimos'
 'últimos tiempos']
79583


In [None]:
# Vectorize the first test review with the vectorizer fitted on the training texts.
first_review = test_df['text'].iat[0]
# Use toarray() (plain ndarray) instead of todense() (numpy.matrix): it matches the
# `vectorize` helper used for the classifier features, and the printed form is the same.
dense_vector = bigram_vectorizer.transform([first_review]).toarray()
print(dense_vector)

[[0 0 0 ... 0 0 0]]


In [None]:
def vectorize(x):
    """Return the dense 1-D n-gram count vector for the single text `x`."""
    return bigram_vectorizer.transform([x]).toarray()[0]


# Build the feature/label splits with the helper functions, then train and evaluate.
X_train, X_test, y_train, y_test = create_train_test_data(train_df, test_df, vectorize, 'text')
clf = train_classifier(X_train, y_train)
# NOTE(review): test_classifier receives test_df rather than X_test / y_test — presumably
# create_train_test_data stores the vectors on the frames; confirm against the helper notebook.
test_classifier(test_df, clf)

              precision    recall  f1-score   support

           0       0.74      0.72      0.73       160
           1       0.73      0.75      0.74       160

    accuracy                           0.74       320
   macro avg       0.74      0.74      0.74       320
weighted avg       0.74      0.74      0.74       320

