# Code: First Edition

In [None]:
# Fetch the first-edition cookbook repository; the Chapter01/Chapter04 imports and the data paths used below live in it.
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook.git

Cloning into 'Python-Natural-Language-Processing-Cookbook'...
remote: Enumerating objects: 308, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 308 (delta 36), reused 39 (delta 12), pack-reused 224 (from 1)[K
Receiving objects: 100% (308/308), 658.34 MiB | 21.90 MiB/s, done.
Resolving deltas: 100% (128/128), done.
Updating files: 100% (93/93), done.


In [None]:
# Switch to the repository root; presumably this is what lets the later `from Chapter01...` / `from Chapter04...` imports resolve.
%cd Python-Natural-Language-Processing-Cookbook

/content/Python-Natural-Language-Processing-Cookbook


In [None]:
import nltk
# Download the 'punkt_tab' tokenizer data; presumably required by the NLTK-based tokenizers imported from the repository - TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import numpy as np
import pandas as pd
import string
import pickle
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from Chapter01.tokenization import tokenize_nltk
from Chapter04.unsupervised_text_classification import tokenize_and_stem
from Chapter04.preprocess_bbc_dataset import get_data
from Chapter04.keyword_classification import get_labels
from Chapter04.preprocess_bbc_dataset import get_stopwords

# Absolute Colab paths into the cloned first-edition repository.
bbc_dataset = "/content/Python-Natural-Language-Processing-Cookbook/Chapter04/bbc-text.csv"
stopwords_file_path = "/content/Python-Natural-Language-Processing-Cookbook/Chapter01/stopwords.csv"
# Loaded once at notebook level; create_and_fit_vectorizer reads this global as its stop-word list.
stopwords = get_stopwords(stopwords_file_path)

In [None]:
new_example = """iPhone 12: Apple makes jump to 5G
Apple has confirmed its iPhone 12 handsets will be its first to work on faster 5G networks.
The company has also extended the range to include a new "Mini" model that has a smaller 5.4in screen.
The US firm bucked a wider industry downturn by increasing its handset sales over the past year.
But some experts say the new features give Apple its best opportunity for growth since 2014, when it revamped its line-up with the iPhone 6.
"5G will bring a new level of performance for downloads and uploads, higher quality video streaming, more responsive gaming,
real-time interactivity and so much more," said chief executive Tim Cook.
There has also been a cosmetic refresh this time round, with the sides of the devices getting sharper, flatter edges.
The higher-end iPhone 12 Pro models also get bigger screens than before and a new sensor to help with low-light photography.
However, for the first time none of the devices will be bundled with headphones or a charger.
Apple said the move was to help reduce its impact on the environment. "Tim Cook [has] the stage set for a super-cycle 5G product release,"
commented Dan Ives, an analyst at Wedbush Securities.
He added that about 40% of the 950 million iPhones in use had not been upgraded in at least three-and-a-half years, presenting a "once-in-a-decade" opportunity.
In theory, the Mini could dent Apple's earnings by encouraging the public to buy a product on which it makes a smaller profit than the other phones.
But one expert thought that unlikely.
"Apple successfully launched the iPhone SE in April by introducing it at a lower price point without cannibalising sales of the iPhone 11 series," noted Marta Pinto from IDC.
"There are customers out there who want a smaller, cheaper phone, so this is a proven formula that takes into account market trends."
The iPhone is already the bestselling smartphone brand in the UK and the second-most popular in the world in terms of market share.
If forecasts of pent up demand are correct, it could prompt a battle between network operators, as customers become more likely to switch.
"Networks are going to have to offer eye-wateringly attractive deals, and the way they're going to do that is on great tariffs and attractive trade-in deals,"
predicted Ben Wood from the consultancy CCS Insight. Apple typically unveils its new iPhones in September, but opted for a later date this year.
It has not said why, but it was widely speculated to be related to disruption caused by the coronavirus pandemic. The firm's shares ended the day 2.7% lower.
This has been linked to reports that several Chinese internet platforms opted not to carry the livestream,
although it was still widely viewed and commented on via the social media network Sina Weibo."""

In [None]:
from sklearn.metrics import classification_report


In [None]:
def create_dataset(data_dict, le):
    """Flatten a topic -> documents mapping into a two-column DataFrame.

    Args:
        data_dict: Mapping from topic name to a list of documents.
        le: Encoder whose ``transform([topic])`` returns a sequence whose
            first element is the label id of ``topic``.

    Returns:
        A ``pandas.DataFrame`` with a ``text`` column holding every document
        (grouped by topic, in ``data_dict`` iteration order) and a ``label``
        column holding the encoded topic of the matching document.
    """
    all_documents = []
    all_labels = []
    for topic, documents in data_dict.items():
        # One encoder call per topic; the id is repeated once per document.
        encoded_topic = le.transform([topic])[0]
        all_documents.extend(documents)
        all_labels.extend([encoded_topic] * len(documents))
    return pd.DataFrame({'text': all_documents, 'label': all_labels})

def split_dataset(df, train_column_name, gold_column_name, test_percent):
    """Split two columns of ``df`` into train and test portions.

    Args:
        df: Frame containing both columns.
        train_column_name: Column used as model input.
        gold_column_name: Column holding the gold labels.
        test_percent: Value forwarded as ``test_size`` to ``train_test_split``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``. The split uses a
        fixed ``random_state=0``, so repeated calls give the same partition.
    """
    inputs = df[train_column_name]
    gold = df[gold_column_name]
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, gold, test_size=test_percent, random_state=0
    )
    return X_train, X_test, y_train, y_test

def create_and_fit_vectorizer(training_text):
    """Build a TF-IDF vectorizer and fit it on ``training_text``.

    The vectorizer tokenizes with ``tokenize_and_stem``, uses the
    notebook-level ``stopwords`` list, and is configured with
    ``max_df=0.90``, ``min_df=0.05`` and ``ngram_range=(1, 3)``.

    Args:
        training_text: Iterable of raw documents to fit on.

    Returns:
        The value returned by ``TfidfVectorizer.fit`` (the fitted vectorizer).
    """
    vectorizer_options = dict(
        max_df=0.90,
        min_df=0.05,
        stop_words=stopwords,
        use_idf=True,
        tokenizer=tokenize_and_stem,
        ngram_range=(1, 3),
    )
    tfidf = TfidfVectorizer(**vectorizer_options)
    return tfidf.fit(training_text)

def train_svm_classifier(X_train, y_train):
    """Fit a linear-kernel SVC (``C=1``) on the given training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The value returned by ``SVC.fit`` (the fitted classifier).
    """
    return svm.SVC(C=1, kernel='linear').fit(X_train, y_train)

def evaluate(clf, X_test, y_test, le):
    """Print a per-class classification report for ``clf`` on the test data.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_test: Test feature matrix.
        y_test: Gold labels aligned with ``X_test``.
        le: Encoder providing ``classes_`` and ``transform``; its class names
            label the report rows and its encoded ids select the reported labels.

    Returns:
        None. The report is written to standard output.
    """
    predictions = clf.predict(X_test)
    class_ids = le.transform(le.classes_)
    class_names = le.classes_
    report = classification_report(
        y_test, predictions, labels=class_ids, target_names=class_names
    )
    print(report)

def test_new_example(input_string, clf, vectorizer, le):
    vector = vectorizer.transform([input_string]).toarray() # Changed to toarray()
    prediction = clf.predict(vector)
    print(prediction)
    label = le.inverse_transform(prediction)
    print(label)

In [None]:
# NOTE(review): clf, vectorizer and le are only assigned in the training cell further down,
# so on a fresh kernel this call raises NameError unless that cell has been run first.
test_new_example(new_example, clf, vectorizer, le)

[4]
['tech']


In [None]:
# Build the labelled BBC frame: get_data supplies the topic -> documents mapping and
# get_labels the encoder object used for transform / inverse_transform in this notebook.
data_dict = get_data(bbc_dataset)
le = get_labels(list(data_dict.keys()))
df = create_dataset(data_dict, le)
(X_train, X_test, y_train, y_test) = split_dataset(df, 'text', 'label', 0.2)
# Fit the TF-IDF vocabulary on the training texts only.
vectorizer = create_and_fit_vectorizer(X_train)
# Densify with toarray(), which yields a plain ndarray directly. todense() returns an
# np.matrix, which scikit-learn rejects (see the TypeError recorded in this cell's output).
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
clf = train_svm_classifier(X_train, y_train)
# Round-trip the classifier through pickle; context managers close both file handles.
# Only the classifier is saved here, not the vectorizer or the label encoder.
with open('bbc_svm.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
# The file was written just above; do not unpickle files from untrusted sources.
with open('bbc_svm.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)
evaluate(clf, X_test, y_test, le)




               precision    recall  f1-score   support

     business       0.93      0.93      0.93       105
entertainment       0.96      0.96      0.96        78
     politics       0.93      0.94      0.94        72
        sport       0.98      0.99      0.99       106
         tech       0.96      0.94      0.95        84

     accuracy                           0.96       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.96      0.96      0.96       445



TypeError: np.matrix is not supported. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html

# Code: Second Edition

In [None]:
# Execute the shared helper notebook inside this namespace (-i). It is expected to supply
# create_train_test_data, train_classifier and test_classifier, which are called below but
# not defined in this notebook - TODO confirm. The path assumes Google Drive is mounted at /content/drive.
%run -i "/content/drive/MyDrive/DeepLearning/util_simple_classifier.ipynb"

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Fetch the second-edition repository for its data/bbc_*.json files. The read_json paths used
# below expect this clone to sit inside the first-edition checkout, so run this from that directory.
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook-Second-Edition.git

Cloning into 'Python-Natural-Language-Processing-Cookbook-Second-Edition'...
remote: Enumerating objects: 433, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 433 (delta 11), reused 6 (delta 2), pack-reused 409 (from 1)[K
Receiving objects: 100% (433/433), 18.28 MiB | 6.96 MiB/s, done.
Resolving deltas: 100% (235/235), done.


In [None]:
# Load the pre-split BBC train/test frames shipped with the second-edition repository.
train_df = pd.read_json("/content/Python-Natural-Language-Processing-Cookbook/Python-Natural-Language-Processing-Cookbook-Second-Edition/data/bbc_train.json")
test_df = pd.read_json("/content/Python-Natural-Language-Processing-Cookbook/Python-Natural-Language-Processing-Cookbook-Second-Edition/data/bbc_test.json")
# DataFrame.sample returns a new frame, so the shuffled result has to be assigned back
# (the bare call was a no-op). The fixed seed keeps the row order reproducible across runs.
train_df = train_df.sample(frac=1, random_state=0)
# Per-class row counts for both splits.
print(train_df.groupby('label_text').count())
print(test_df.groupby('label_text').count())

               text  label  text_tokenized  text_clean
label_text                                            
business        408    408             408         408
entertainment   309    309             309         309
politics        333    333             333         333
sport           409    409             409         409
tech            321    321             321         321
               text  label  text_tokenized  text_clean
label_text                                            
business        102    102             102         102
entertainment    77     77              77          77
politics         84     84              84          84
sport           102    102             102         102
tech             80     80              80          80


In [None]:
from sklearn.svm import SVC
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix

In [None]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model; this cell's recorded output shows its files being downloaded.
model = SentenceTransformer('all-MiniLM-L6-v2')
def get_sentence_vector(text, model):
    """Encode ``text`` with ``model`` and return the resulting embedding.

    Args:
        text: Input handed unchanged to ``model.encode``.
        model: Any object exposing an ``encode`` method (in this notebook a
            ``SentenceTransformer``).

    Returns:
        Whatever ``model.encode(text)`` produces.
    """
    return model.encode(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Display names for the integer labels; classification_report pairs them with the sorted
# label ids, so position 0 is "tech" and position 4 is "politics".
target_names = ["tech", "business", "sport", "entertainment", "politics"]


def vectorize(x):
    """Embed ``x`` with the notebook-level sentence-transformer ``model``."""
    return get_sentence_vector(x, model)


(X_train, X_test, y_train, y_test) = create_train_test_data(train_df, test_df, vectorize, column_name="text_clean")
clf = train_classifier(X_train, y_train)
# Training-set report: score the classifier's own predictions on X_train against the gold
# labels. Comparing train_df["label"] with y_train only compared the gold labels with
# themselves, which is why every score was 1.00 regardless of the model.
print(classification_report(y_train, clf.predict(X_train), target_names=target_names))
test_classifier(test_df, clf, target_names=target_names)

               precision    recall  f1-score   support

         tech       1.00      1.00      1.00       321
     business       1.00      1.00      1.00       408
        sport       1.00      1.00      1.00       409
entertainment       1.00      1.00      1.00       309
     politics       1.00      1.00      1.00       333

     accuracy                           1.00      1780
    macro avg       1.00      1.00      1.00      1780
 weighted avg       1.00      1.00      1.00      1780

               precision    recall  f1-score   support

         tech       0.97      0.96      0.97        80
     business       0.98      0.99      0.99       102
        sport       0.99      1.00      1.00       102
entertainment       0.99      0.99      0.99        77
     politics       0.98      0.96      0.97        84

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445



In [None]:
# Rows are true labels and columns are predicted labels (scikit-learn's confusion_matrix convention).
# The "prediction" column is not created in this notebook; presumably test_classifier adds it to test_df - TODO confirm.
print(confusion_matrix(test_df["label"], test_df["prediction"]))

[[ 77   0   1   1   1]
 [  1 101   0   0   0]
 [  0   0 102   0   0]
 [  0   0   0  76   1]
 [  1   2   0   0  81]]


In [None]:
new_example = """iPhone 12: Apple makes jump to 5G
Apple has confirmed its iPhone 12 handsets will be its first to work on faster 5G networks.
The company has also extended the range to include a new "Mini" model that has a smaller 5.4in screen.
The US firm bucked a wider industry downturn by increasing its handset sales over the past year.
But some experts say the new features give Apple its best opportunity for growth since 2014, when it revamped its line-up with the iPhone 6.
"5G will bring a new level of performance for downloads and uploads, higher quality video streaming, more responsive gaming,
real-time interactivity and so much more," said chief executive Tim Cook.
There has also been a cosmetic refresh this time round, with the sides of the devices getting sharper, flatter edges.
The higher-end iPhone 12 Pro models also get bigger screens than before and a new sensor to help with low-light photography.
However, for the first time none of the devices will be bundled with headphones or a charger.
Apple said the move was to help reduce its impact on the environment. "Tim Cook [has] the stage set for a super-cycle 5G product release,"
commented Dan Ives, an analyst at Wedbush Securities.
He added that about 40% of the 950 million iPhones in use had not been upgraded in at least three-and-a-half years, presenting a "once-in-a-decade" opportunity.
In theory, the Mini could dent Apple's earnings by encouraging the public to buy a product on which it makes a smaller profit than the other phones.
But one expert thought that unlikely.
"Apple successfully launched the iPhone SE in April by introducing it at a lower price point without cannibalising sales of the iPhone 11 series," noted Marta Pinto from IDC.
"There are customers out there who want a smaller, cheaper phone, so this is a proven formula that takes into account market trends."
The iPhone is already the bestselling smartphone brand in the UK and the second-most popular in the world in terms of market share.
If forecasts of pent up demand are correct, it could prompt a battle between network operators, as customers become more likely to switch.
"Networks are going to have to offer eye-wateringly attractive deals, and the way they're going to do that is on great tariffs and attractive trade-in deals,"
predicted Ben Wood from the consultancy CCS Insight. Apple typically unveils its new iPhones in September, but opted for a later date this year.
It has not said why, but it was widely speculated to be related to disruption caused by the coronavirus pandemic. The firm's shares ended the day 2.7% lower.
This has been linked to reports that several Chinese internet platforms opted not to carry the livestream,
although it was still widely viewed and commented on via the social media network Sina Weibo."""
# Embed the article, then classify it; the single vector is wrapped in a list to form a one-item batch.
vector = vectorize(new_example)
prediction = clf.predict([vector])
# Prints the integer class id rather than a name; going by the order of target_names, 0 corresponds to "tech".
print(prediction)

[0]
