In [None]:
# Fetch the first-edition cookbook repository; the Chapter01/Chapter04 modules
# imported further down are expected to come from this checkout.
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook.git

Cloning into 'Python-Natural-Language-Processing-Cookbook'...
remote: Enumerating objects: 308, done.[K
remote: Counting objects:   1% (1/84)[Kremote: Counting objects:   2% (2/84)[Kremote: Counting objects:   3% (3/84)[Kremote: Counting objects:   4% (4/84)[Kremote: Counting objects:   5% (5/84)[Kremote: Counting objects:   7% (6/84)[Kremote: Counting objects:   8% (7/84)[Kremote: Counting objects:   9% (8/84)[Kremote: Counting objects:  10% (9/84)[Kremote: Counting objects:  11% (10/84)[Kremote: Counting objects:  13% (11/84)[Kremote: Counting objects:  14% (12/84)[Kremote: Counting objects:  15% (13/84)[Kremote: Counting objects:  16% (14/84)[Kremote: Counting objects:  17% (15/84)[Kremote: Counting objects:  19% (16/84)[Kremote: Counting objects:  20% (17/84)[Kremote: Counting objects:  21% (18/84)[Kremote: Counting objects:  22% (19/84)[Kremote: Counting objects:  23% (20/84)[Kremote: Counting objects:  25% (21/84)[Kremote: Counting objec

In [None]:
# Make the cloned repository the working directory so the Chapter01/Chapter04
# imports and the relative data paths used below resolve.
# NOTE(review): the recorded output shows a nested .../Cookbook/Cookbook path, which
# suggests the clone/cd cells were executed more than once - confirm on a fresh kernel.
%cd Python-Natural-Language-Processing-Cookbook

/content/Python-Natural-Language-Processing-Cookbook/Python-Natural-Language-Processing-Cookbook


In [None]:
import nltk
nltk.download('punkt_tab')
import re
import string
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist
from Chapter01.tokenization import tokenize_nltk
from Chapter01.dividing_into_sentences import divide_into_sentences_nltk
from Chapter04.preprocess_bbc_dataset import get_data
from Chapter04.keyword_classification import divide_data
from Chapter04.preprocess_bbc_dataset import get_stopwords


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Paths are relative, so they are resolved against the current working directory.
bbc_dataset = "Chapter04/bbc-text.csv"
stopwords_file_path = "Chapter01/stopwords.csv"
# Stop-word collection loaded by the cookbook helper; read by the filtering
# functions and passed to the TF-IDF vectorizer below.
stopwords = get_stopwords(stopwords_file_path)
# English Snowball stemmer shared by tokenize_and_stem.
stemmer = SnowballStemmer("english")

In [None]:
def tokenize_and_stem(sentence):
    """Tokenize a text, discard noise tokens, and stem what remains.

    A token is kept only when it is not in the notebook-level ``stopwords``
    collection, is not found in ``string.punctuation``, and contains at least
    one ASCII letter.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: Stems of the kept tokens, in their original order.
    """
    tokens = nltk.word_tokenize(sentence)
    filtered_tokens = [
        t for t in tokens
        if t not in stopwords
        and t not in string.punctuation
        and re.search("[a-zA-Z]", t)
    ]
    # Stem only the filtered tokens so that stop words, punctuation and
    # non-alphabetic tokens never reach the caller.
    return [stemmer.stem(t) for t in filtered_tokens]

In [None]:
def create_vectorizer(data):
    """Create a TF-IDF vectorizer and fit it on ``data``.

    The vectorizer uses ``tokenize_and_stem`` as tokenizer, the notebook-level
    ``stopwords`` as stop words, 1- to 3-grams, and document-frequency bounds
    of 0.05 (min) and 0.90 (max).

    Args:
        data: Iterable of raw text documents to fit on.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(
        tokenizer=tokenize_and_stem,
        stop_words=stopwords,
        ngram_range=(1,3),
        min_df=0.05,
        max_df=0.90,
    )
    tfidf.fit(data)
    return tfidf

In [None]:
def make_predictions(test_data, vectorizer, km):
    """Group test texts by topic and by the cluster predicted for each text.

    Args:
        test_data: Mapping of topic -> list of texts.
        vectorizer: Object whose ``transform([text])`` yields features for ``km``.
        km: Object whose ``predict(features)`` returns a sequence of cluster ids.

    Returns:
        dict: ``{topic: {cluster_id: [texts...]}}``. Every topic gets an entry,
        even when it has no texts.
    """
    grouped = {}
    for topic, texts in test_data.items():
        by_cluster = grouped.setdefault(topic, {})
        for text in texts:
            # Each text is vectorized and predicted on its own; [0] unwraps
            # the single-element prediction.
            features = vectorizer.transform([text])
            cluster_id = km.predict(features)[0]
            by_cluster.setdefault(cluster_id, []).append(text)
    return grouped

In [None]:
def print_report(predicted_data):
    """Print, for each topic, how many items fell into each cluster.

    Args:
        predicted_data: Mapping ``{topic: {cluster_id: [items...]}}`` as
            produced by ``make_predictions``.
    """
    for topic, by_cluster in predicted_data.items():
        print(topic)
        for cluster_id, items in by_cluster.items():
            print("Cluster number: ", cluster_id, "number of items: ", len(items))

In [None]:
def get_most_frequent_words(text):
    """Return up to 200 of the most frequent meaningful words in ``text``.

    Tokens are dropped when they are in the notebook-level ``stopwords``,
    are found in ``string.punctuation``, or contain no ASCII letter.

    Args:
        text: Text to analyse.

    Returns:
        list[str]: Words ordered from most to least frequent (at most 200).
    """
    def _is_meaningful(token):
        return (
            token not in stopwords
            and token not in string.punctuation
            and re.search('[a-zA-Z]', token)
        )

    kept = [token for token in tokenize_nltk(text) if _is_meaningful(token)]
    ranked = FreqDist(kept).most_common(200)
    return [entry[0] for entry in ranked]

In [None]:
def print_most_common_words_by_cluster(all_training, km, num_clusters):
    """Print the most frequent words of each cluster's training texts.

    Args:
        all_training: Training texts, in the order the model was fitted on.
        km: Fitted clustering model exposing ``labels_``.
        num_clusters: Number of cluster ids (0..num_clusters-1) to report.

    Returns:
        pandas.DataFrame: Columns ``text`` and ``cluster``, indexed by the
        cluster labels.
    """
    labels = km.labels_.tolist()
    frame = pd.DataFrame({'text': all_training, 'cluster': labels}, index = [labels])
    for cluster_id in range(num_clusters):
        members = frame[frame['cluster'] == cluster_id]
        # Concatenate all texts in the cluster into one string for counting.
        joined_text = " ".join(members['text'].astype(str))
        frequent_words = get_most_frequent_words(joined_text)
        print(cluster_id)
        print(frequent_words)
    return frame

In [None]:
# Load the BBC data and split it into train/test dictionaries (keyed by topic,
# judging by how they are indexed below).
data_dict = get_data(bbc_dataset)
(train_dict, test_dict) = divide_data(data_dict)
# Flatten the per-topic lists into single lists of documents.
all_training = []
all_test = []
for topic in train_dict.keys():
    all_training = all_training + train_dict[topic]
for topic in test_dict.keys():
    all_test = all_test + test_dict[topic]
# Fit the TF-IDF vectorizer on the training texts and cluster their vectors.
vectorizer = create_vectorizer(all_training)
matrix = vectorizer.transform(all_training)
num_clusters = 5
km = KMeans(n_clusters=num_clusters, init='k-means++', random_state=0)
km.fit(matrix)
# Report how the held-out texts of each topic are distributed over clusters.
predicted_data = make_predictions(test_dict, vectorizer, km)
print_report(predicted_data)
print_most_common_words_by_cluster(all_training, km, num_clusters)
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('/content/drive/MyDrive/Data/bbc_kmean.pkl', 'wb') as model_file:
    pickle.dump(km, model_file)



tech
Cluster number:  2 number of items:  62
Cluster number:  1 number of items:  18
Cluster number:  0 number of items:  1
business
Cluster number:  2 number of items:  101
Cluster number:  3 number of items:  1
sport
Cluster number:  4 number of items:  53
Cluster number:  1 number of items:  50
entertainment
Cluster number:  1 number of items:  39
Cluster number:  0 number of items:  35
Cluster number:  2 number of items:  4
politics
Cluster number:  3 number of items:  66
Cluster number:  4 number of items:  1
Cluster number:  2 number of items:  13
Cluster number:  1 number of items:  4
0
['film', 'best', 'awards', 'award', 'won', 'year', 'director', 'films', 'actor', 'actress', 'years', 'first', 'last', 'british', 'new', 'star', 'festival', 'oscar', 'song', 'aviator', 'music', 'hollywood', 'people', 'prize', 'bbc', 'including', 'role', 'comedy', 'win', 'nominations', 'three', 'ceremony', 'movie', 'category', 'uk', 'time', 'nominated', 'tv', 'show', 'stars', 'million', 'top', 'nam

In [None]:
# Per-topic counter, initialised to zero for each of the five topics.
pre = {'tech':0,'business':0,'sport':0,'entertainment':0,'politics':0}
predicted_data = make_predictions(test_dict, vectorizer, km)
for topic in predicted_data.keys():
    print(topic)
    for prediction in predicted_data[topic].keys():
        # NOTE(review): this assignment overwrites the topic's value on every
        # cluster, so after the loop pre[topic] holds only the size of the last
        # cluster visited - confirm whether a sum or a maximum was intended.
        pre[topic] = len(predicted_data[topic][prediction])
        print(pre)

tech
{'tech': 62, 'business': 0, 'sport': 0, 'entertainment': 0, 'politics': 0}
{'tech': 18, 'business': 0, 'sport': 0, 'entertainment': 0, 'politics': 0}
{'tech': 1, 'business': 0, 'sport': 0, 'entertainment': 0, 'politics': 0}
business
{'tech': 1, 'business': 101, 'sport': 0, 'entertainment': 0, 'politics': 0}
{'tech': 1, 'business': 1, 'sport': 0, 'entertainment': 0, 'politics': 0}
sport
{'tech': 1, 'business': 1, 'sport': 53, 'entertainment': 0, 'politics': 0}
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment': 0, 'politics': 0}
entertainment
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment': 39, 'politics': 0}
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment': 35, 'politics': 0}
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment': 4, 'politics': 0}
politics
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment': 4, 'politics': 66}
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment': 4, 'politics': 1}
{'tech': 1, 'business': 1, 'sport': 50, 'entertainment'

In [None]:
from nltk import word_tokenize
from sklearn.cluster import KMeans
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from joblib import dump, load

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from datasets import load_dataset

In [None]:
# Load dataset
train_dataset = load_dataset("SetFit/bbc-news", split="train")
test_dataset = load_dataset("SetFit/bbc-news", split="test")
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()
print(train_df)
print(test_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/880 [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

                                                   text  label     label_text
0     wales want rugby league training wales could f...      2          sport
1     china aviation seeks rescue deal scandal-hit j...      1       business
2     rock band u2 break ticket record u2 have smash...      3  entertainment
3     markets signal brazilian recovery the brazilia...      1       business
4     tough rules for ringtone sellers firms that fl...      0           tech
...                                                 ...    ...            ...
1220  us economy shows solid gdp growth the us econo...      1       business
1221  microsoft releases bumper patches microsoft ha...      0           tech
1222  stuart joins norwich from addicks norwich have...      2          sport
1223  why few targets are better than many the econo...      1       business
1224  boothroyd calls for lords speaker betty boothr...      4       politics

[1225 rows x 3 columns]
                                       

In [None]:
# See the distribution of classes
print(train_df.groupby('label_text').count())
print(test_df.groupby('label_text').count())

               text  label
label_text                
business        286    286
entertainment   210    210
politics        242    242
sport           275    275
tech            212    212
               text  label
label_text                
business        224    224
entertainment   176    176
politics        175    175
sport           236    236
tech            189    189


In [None]:
# Combine train and test dataframes and create a better train/test split
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(combined_df["text"], combined_df["label"]))
train_df = combined_df[combined_df.index.isin(train_index)].copy()
test_df = combined_df[combined_df.index.isin(test_index)].copy()
print(train_df.groupby('label_text').count())
print(test_df.groupby('label_text').count())

               text  label
label_text                
business        408    408
entertainment   309    309
politics        333    333
sport           409    409
tech            321    321
               text  label
label_text                
business        102    102
entertainment    77     77
politics         84     84
sport           102    102
tech             80     80


In [None]:
import nltk
import spacy
from nltk import word_tokenize
# NOTE(review): this rebinds the notebook-level name `stopwords`, which an earlier
# cell assigned from get_stopwords(...). Functions defined earlier that read the
# global `stopwords` will see the nltk.corpus object once this cell has run.
from nltk.corpus import stopwords
from string import punctuation
# Load spaCy's small English pipeline.
small_model = spacy.load("en_core_web_sm")
# large_model = spacy.load("en_core_web_lg")

In [None]:
def word_tokenize_nltk(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: Text to tokenize.

    Returns:
        The token list produced by ``nltk.tokenize.word_tokenize``.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    return tokens

def word_tokenize_spacy(text, model):
    """Split ``text`` into token strings using a callable pipeline.

    Args:
        text: Text to tokenize.
        model: Callable that turns ``text`` into an iterable of tokens, each
            exposing a ``text`` attribute.

    Returns:
        list: The ``text`` attribute of every token, in order.
    """
    return [tok.text for tok in model(text)]

In [None]:
def get_subject_phrase(doc):
    """Return the span covering the subtree of the first subject token.

    Args:
        doc: Iterable, sliceable sequence of tokens exposing ``dep_``,
            ``subtree`` and ``i``.

    Returns:
        ``doc[start:end]`` for the first token whose ``dep_`` contains
        ``"subj"``, or ``None`` when no such token exists.
    """
    subject = next((tok for tok in doc if "subj" in tok.dep_), None)
    if subject is None:
        return None
    members = list(subject.subtree)
    # The slice end is exclusive, hence the +1 on the last member's index.
    return doc[members[0].i:members[-1].i + 1]
def get_object_phrase(doc):
    """Return the span covering the subtree of the first direct-object token.

    Args:
        doc: Iterable, sliceable sequence of tokens exposing ``dep_``,
            ``subtree`` and ``i``.

    Returns:
        ``doc[start:end]`` for the first token whose ``dep_`` contains
        ``"dobj"``, or ``None`` when no such token exists.
    """
    for candidate in doc:
        if "dobj" not in candidate.dep_:
            continue
        members = list(candidate.subtree)
        first, last = members[0], members[-1]
        return doc[first.i:last.i + 1]
    return None
def get_dative_phrase(doc):
    """Return the span covering the subtree of the first dative token.

    Args:
        doc: Iterable, sliceable sequence of tokens exposing ``dep_``,
            ``subtree`` and ``i``.

    Returns:
        ``doc[start:end]`` for the first token whose ``dep_`` contains
        ``"dative"``, or ``None`` when no such token exists.
    """
    dative = next((tok for tok in doc if "dative" in tok.dep_), None)
    if dative is None:
        return None
    members = list(dative.subtree)
    span_start = members[0].i
    span_end = members[-1].i + 1
    return doc[span_start:span_end]
def get_prepositional_phrase_objs(doc):
    """Collect the subtree spans of every prepositional-object token.

    Args:
        doc: Iterable, sliceable sequence of tokens exposing ``dep_``,
            ``subtree`` and ``i``.

    Returns:
        list: One ``doc[start:end]`` span per token whose ``dep_`` contains
        ``"pobj"``, in document order (empty when there are none).
    """
    def _subtree_span(tok):
        members = list(tok.subtree)
        return doc[members[0].i:members[-1].i + 1]

    return [_subtree_span(tok) for tok in doc if "pobj" in tok.dep_]

In [None]:
import nltk
# Download the NLTK stop-word corpus that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# NLTK's English stop words, extended with two extra tokens ("``" and "'s")
# that should also be filtered out.
stop_words = [*stopwords.words('english'), "``", "'s"]

In [None]:
def remove_stopwords_helper(x):
    """Return the tokens of ``x`` that are neither stop words nor punctuation.

    Args:
        x: Iterable of token strings.

    Returns:
        list: Tokens not in the notebook-level ``stop_words`` list and not
        found in ``string.punctuation``, in their original order.
    """
    kept = []
    for token in x:
        if token in stop_words or token in punctuation:
            continue
        kept.append(token)
    return kept

In [None]:
def tokenize(input_df, column_name):
    """Add a ``<column_name>_tokenized`` column holding the tokens of each text.

    The frame is modified in place and also returned.

    Args:
        input_df: DataFrame containing ``column_name``.
        column_name: Name of the text column to tokenize.

    Returns:
        The same DataFrame, with the new tokenized column.
    """
    tokenized_column = column_name + "_tokenized"
    tokens = input_df[column_name].apply(word_tokenize)
    input_df[tokenized_column] = tokens
    return input_df

In [None]:
def remove_stopword_punct(input_df, column_name):
    """Strip stop words and punctuation from a column of token lists.

    The column is overwritten in place and the frame is also returned.

    Args:
        input_df: DataFrame containing ``column_name``.
        column_name: Name of the column holding token lists.

    Returns:
        The same DataFrame, with ``column_name`` replaced by the filtered lists.
    """
    filtered = input_df[column_name].apply(remove_stopwords_helper)
    input_df[column_name] = filtered
    return input_df

In [None]:
# Preprocess the data
train_df = tokenize(train_df, "text")
train_df = remove_stopword_punct(train_df, "text_tokenized")
test_df = tokenize(test_df, "text")
test_df = remove_stopword_punct(test_df, "text_tokenized")
print(train_df)
print(test_df)

                                                   text  label     label_text  \
0     wales want rugby league training wales could f...      2          sport   
1     china aviation seeks rescue deal scandal-hit j...      1       business   
2     rock band u2 break ticket record u2 have smash...      3  entertainment   
3     markets signal brazilian recovery the brazilia...      1       business   
4     tough rules for ringtone sellers firms that fl...      0           tech   
...                                                 ...    ...            ...   
2217  soros group warns of kazakh close the open soc...      1       business   
2218  election  could be terror target  terrorists m...      4       politics   
2219  lifestyle  governs mobile choice  faster  bett...      0           tech   
2220  mobile multimedia slow to catch on there is no...      0           tech   
2221  owen determined to stay in madrid england forw...      2          sport   

                           

In [None]:
# Fetch the second-edition repository; its data/ directory is the target of the
# to_json calls in the next cell.
# NOTE(review): the clone lands in the current working directory, which the earlier
# %cd cell changed - confirm this matches the absolute paths used below.
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook-Second-Edition.git

Cloning into 'Python-Natural-Language-Processing-Cookbook-Second-Edition'...
remote: Enumerating objects: 433, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 433 (delta 11), reused 6 (delta 2), pack-reused 409 (from 1)[K
Receiving objects: 100% (433/433), 18.28 MiB | 11.77 MiB/s, done.
Resolving deltas: 100% (235/235), done.


In [None]:
# Get the training data and create the vectorizer
train_df["text_clean"] = train_df["text_tokenized"].apply(lambda x: " ".join(list(x)))
test_df["text_clean"] = test_df["text_tokenized"].apply(lambda x: " ".join(list(x)))
train_df.to_json("/content/Python-Natural-Language-Processing-Cookbook/Python-Natural-Language-Processing-Cookbook-Second-Edition/data/bbc_train.json")
test_df.to_json("/content/Python-Natural-Language-Processing-Cookbook/Python-Natural-Language-Processing-Cookbook-Second-Edition/data/bbc_test.json")
vec = TfidfVectorizer(ngram_range=(1,3))
matrix = vec.fit_transform(train_df["text_clean"])
# Cluster the data
km = KMeans(n_clusters=5, n_init=10)
km.fit(matrix)

In [None]:
def get_most_frequent_words(text, num_words):
    """Return the ``num_words`` most frequent tokens of ``text``.

    Args:
        text: Text to tokenize and count.
        num_words: Maximum number of words to return.

    Returns:
        list[str]: Tokens ordered from most to least frequent.
    """
    counts = FreqDist(word_tokenize(text))
    ranked = counts.most_common(num_words)
    return [entry[0] for entry in ranked]
def print_most_common_words_by_cluster(input_df, km, num_clusters):
    """Label each row with its cluster and print each cluster's top 200 words.

    A ``cluster`` column is written onto ``input_df`` in place.

    Args:
        input_df: DataFrame with a ``text_clean`` column, row-aligned with the
            data ``km`` was fitted on.
        km: Fitted clustering model exposing ``labels_``.
        num_clusters: Number of cluster ids (0..num_clusters-1) to report.

    Returns:
        The same DataFrame, including the new ``cluster`` column.
    """
    input_df["cluster"] = km.labels_.tolist()
    for cluster_id in range(num_clusters):
        in_cluster = input_df['cluster'] == cluster_id
        members = input_df[in_cluster]
        # Concatenate all texts in the cluster into one string for counting.
        joined_text = " ".join(members['text_clean'].astype(str))
        frequent_words = get_most_frequent_words(joined_text, 200)
        print(cluster_id)
        print(frequent_words)
    return input_df
# Label the training frame with cluster ids and print the top words per cluster.
# As the cell's last expression, the returned frame is displayed as well.
print_most_common_words_by_cluster(train_df, km, 5)

0
['said', 'us', 'year', 'also', 'market', 'would', 'company', 'growth', 'economy', 'new', 'mr', 'bank', 'last', 'economic', 'sales', 'government', 'firm', 'could', 'oil', 'however', 'shares', 'prices', '2004', 'may', 'years', 'china', '000', 'world', 'one', 'two', 'chief', 'analysts', 'rise', 'group', 'business', 'deal', 'since', 'expected', 'december', 'stock', 'dollar', 'yukos', 'uk', 'three', 'financial', 'country', 'spending', 'months', 'companies', 'still', 'first', 'european', 'rate', 'rates', 'people', 'time', 'firms', 'state', 'trade', 'demand', 'president', '2005', 'many', 'budget', 'interest', 'figures', 'jobs', 'strong', 'exchange', 'profits', 'made', 'next', 'biggest', 'tax', 'month', 'quarter', 'deutsche', 'hit', 'news', 'india', 'europe', 'london', 'costs', 'share', 'price', 'rose', 'japan', 'much', 'foreign', 'investment', 'added', 'executive', 'high', 'deficit', 'million', 'back', 'euros', 'offer', 'part', 'told', 'investors', 'countries', 'january', 'recent', 'set', '

Unnamed: 0,text,label,label_text,text_tokenized,text_clean,cluster
0,wales want rugby league training wales could f...,2,sport,"[wales, want, rugby, league, training, wales, ...",wales want rugby league training wales could f...,4
1,china aviation seeks rescue deal scandal-hit j...,1,business,"[china, aviation, seeks, rescue, deal, scandal...",china aviation seeks rescue deal scandal-hit j...,0
2,rock band u2 break ticket record u2 have smash...,3,entertainment,"[rock, band, u2, break, ticket, record, u2, sm...",rock band u2 break ticket record u2 smashed ir...,1
3,markets signal brazilian recovery the brazilia...,1,business,"[markets, signal, brazilian, recovery, brazili...",markets signal brazilian recovery brazilian st...,0
4,tough rules for ringtone sellers firms that fl...,0,tech,"[tough, rules, ringtone, sellers, firms, flout...",tough rules ringtone sellers firms flout rules...,2
...,...,...,...,...,...,...
2217,soros group warns of kazakh close the open soc...,1,business,"[soros, group, warns, kazakh, close, open, soc...",soros group warns kazakh close open society in...,1
2218,election could be terror target terrorists m...,4,politics,"[election, could, terror, target, terrorists, ...",election could terror target terrorists might ...,3
2219,lifestyle governs mobile choice faster bett...,0,tech,"[lifestyle, governs, mobile, choice, faster, b...",lifestyle governs mobile choice faster better ...,2
2220,mobile multimedia slow to catch on there is no...,0,tech,"[mobile, multimedia, slow, catch, doubt, mobil...",mobile multimedia slow catch doubt mobile phon...,2


In [None]:
# Take the "text" value of the row at position 1 of the test split.
test_example = test_df.iloc[1, test_df.columns.get_loc('text')]
print(test_example)
# NOTE(review): `vec` was fitted on the stop-word-filtered "text_clean" column,
# whereas the raw "text" is passed here - confirm that this is intended.
vectorized = vec.transform([test_example])
prediction = km.predict(vectorized)
print(prediction)

lib dems  new election pr chief the lib dems have appointed a senior figure from bt to be the party s new communications chief for their next general election effort.  sandy walkington will now work with senior figures such as matthew taylor on completing the party manifesto. party chief executive lord rennard said the appointment was a  significant strengthening of the lib dem team . mr walkington said he wanted the party to be ready for any  mischief  rivals or the media tried to throw at it.   my role will be to ensure this new public profile is effectively communicated at all levels   he said.  i also know the party will be put under scrutiny in the media and from the other parties as never before - and we will need to show ourselves ready and prepared to counter the mischief and misrepresentation that all too often comes from the party s opponents.  the party is already demonstrating on every issue that it is the effective opposition.  mr walkington s new job title is director of 

In [None]:
# Save the fitted model with joblib, load it back, and repeat the prediction with
# the reloaded copy as a round-trip check.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no
# mount call is visible in this notebook - confirm before running.
dump(km, '/content/drive/MyDrive/Data/kmeans.joblib')
km_ = load('/content/drive/MyDrive/Data/kmeans.joblib')
prediction = km_.predict(vectorized)
print(prediction)

[3]
