In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
import importlib
import utils.preprocessing as preprocessing
import clustering.wiki_graph as wiki_graph

0it [00:00, ?it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Preprocessing

In this first part we preprocess the text data to prepare it for clustering and classification. This includes the following steps:
* Noise Removal
* Normalization
* Tokenization & Segmentation

## Data Loading

In [2]:
# Load the pickled dataset, make sure it is a DataFrame, and preview five random rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.DataFrame(pd.read_pickle("data/dataset_business_technology_cybersecurity.pickle"))
df.sample(5)

Unnamed: 0,title,content,topic
84,Partnership,<p>A <b>partnership</b> is an arrangement wher...,business
219,Cable car (railway),<p>A <b>cable car</b> (usually known as a <b>c...,technology
90,Benefit shortfall,<p>When the actual benefits of a venture are l...,business
261,Computer virus,"<p class=""mw-empty-elt"">\n\n</p>\n\n<p>A <b>co...",cybersecurity
101,Trade name,"<p>A <b>trade name</b>, <b>trading name</b>, o...",business


In [3]:
# Dump the raw DataFrame to disk so its format can be inspected by hand.
# The file has a .txt extension but is written by DataFrame.to_csv, so it holds CSV content.
df.to_csv("data/backup_preprocess/content.txt")

## Noise Removal
Noise removal can be defined as text-specific normalization. As we are dealing with raw HTML data, our preprocessing pipeline includes stripping away all HTML markup with the help of the BeautifulSoup library. We also replace contractions with their expansions.

In [4]:
# Reload the module so that edits to utils/preprocessing.py are picked up without restarting the kernel.
importlib.reload(preprocessing)
# Overwrite the "content" column with the output of remove_noise_from_df. According to the
# section text above, this strips HTML markup and expands contractions (the helper's
# implementation is not shown in this notebook).
# NOTE(review): the column is replaced in place, so re-running this cell feeds already-cleaned text back in.
df["content"] = preprocessing.remove_noise_from_df(df["content"])
# Checkpoint the intermediate result to disk (CSV content despite the .txt extension).
df.to_csv("data/backup_preprocess/content_without_noise.txt")


0it [00:00, ?it/s][A
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

10it [00:00, 86.96it/s][A
19it [00:00, 77.22it/s][A
27it [00:00, 70.23it/s][A
36it [00:00, 72.11it/s][A
42it [00:00, 63.46it/s][A
48it [00:00, 58.04it/s][A
54it [00:00, 58.10it/s][A
60it [00:00, 58.49it/s][A
67it [00:01, 56.46it/s][A
73it [00:01, 57.48it/s][A
0it [00:26, ?it/s]

85it [00:01, 30.18it/s][A
94it [00:01, 37.23it/s][A
101it [00:01, 42.99it/s][A
112it [00:02, 51.07it/s][A
119it [00:02, 55.44it/s][A
127it [00:02, 57.14it/s][A
134it [00:02, 58.95it/s][A
142it 

## Normalization
Normalization refers to a series of tasks that put all text on a level playing field: converting all text to the same case (upper or lower), removing special characters (punctuation) and numbers, stemming, lemmatization, ... Normalization puts all words on equal footing and allows processing to proceed uniformly.

In [5]:
# Reload the module so that edits to utils/preprocessing.py are picked up without restarting the kernel.
importlib.reload(preprocessing)
# Overwrite the "content" column with the output of normalize_df (implementation not shown here;
# the section text above lists case folding, punctuation/number removal, stemming and lemmatization
# as normalization tasks). The recorded output shows 333 rows processed in roughly 4.5 minutes.
# NOTE(review): the column is replaced in place, so re-running this cell normalizes already-normalized text.
df["content"] = preprocessing.normalize_df(df["content"])
# Checkpoint the intermediate result to disk (CSV content despite the .txt extension).
df.to_csv("data/backup_preprocess/content_normalized.txt")

0it [00:00, ?it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
333it [04:25,  1.25it/s]


## Tokenization
 

In [6]:
# Reload kept from the original cell: it re-runs the import-time setup of utils.preprocessing.
importlib.reload(preprocessing)
# `progress_apply` is not a native pandas method: it only exists once tqdm's pandas
# integration has been registered. Register it explicitly so that this cell does not
# depend on a side effect of importing another module.
tqdm.pandas()
# Split every document into a list of word tokens, with a progress bar.
df["content"] = df["content"].progress_apply(nltk.word_tokenize)
# Checkpoint to disk as CSV. Note that CSV stores each token list as its string
# representation, so reading this file back yields strings rather than lists.
df.to_csv("data/backup_preprocess/content_tokenized.txt")
df.head(5)

0it [00:00, ?it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
333it [00:02, 142.53it/s]


Unnamed: 0,title,content,topic
0,Accounting,"[account, account, measur, process, commun, fi...",business
1,Commerce,"[commerc, exchang, good, servic, especi, larg,...",business
2,Finance,"[financ, term, matter, regard, manag, creation...",business
3,Industrial relations,"[industri, relat, employ, relat, multidiscipli...",business
4,Management,"[manag, manag, administr, organ, whether, busi...",business


# Clustering


In [2]:
# Reload the tokenized checkpoint written at the end of the preprocessing part.
# NOTE(review): as the preview below shows, the CSV round trip adds an extra "Unnamed: 0"
# column (the saved index) and returns "content" as the string form of each token list
# instead of a list. Confirm that wiki_graph expects this; otherwise pass index_col=0 and
# parse the column (e.g. with ast.literal_eval) before clustering.
df = pd.read_csv('data/backup_preprocess/content_tokenized.txt')
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,content,topic
0,0,Accounting,"['account', 'account', 'measur', 'process', 'c...",business
1,1,Commerce,"['commerc', 'exchang', 'good', 'servic', 'espe...",business
2,2,Finance,"['financ', 'term', 'matter', 'regard', 'manag'...",business
3,3,Industrial relations,"['industri', 'relat', 'employ', 'relat', 'mult...",business
4,4,Management,"['manag', 'manag', 'administr', 'organ', 'whet...",business


In [3]:
# Pick up any edits to clustering/wiki_graph.py without restarting the kernel.
importlib.reload(wiki_graph)

# One dict per DataFrame row, keyed by column name.
wiki_pages = df.to_dict(orient="records")

# Constraint values to sweep: 5, 6, ..., 44.
n_tokens = [*range(5, 45)]

# Both result dicts are keyed by the constraint value.
clusters, nb_clusters = {}, {}

for n in tqdm(n_tokens):
    # Build a fresh graph for each constraint value.
    # NOTE(review): `constraint` is presumably the token threshold used to link two pages —
    # confirm in wiki_graph.WikiGraph.build_graph.
    graph = wiki_graph.WikiGraph()
    graph.build_graph(wiki_pages, constraint=n)
    clusters[n] = graph.get_wiki_clusters()
    nb_clusters[n] = len(clusters[n])

s]



100%|██████████| 210/210 [00:00<00:00, 105020.13it/s]



100%|██████████| 209/209 [00:00<00:00, 209314.60it/s]



100%|██████████| 208/208 [00:00<00:00, 208263.36it/s]



100%|██████████| 207/207 [00:00<00:00, 206620.88it/s]



100%|██████████| 206/206 [00:00<00:00, 103032.03it/s]



100%|██████████| 205/205 [00:00<00:00, 102482.99it/s]



100%|██████████| 204/204 [00:00<00:00, 203820.39it/s]



100%|██████████| 203/203 [00:00<00:00, 203548.58it/s]



100%|██████████| 202/202 [00:00<00:00, 202062.82it/s]



100%|██████████| 201/201 [00:00<00:00, 200775.21it/s]



100%|██████████| 200/200 [00:00<00:00, 199443.84it/s]



100%|██████████| 199/199 [00:00<00:00, 99661.67it/s]



100%|██████████| 198/198 [00:00<00:00, 198014.35it/s]





100%|██████████| 333/333 [00:00<00:00, 2870.67it/s]


  0%|          | 0/332 [00:00<?, ?it/s][A[A


100%|██████████| 332/332 [00:00<00:00, 110639.51it/s]



100%|██████████| 331/331 [00:00<00:00, 110341.33it/s]



100%|██████████| 330/330 [00:00<00:0

In [4]:
# Persist the {constraint: number of clusters} dict and read it straight back.
# np.save wraps the dict in a 0-d object array (stored via pickle); .item() unwraps it after loading.
np.save("data/backup_preprocess/nb_clusters.npy", nb_clusters)
# Object arrays can only be loaded with pickling enabled. Pass a real boolean: a string
# such as 'TRUE' (or even 'False') would merely be truthy and hide the intent.
nb_clusters = np.load("data/backup_preprocess/nb_clusters.npy", allow_pickle=True).item()

In [5]:
# Display the {constraint: number of clusters} mapping.
# NOTE(review): the recorded output has keys 15, 30, ..., 285, which the range(5, 45) sweep in the
# clustering cell cannot produce — the output looks stale; re-run the notebook from a fresh kernel.
nb_clusters

{15: 1,
 30: 37,
 45: 333,
 60: 333,
 75: 333,
 90: 333,
 105: 333,
 120: 333,
 135: 333,
 150: 333,
 165: 333,
 180: 333,
 195: 333,
 210: 333,
 225: 333,
 240: 333,
 255: 333,
 270: 333,
 285: 333}

In [18]:
# Print every cluster stored under constraint 200, each followed by its per-topic page counts.
# NOTE(review): 200 is outside the range(5, 45) sweep that fills `clusters` in this notebook, so
# this lookup would raise KeyError after Restart & Run All — confirm the intended key.
for c in clusters[200]:
    print(c)
    print(c.get_topics_count())

Cluster technologynb of pages: 314
{'business': 98, 'cybersecurity': 90, 'technology': 126}
Cluster businessnb of pages: 1
{'business': 1, 'cybersecurity': 0, 'technology': 0}
Cluster businessnb of pages: 1
{'business': 1, 'cybersecurity': 0, 'technology': 0}
Cluster businessnb of pages: 1
{'business': 1, 'cybersecurity': 0, 'technology': 0}
Cluster businessnb of pages: 1
{'business': 1, 'cybersecurity': 0, 'technology': 0}
Cluster businessnb of pages: 1
{'business': 1, 'cybersecurity': 0, 'technology': 0}
Cluster technologynb of pages: 1
{'business': 0, 'cybersecurity': 0, 'technology': 1}
Cluster technologynb of pages: 1
{'business': 0, 'cybersecurity': 0, 'technology': 1}
Cluster technologynb of pages: 1
{'business': 0, 'cybersecurity': 0, 'technology': 1}
Cluster technologynb of pages: 1
{'business': 0, 'cybersecurity': 0, 'technology': 1}
Cluster cybersecuritynb of pages: 1
{'business': 0, 'cybersecurity': 1, 'technology': 0}
Cluster cybersecuritynb of pages: 1
{'business': 0, 'cy