In [91]:
import pandas as pd
import numpy as np
from copy import deepcopy
import nltk
import importlib
import utils.preprocessing as preprocessing
import clustering.wiki_graph as wiki_graph

# Data Preprocessing

In this first part we will be Preprocessing text data to prepare them for clustering and classification. This will include the following steps:
* Noise Removal
* Normalization
* Tekenization & Segmentation 

## Data Loading

In [2]:
df = pd.read_pickle("data/dataset_business_technology_cybersecurity.pickle")
df = pd.DataFrame(df)
df.sample(5)

Unnamed: 0,title,content,topic
44,Franchising,<p><b>Franchising</b> is based on a marketing ...,business
74,Middle management,<p><b>Middle management</b> is the intermediat...,business
249,IPFilter,<p><b>IPFilter</b> (commonly referred to as <b...,cybersecurity
108,Computer,"<p class=""mw-empty-elt"">\n</p>\n\n<p>A <b>comp...",technology
324,Application portfolio attack surface,"<p>In the realm of application security, the t...",cybersecurity


In [3]:
# explore the data format in a txt file 
df.to_csv("data/backup_preprocess/content.txt")

## Noise Removal
Noise removal can be defined as text-specific normalization. As we are dealing with html row data, our data preprocessing pipeline will include striping away all HTML markup with the help of the BeautifulSoup library. We will also be replacing contractions with their expansions.

In [4]:
importlib.reload(preprocessing)
df["content"] = preprocessing.remove_noise_from_df(df["content"])
# backup saving
df.to_csv("data/backup_preprocess/content_without_noise.txt")


0it [00:00, ?it/s][A
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

10it [00:00, 97.09it/s][A
19it [00:00, 92.79it/s][A
31it [00:00, 98.82it/s][A
40it [00:00, 90.78it/s][A
49it [00:00, 90.53it/s][A
59it [00:00, 92.92it/s][A
69it [00:00, 94.40it/s][A
0it [00:01, ?it/s]

91it [00:01, 63.76it/s][A
100it [00:01, 67.98it/s][A
110it [00:01, 74.87it/s][A
119it [00:01, 72.03it/s][A
127it [00:01, 73.42it/s][A
138it [00:01, 80.66it/s][A
150it [00:01, 88.87it/s][A
172it [00:01, 107.42it/s][A
187it [00:02, 113.17it/s][A
201it [00:02, 115.04it/s][A

## Normalization
Normalization refers to a series of tasks that put all text on a level of playing field: converting all text to the same case(upper or lower), removing special characters(punctuation) and numbers, stemming, lemmatization, ... Normalization puts all words on equal footing and alows processing to proceed uniformly.

In [5]:
importlib.reload(preprocessing)
df["content"] = preprocessing.normalize_df(df["content"])
# backup save
df.to_csv("data/backup_preprocess/content_normalized.txt")

0it [00:00, ?it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
333it [03:28,  1.59it/s]


## Tockenization
 

In [6]:
importlib.reload(preprocessing)
df["content"] = df["content"].progress_apply(nltk.word_tokenize)
df.to_csv("data/backup_preprocess/content_tokenized.txt")
df.head(5)

0it [00:00, ?it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
333it [00:02, 158.79it/s]


Unnamed: 0,title,content,topic
0,Accounting,"[account, account, measur, process, commun, fi...",business
1,Commerce,"[commerc, exchang, good, servic, especi, larg,...",business
2,Finance,"[financ, term, matter, regard, manag, creation...",business
3,Industrial relations,"[industri, relat, employ, relat, multidiscipli...",business
4,Management,"[manag, manag, administr, organ, whether, busi...",business


# Clustering


In [120]:
importlib.reload(wiki_graph)
wiki_pages = df.to_dict(orient="records")
graph = wiki_graph.WikiGraph()
graph.build_graph(wiki_pages)

      | 0/332 [00:00<?, ?it/s]
100%|██████████| 332/332 [00:00<00:00, 12296.97it/s]

100%|██████████| 331/331 [00:00<00:00, 33049.60it/s]

100%|██████████| 330/330 [00:00<00:00, 10650.68it/s]

100%|██████████| 329/329 [00:00<00:00, 10280.77it/s]
  1%|          | 4/332 [00:00<00:09, 33.06it/s]
100%|██████████| 328/328 [00:00<00:00, 8003.74it/s]

100%|██████████| 327/327 [00:00<00:00, 12577.14it/s]

100%|██████████| 326/326 [00:00<00:00, 8150.35it/s]
  2%|▏         | 7/332 [00:00<00:11, 29.43it/s]
100%|██████████| 325/325 [00:00<00:00, 10156.68it/s]

100%|██████████| 324/324 [00:00<00:00, 6894.61it/s]

100%|██████████| 323/323 [00:00<00:00, 8075.49it/s]
  3%|▎         | 10/332 [00:00<00:12, 26.26it/s]
100%|██████████| 322/322 [00:00<00:00, 9751.17it/s]

100%|██████████| 321/321 [00:00<00:00, 21392.03it/s]

100%|██████████| 320/320 [00:00<00:00, 14544.93it/s]

100%|██████████| 319/319 [00:00<00:00, 12760.56it/s]
  4%|▍         | 14/332 [00:00<00:11, 27.64it/s]
100%|██████████| 318/318 [00

In [121]:
for n in graph:
    print(n.wiki_page.id, len(n.wiki_page.content), len(n.wiki_neighbors))

0 660 332
1 51 332
2 620 332
3 510 332
4 1091 332
5 393 332
6 952 332
7 382 332
8 1757 332
9 1121 332
10 709 332
11 221 332
12 335 332
13 472 332
14 1659 332
15 156 332
16 556 332
17 1375 332
18 1022 332
19 483 332
20 543 332
21 180 332
22 391 332
23 707 332
24 127 332
25 2320 332
26 66 332
27 173 332
28 329 332
29 1074 332
30 481 332
31 1247 332
32 273 332
33 1058 332
34 949 332
35 442 332
36 221 332
37 1622 332
38 1741 332
39 648 332
40 1053 332
41 1544 332
42 861 332
43 1095 332
44 1238 332
45 296 332
46 308 332
47 496 332
48 178 332
49 85 332
50 1685 332
51 1220 332
52 468 332
53 428 332
54 364 332
55 723 332
56 1635 332
57 921 332
58 430 332
59 533 332
60 623 332
61 44 332
62 134 332
63 61 332
64 492 332
65 729 332
66 447 332
67 1091 332
68 418 332
69 233 332
70 393 332
71 254 332
72 838 332
73 1435 332
74 313 332
75 957 332
76 54 332
77 671 332
78 181 332
79 664 332
80 1162 332
81 1330 332
82 319 332
83 554 332
84 644 332
85 1125 332
86 419 332
87 1010 332
88 204 332
89 1661 332


In [122]:
graph_copy = deepcopy(graph) 
print(len(graph_copy.wiki_nodes))

333


In [127]:
graph.build_graph(wiki_pages, constraint=200)

100%|██████████| 333/333 [00:00<00:00, 111131.70it/s]
  0%|          | 0/332 [00:00<?, ?it/s]
100%|██████████| 332/332 [00:00<00:00, 11861.54it/s]

100%|██████████| 331/331 [00:00<00:00, 65915.61it/s]

100%|██████████| 330/330 [00:00<00:00, 14331.33it/s]

100%|██████████| 329/329 [00:00<00:00, 19376.63it/s]

100%|██████████| 328/328 [00:00<00:00, 9108.76it/s]
  2%|▏         | 5/332 [00:00<00:08, 37.04it/s]
100%|██████████| 327/327 [00:00<00:00, 19236.15it/s]

100%|██████████| 326/326 [00:00<00:00, 12547.19it/s]

100%|██████████| 325/325 [00:00<00:00, 17107.58it/s]

100%|██████████| 324/324 [00:00<00:00, 9258.44it/s]
  3%|▎         | 9/332 [00:00<00:08, 36.53it/s]
100%|██████████| 323/323 [00:00<00:00, 11964.78it/s]

100%|██████████| 322/322 [00:00<00:00, 16104.05it/s]

100%|██████████| 321/321 [00:00<00:00, 26781.74it/s]

100%|██████████| 320/320 [00:00<00:00, 15231.24it/s]

100%|██████████| 319/319 [00:00<00:00, 15950.59it/s]
  4%|▍         | 14/332 [00:00<00:08, 38.02it/s]
100%|█████

In [128]:
for n in graph:
    print(n.wiki_page.id, len(n.wiki_page.content), len(n.wiki_neighbors))

0 660 227
1 51 0
2 620 214
3 510 187
4 1091 254
5 393 176
6 952 246
7 382 171
8 1757 271
9 1121 256
10 709 224
11 221 38
12 335 156
13 472 199
14 1659 272
15 156 8
16 556 214
17 1375 259
18 1022 250
19 483 201
20 543 220
21 180 27
22 391 189
23 707 219
24 127 0
25 2320 278
26 66 0
27 173 11
28 329 138
29 1074 250
30 481 200
31 1247 255
32 273 123
33 1058 253
34 949 233
35 442 180
36 221 68
37 1622 269
38 1741 275
39 648 237
40 1053 253
41 1544 263
42 861 239
43 1095 248
44 1238 263
45 296 130
46 308 160
47 496 214
48 178 7
49 85 0
50 1685 271
51 1220 258
52 468 192
53 428 189
54 364 155
55 723 232
56 1635 267
57 921 239
58 430 161
59 533 146
60 623 25
61 44 0
62 134 3
63 61 0
64 492 182
65 729 227
66 447 179
67 1091 254
68 418 194
69 233 32
70 393 176
71 254 102
72 838 246
73 1435 264
74 313 161
75 957 243
76 54 0
77 671 231
78 181 11
79 664 187
80 1162 232
81 1330 250
82 319 109
83 554 222
84 644 223
85 1125 248
86 419 191
87 1010 256
88 204 11
89 1661 262
90 85 0
91 219 96
92 1061 25

In [129]:
clusters = graph.get_wiki_clusters()
print(len(clusters))

333it [00:00, 36926.38it/s]333



In [130]:
for i in range(5):
    print(clusters[i]) 

Cluster technology
Cluster business
Cluster business
Cluster business
Cluster business
