## Imports


In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
from bertopic import BERTopic
from urllib.parse import urlparse  

if os.path.abspath("..") not in sys.path:
    sys.path.insert(0, os.path.abspath(".."))
from src.loader import NewsDataLoader
from src.db import *

  from .autonotebook import tqdm as notebook_tqdm


## Load the DataSets

In [2]:
loader=NewsDataLoader()
ratingData = loader.load_data('../data/raw/data.csv/rating.csv')
trafficData = loader.load_data('../data/raw/traffic_data/traffic.csv')
locationData = loader.load_data('../data/raw/Domains_location.csv')

## Prase the Domain from the Url in the Rating Dataset

In [3]:
ratingData['domain'] = ratingData['url'].apply(lambda x: urlparse(x).netloc)
ratingData.shape

(58356, 14)

## merge the location and trafficDataset

In [4]:

traffic_location_df = pd.merge(trafficData, locationData, left_on='Domain', right_on='SourceCommonName', how='inner')
traffic_location_df.shape

(24185, 15)

# Create the four table 
 - 1. domain == which has the domain name and domain_location_id
 - 2. domain_location == which have the list of location and country
 - 3. traffic_data == which contains the traffic data of domain and also linked to the domain with domain_id
 - 4. articles == which contains the aritcle data and also linked to the domain with domain_id

 ** creat these tables run python db.py in the src directory **

## Insert loacation and the domain to the domain and domain_location database

In [5]:
insert_domain_locations(locationData.drop_duplicates(subset='SourceCommonName').dropna())

Domain locations data inserted/updated successfully!


{'SG': 36937,
 'FR': 37422,
 'IT': 37470,
 'AR': 37416,
 'CH': 37471,
 'GM': 37459,
 'UP': 37435,
 'US': 37454,
 'RS': 37442,
 'EG': 37421,
 'IN': 37361,
 'TW': 36954,
 'UK': 37433,
 'CA': 37419,
 'SF': 37450,
 'GR': 37430,
 'NL': 37462,
 'CI': 37412,
 'IR': 36069,
 'CU': 33167,
 'LH': 37461,
 'HU': 37311,
 'UY': 34751,
 'BR': 36157,
 'SZ': 37455,
 'AM': 37074,
 'ID': 35677,
 'AJ': 35014,
 'BG': 35933,
 'GG': 35171,
 'TV': 36748,
 'VE': 35467,
 'SP': 37174,
 'LG': 37467,
 'CM': 30329,
 'MX': 37405,
 'AE': 37272,
 'PO': 37424,
 'KU': 30150,
 'BU': 35452,
 'VM': 37334,
 'TU': 37447,
 'BN': 35208,
 'MP': 24708,
 'LO': 37458,
 'KG': 35346,
 'PL': 37465,
 'AA': 172,
 'SY': 37242,
 'HR': 37223,
 'BK': 30001,
 'RI': 37456,
 'SI': 37457,
 'MK': 35300,
 'AS': 37154,
 'ET': 37300,
 'IS': 37257,
 'NI': 37423,
 'KZ': 37413,
 'JA': 37189,
 'CK': 34114,
 'KN': 35234,
 'SA': 37044,
 'NZ': 36588,
 'RO': 37429,
 'CO': 37411,
 'AU': 37261,
 'BO': 37270,
 'EZ': 37441,
 'TH': 36302,
 'LU': 36699,
 'NO': 3

In [6]:
insert_domains(locationData.drop_duplicates(subset='SourceCommonName').dropna())


1
2
3
4
5
3
7
8
8
8
8
8
3
8
8
8
17
18
19
8
2
2
17
24
8
17
17
28
17
30
17
17
33
34
17
36
17
17
17
17
17
17
17
17
17
17
18
8
49
49
49
49
49
49
49
49
18
17
17
19
17
7
17
7
17
3
17
17
19
17
17
3
73
17
17
17
17
17
79
17
17
17
17
17
17
17
17
17
19
90
17
92
19
17
17
5
18
98
17
17
101
17
103
104
18
5
17
18
17
17
17
112
113
8
49
116
117
18
18
18
30
17
18
8
125
126
127
18
104
2
131
5
49
17
5
5
33
17
17
140
30
142
143
144
19
98
147
17
149
4
151
4
8
30
155
156
157
104
159
3
33
162
73
103
19
17
167
168
2
17
17
172
173
7
168
176
177
178
18
8
181
182
17
17
3
18
17
30
189
3
17
189
17
194
17
196
34
17
18
18
168
17
18
18
156
30
18
18
103
210
24
19
182
156
215
5
156
17
18
220
221
222
19
19
19
189
17
3
34
17
220
5
18
17
49
30
237
19
239
17
173
237
19
237
17
19
247
36
49
19
251
104
253
17
17
210
17
17
259
260
196
5
8
98
33
156
131
156
220
18
17
272
30
189
30
8
17
17
17
17
34
17
7
3
19
30
7
36
8
17
17
189
18
8
17
49
17
33
17
173
49
17
156
18
17
5
8
17
17
17
17
17
17
5
17
156
144
318
2
17
34
17
34
17
33
17
1

## insert the traffic data to the traffic_data table

In [7]:
insert_traffic_data(traffic_location_df.dropna(subset='Domain').drop_duplicates('Domain'))


13339
11145
37183
2147
20983
36252
777
36586
1760
4183
10887
36994
33916
23489
27064
24135
20649
9466
11966
3097
32310
6642
36183
3421
3422
5719
27927
14132
21272
35670
34386
22506
4204
36782
4946
21924
30467
15073
22406
10947
29825
6616
33174
13225
7922
23932
36357
34629
31606
31530
6629
14833
9455
19509
3637
15531
24876
31413
12369
35894
7167
5683
5073
26635
18680
14921
35655
32961
14905
7930
11349
34417
29654
14207
4654
22461
31928
26606
16361
16129
36168
22578
10590
20327
4389
34637
36621
14264
19442
96
20451
9566
5899
4990
20497
6804
35938
2096
25357
34664
34377
15193
15596
23595
30532
8322
32088
25936
12156
36999
3872
6335
26850
562
35074
26839
24948
9150
29016
8497
4253
34559
34159
5652
12058
37285
29019
5015
1761
20364
30019
23323
24123
34382
24742
34223
36960
11299
498
33793
8747
20309
5197
23279
29032
9291
35419
36381
8315
11239
15513
13079
16758
32422
34224
10472
6602
9725
29560
10558
34700
9612
11383
662
34221
2407
16609
762
20625
14794
35586
34292
14516
25502
36704
29804
1

## insert the article data into the article table

In [11]:
insert_articles(ratingData)

Domain ID created for domain: www.forbes.com
Domain ID created for domain: www.channelnewsasia.com
Domain ID found for domain: time.com
Domain ID found for domain: phys.org
Domain ID created for domain: www.digitaltrends.com
Domain ID created for domain: www.aljazeera.com
Domain ID created for domain: www.bbc.co.uk
Domain ID found for domain: phys.org
Domain ID found for domain: deadline.com
Domain ID created for domain: www.euronews.com
Domain ID found for domain: phys.org
Domain ID created for domain: www.rt.com
Domain ID found for domain: phys.org
Domain ID found for domain: www.forbes.com
Domain ID found for domain: www.forbes.com
Domain ID found for domain: punchng.com
Domain ID found for domain: www.aljazeera.com
Domain ID found for domain: www.euronews.com
Domain ID found for domain: www.channelnewsasia.com
Domain ID found for domain: www.forbes.com
Domain ID found for domain: www.channelnewsasia.com
Domain ID created for domain: www.ibtimes.com
Domain ID found for domain: india

## Read from the domain,location,traffic and articles form the database

In [20]:
domains = read_domains()
locations = read_domain_locations()
traffic = read_traffic_data()
articles_df = read_articles()

(1, '00221.info', 1)
(2, '01net.com', 2)
(3, '01net.it', 3)
(4, '0223.com.ar', 4)
(5, '022china.com', 5)
(6, '02blog.it', 3)
(7, '02elf.net', 7)
(8, '032.ua', 8)
(9, '0362.ua', 8)
(10, '0372.ua', 8)
(11, '048.ua', 8)
(12, '0542.ua', 8)
(13, '055firenze.it', 3)
(14, '0564.ua', 8)
(15, '057.ua', 8)
(16, '061.ua', 8)
(17, '06880danwoog.com', 17)
(18, '07kbr.ru', 18)
(19, '0lf.net', 19)
(20, '1.zt.ua', 8)
(21, '10-18.fr', 2)
(22, '1001web.fr', 2)
(23, '100freeclassifieds.com', 17)
(24, '100millionideas.org', 24)
(25, '100realty.ua', 8)
(26, '1011now.com', 17)
(27, '1015nashicon.com', 17)
(28, '101media.com.tw', 28)
(29, '101pressrelease.com', 17)
(30, '101touchfm.co.uk', 30)
(31, '1037theq.com', 17)
(32, '1045theteam.com', 17)
(33, '1047.ca', 33)
(34, '1049.fm', 34)
(35, '1049wmcg.com', 17)
(36, '104fm.gr', 36)
(37, '1055online.com', 17)
(38, '1067litefm.com', 17)
(39, '1070thefan.com', 17)
(40, '1073kissfm.com', 17)
(41, '1073now.com', 17)
(42, '1075theriver.com', 17)
(43, '107jamz.com', 