In [2]:
import collections
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import random
import threading
import glob 
import shutil

In [4]:
def _fetch_soup(url):
  """Download ``url`` and return its HTML parsed by BeautifulSoup with the lxml parser."""
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = requests.get(url, timeout=30)
  # Fail loudly on 4xx/5xx instead of parsing an error page.
  response.raise_for_status()
  response.encoding = 'utf-8'
  return BeautifulSoup(response.text,'lxml')


def get_topic_information(topic_id):
  """Look up one topic from the Scientific American navigation bar.

  The landing page is fetched to read the topic list, then the selected
  topic's own page is fetched to read how many listing pages it has.

  Args:
    topic_id: zero-based position of the topic in the navigation list.

  Returns:
    A tuple ``(topic_href, topic_name, number_of_pages)``.

  Raises:
    IndexError: ``topic_id`` is outside the navigation list.
    ValueError: the expected navigation markup is missing from the page.
    requests.RequestException: a download failed or returned an error status.
  """
  soup = _fetch_soup('https://www.scientificamerican.com/the-sciences/')
  state = soup.find('ul',class_ = 'header-topic-list landing-header__nav__list')
  if state is None:
    raise ValueError('topic navigation list not found on the landing page')
  topics = state.find_all('li',class_ = 'landing-header__nav__list__item')
  topic = topics[topic_id]
  if topic.a is None or not topic.a.get('href'):
    raise ValueError('topic {} has no link in the navigation list'.format(topic_id))
  current_topic_href = topic.a['href']
  current_topic_text = topic.a.text

  soup = _fetch_soup(current_topic_href)
  state = soup.find('ol',class_ = 'pagination__nums')
  # NOTE(review): a topic page without a pagination list is assumed to have a
  # single listing page - confirm against the live site.
  number_of_pages = 1
  if state is not None:
    labels = [item.text.strip() for item in state.find_all('li')]
    page_numbers = [int(label) for label in labels if label.isdigit()]
    if page_numbers:
      # The highest numeric entry is the last page; non-numeric entries
      # (e.g. an ellipsis) are ignored rather than crashing int().
      number_of_pages = max(page_numbers)
  return current_topic_href , current_topic_text , number_of_pages

In [5]:
# Gather href, display name and page count for the first eight navigation topics.
topics_info = []
for topic_index in range(8):
  href, name, page_count = get_topic_information(topic_index)
  topics_info.append(dict(topic_href=href, topic_name=name, number_of_pages=page_count))

In [6]:
# Sanity check: display the metadata collected for the first topic.
topics_info[0]

{'number_of_pages': 52,
 'topic_href': 'https://www.scientificamerican.com/arts-and-culture/',
 'topic_name': 'Arts & Culture'}

In [7]:
def scrap_data(topic_name,topic_href,topic_id,from_,to_,
               output_root='/content/drive/MyDrive/dataset/science dataset'):
  """Scrape the articles of a range of listing pages into one CSV file.

  For every listing page in the range, each linked article is downloaded and
  the text of all ``<p>`` tags inside ``div.mura-region-local`` is concatenated.
  Articles whose page contains the podcast/video media container are skipped.
  The rows are written to ``<output_root>/<topic_name>/<from_>_<to_>.csv`` with
  the columns ``articleText`` and ``title``.

  Args:
    topic_name: topic display name; used as the output sub-folder name.
    topic_href: URL of the topic's listing page.
    topic_id: index into the module-level ``topics_info``; used to clamp ``to_``
      to the topic's page count.
    from_: zero-based index of the first listing page (inclusive).
    to_: zero-based index of the last listing page (exclusive).
    output_root: directory that holds one sub-folder per topic.

  Returns:
    None. Nothing is written when the clamped page range is empty.
  """
  to_ = min(to_,topics_info[topic_id]['number_of_pages'])
  if from_ >= to_:
    # Empty range: avoid leaving a header-only CSV behind.
    return

  rows = []
  for page_index in range(from_,to_):
    # The site numbers its listing pages from 1.
    listing_url = topic_href + '?page=' + str(page_index + 1)
    try:
      response = requests.get(listing_url, timeout=30)
    except requests.RequestException as error:
      # Keep what has been scraped so far instead of killing the whole thread.
      print('skipping listing page {}: {}'.format(listing_url, error))
      continue
    response.encoding = 'utf-8'
    listing_soup = BeautifulSoup(response.text,'lxml')

    for listing in listing_soup.find_all('div',class_='listing-wide__inner'):
      anchor = listing.a
      if anchor is None or not anchor.get('href'):
        # Listing entry without a link: nothing to download.
        continue
      article_link = anchor['href']
      title = anchor.text
      try:
        response = requests.get(article_link, timeout=30)
      except requests.RequestException as error:
        print('skipping article {}: {}'.format(article_link, error))
        continue
      response.encoding = 'utf-8'
      article_soup = BeautifulSoup(response.text,'lxml')

      video_state = article_soup.find('div',class_='podcasts-media podcasts-media--feature podcasts__media')
      if video_state is not None:
        # Podcast/video pages are excluded from the dataset.
        continue

      article_text = ''.join(
        p.text
        for div in article_soup.find_all('div',class_='mura-region-local')
        for p in div.find_all('p')
      )
      rows.append([article_text,title])

  df = pd.DataFrame(rows,columns = ['articleText','title'])
  folder_name = '{}/{}'.format(output_root, topic_name)
  # exist_ok avoids the check-then-create race between concurrent threads.
  os.makedirs(folder_name, exist_ok=True)
  # A name derived from the page range cannot collide with another chunk of the
  # same topic, unlike a random number, and makes re-runs overwrite in place.
  path = folder_name + '/{}_{}.csv'.format(from_, to_)
  df.to_csv(path,index = False)


In [8]:
def generate_threads(number_of_threads,number_of_iteration,start_topic,start_page,increment):
  """Scrape topics in batches of concurrent threads.

  Each batch starts up to ``number_of_threads`` threads running ``scrap_data``,
  every thread receiving ``increment`` consecutive listing pages, and waits for
  all of them before starting the next batch. When a topic's pages are
  exhausted the next topic in ``topics_info`` starts from page 0.

  Args:
    number_of_threads: maximum number of threads per batch.
    number_of_iteration: maximum number of batches to run.
    start_topic: index into ``topics_info`` of the first topic to scrape.
    start_page: zero-based listing page to start from within ``start_topic``.
    increment: number of listing pages handed to each thread.

  Raises:
    ValueError: ``increment`` is not positive.
  """
  if increment <= 0:
    raise ValueError('increment must be a positive number of pages')

  topic_id = start_topic
  next_page = start_page
  for _ in range(number_of_iteration):
    if topic_id >= len(topics_info):
      # Every topic has been processed; stop instead of indexing past the list.
      break
    info = topics_info[topic_id]
    number_of_pages = int(info['number_of_pages'])

    workers = []
    for _ in range(number_of_threads):
      if next_page >= number_of_pages:
        # No pages left in this topic: do not start threads with empty ranges.
        break
      worker = threading.Thread(
        target = scrap_data,
        args = (info['topic_name'],info['topic_href'],topic_id,next_page,next_page + increment),
      )
      worker.start()
      workers.append(worker)
      next_page += increment

    for worker in workers:
      worker.join()

    if next_page >= number_of_pages:
      # Topic finished: move on, and start the following topic at its first
      # page (start_page only applies to the topic the run began with).
      topic_id += 1
      next_page = 0



In [None]:
# Launch the scrape from the first topic and first page: 5 threads per batch,
# at most 500 batches, 2 listing pages per thread.
generate_threads(5,500,0,0,2)

Arts & Culture
the from  0 the to  2
Arts & Culture
the from Arts & Culture
the from  4 the to  6
 2 the to  4
Arts & Culture
the from  6 the to  8
Arts & Culture
the from  8 the to  10
1
NONE
NONE
1
1
2
1
2
3
NONE
NONE
1
3
2
4
NONE
2
2
3
3
NONE
4
4
4
5
5
5
3
4
6
NONE
5
NONE
7
8
6
6
7
5
6
6
7
7
8
9
9
8
10
7
NONE
8
NONE
9
NONE
NONE
10
8
11
9
9
NONE
10
NONE
NONE
10
12
11
13
NONE
NONE
NONE
11
10
11
NONE
NONE
NONE
12
14
13
15
12
13
12
NONE
14
16
1413

15
NONE
11
17
16
15
NONE
14
NONE
17
12
18
13
14
NONE
16
19
18
20
15
21
15
16
16
NONE
NONE
19
NONE
NONE
17
17
18
22
20
17
NONE
NONE
19
21
18
20
22
19
NONE
23
24
21
23
20
18
19
25
NONE
NONE22

26
NONE
24
27
NONE
28
29
25
NONE
23
24
NONE
20
NONE
30
NONE
26
21
NONE
NONE
25
21
31
NONE
NONE
22
NONE
26
NONE
NONE
NONE
NONE
27
23
NONE
24
32
NONE
NONE
33
NONE
34
NONE
22
23
NONE
NONE
NONE
24
25
26
NONE
27
28
29
30
Arts & Culture
the from  10Arts & Culture
the from  12 the to  14
Arts & Culture Arts & Culture

the from  16 the to  18the from  14 the to  