In [None]:
from google.colab import drive
import os
import re
import requests
from bs4 import BeautifulSoup
import csv

In [None]:
!pip install langdetect

In [None]:
from langdetect import detect

In [None]:
#get data from Drive: mount Google Drive under /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#path to the Drive folder that contains the files
#NOTE(review): this is a relative path; presumably it is resolved from /content, where Drive is mounted - confirm
path = "drive/MyDrive/adm3/"

In [None]:
"""
Parameter:
1) filename: the name of the file to scrape.
Extracts information (title, author, plot and so on) from a content_x.txt file
using BeautifulSoup.
Return: the number x present in "content_x.txt" and a list of information.
Returns None when the title or the author is missing or the plot is not in English.
"""
def _safe_extract(getter):
  """Return getter(), or None when the expected markup is missing or malformed."""
  try:
    return getter()
  except (AttributeError, IndexError, TypeError, ValueError):
    return None


def extract_info(filename):
  """Scrape the details of one book from a saved content_x.txt HTML page.

  Parameter:
  1) filename: path of the file to scrape; its name must end with "<x>.txt".

  Return: a tuple (x, info) where x is the number found in the file name (as a
  string) and info is the list
  [title, series, author, rating value, rating count, plot, number of pages,
  publication line, characters, settings, link].
  Optional fields that are missing or malformed on the page are None.
  Returns None (after printing a message) when the file name has no number or
  the page has no title or no author, and returns None silently when the plot
  is detected as not being English.
  """
  #get the number x from the filename content_x.txt
  #(the dot is escaped so that only a real ".txt" suffix matches)
  match = re.search(r"([0-9]+)\.txt$", str(filename))
  if not match:
    print('number not found', filename)
    return
  number = match.group(1)

  #the with block closes the file even when reading or parsing raises
  with open(filename, encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), features='lxml')

  #title (mandatory)
  bookTitle = soup.find(id="bookTitle")
  if bookTitle is None:
    print('title not found', filename)
    return
  bookTitle = bookTitle.text.replace('\n', '').strip()

  #series (optional)
  bookSeries = soup.find(id="bookSeries")
  if bookSeries is not None:
    bookSeries = bookSeries.text.replace('(', '').replace(')', '').strip()

  #book author (mandatory)
  bookAuthors = soup.find('span', itemprop='name')
  if bookAuthors is None:
    print('author not found', filename)
    return
  bookAuthors = bookAuthors.text

  #ratings: a missing element or an unparsable number gives None
  #NOTE(review): the page's reviewCount is not part of the returned list; it is
  #left out here so that the column layout of the output stays unchanged
  ratingValue = _safe_extract(lambda: float(soup.find(itemprop="ratingValue").text))
  ratingCount = _safe_extract(lambda: int(soup.find(itemprop="ratingCount").get('content')))

  #plot: the text of the last span inside the description element
  Plot = soup.find(id="description")
  if Plot is not None:
    spans = Plot.find_all('span')
    Plot = spans[-1].text if spans else None
  if Plot:
    try:
      if detect(Plot) != 'en': #check if is a English plot
        return
    except Exception:
      #the language could not be detected: keep the book, as before, but do
      #not swallow KeyboardInterrupt/SystemExit like a bare except would
      pass

  Number_of_Pages = _safe_extract(
      lambda: int(soup.find(itemprop="numberOfPages").text.split()[0]))

  #each remaining field is extracted on its own, so one missing element no
  #longer discards the other fields that are present on the page
  Published = _safe_extract(
      lambda: soup.find(id="details").find_all('div')[1].text.split('\n')[2].strip())
  data_box = soup.find(id="bookDataBox")
  Characters = _safe_extract(
      lambda: ', '.join(a.text for a in data_box.find_all('a', href=re.compile("^.characters"))))
  Settings = _safe_extract(
      lambda: ', '.join(a.text.replace(',', '') for a in data_box.find_all('a', href=re.compile("^.places"))))
  link = _safe_extract(lambda: soup.find('link', itemprop='url').get('href'))

  return number,[bookTitle, bookSeries, bookAuthors, ratingValue, ratingCount,Plot,Number_of_Pages,Published,Characters,Settings,link]
  

In [None]:
"""
Parameter:
1) info: a tuple containing a number and a list of information.
Given the number x and a list of information, write to a file named
article_x.tsv a row containing the information separated by TAB.  """
def create_file_result(info, out_dir="drive/MyDrive/articles/"):
  """Write the information of one book to <out_dir>/article_<x>.tsv.

  Parameters:
  1) info: a tuple (x, fields) where x is the number of the book and fields
     is the list of values to write.
  2) out_dir: folder that receives the file; it is created when missing.
     The default is the folder the notebook has always written to.

  The values are written as a single TAB-separated row; None values become
  empty fields.
  """
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)
  #str() lets the number be given either as a string or as an int
  file_name = os.path.join(out_dir, "article_" + str(info[0]) + '.tsv')
  #newline='' is what the csv module requires for files it writes to, and the
  #explicit encoding keeps the output independent from the platform default
  with open(file_name, 'w', newline='', encoding='utf-8') as output:
    writer = csv.writer(output, delimiter='\t')
    writer.writerow(info[1])


In [None]:
#list of the page folders from page_201 to page_300 (the upper bound of range is excluded)
page_list = [os.path.join(path, "page_" + str(i)) for i in range(201,301)]

#list of every saved book page (content_x.txt) found in the folders of page_list
content_list = []
for page_dir in page_list:
  #os.listdir returns the entries in arbitrary order: sort them so that every run is the same
  for content in sorted(os.listdir(page_dir)):
    #keep only the saved pages: extract_info expects a name ending with "<x>.txt"
    if content.endswith(".txt"):
      content_list.append(os.path.join(page_dir, content))


In [None]:
#turn every saved book page into its article_x.tsv file
for book in content_list:
  info = extract_info(book)
  if not info:
    #nothing was extracted from this page, so there is nothing to write
    continue
  create_file_result(info)
 
  

In [None]:
#save on Drive: flush the pending changes and unmount the Drive folder
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
