# Webscraping Tapas Forum 

In [None]:
''' Instructions: run main() to produce JSON, pickle, and CSV files for each category (at most the latest 1080 posts per category).
    Note: files stored on the Colab runtime disappear when the runtime disconnects, so download them immediately. ''' 

In [None]:
!pip install beautifulsoup4 

In [None]:
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

In [None]:
from selenium import webdriver
# Shared Chrome options used by every webdriver instance in this notebook:
# headless mode, sandbox disabled, and /dev/shm usage disabled.
options = webdriver.ChromeOptions()
for chrome_flag in ('-headless', '-no-sandbox', '-disable-dev-shm-usage'):
  options.add_argument(chrome_flag)

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json
import requests
import time
import datetime
import pandas as pd
import numpy as np

In [None]:
def get_source_code(url): 
  """Load `url` in Chrome and return the rendered page source.

  A fresh browser is started for every call (configured by the module-level
  `options`) and is always shut down again, even when loading the page fails.
  Without the explicit quit, every scraped page would leave a Chrome process
  running for the rest of the session.

  Args:
    url: address of the page to load.

  Returns:
    The page's HTML source, as reported by the driver's `page_source`.
  """
  # The driver binary is looked up on PATH (the setup cell copies it to
  # /usr/bin). Passing the path positionally is rejected by newer Selenium
  # releases, where the first positional parameter is `options`.
  wd = webdriver.Chrome(options=options)
  try:
    wd.get(url)
    return wd.page_source
  finally:
    wd.quit()

In [None]:
# making the soup 
def make_soup(url):
  """Download `url` with requests and parse it with BeautifulSoup.

  Args:
    url: address of the page to fetch (120 second timeout).

  Returns:
    A BeautifulSoup object built with "html.parser" when the response
    status is 200, otherwise None.
  """
  response = requests.get(url, timeout=120)
  if response.status_code != 200:
    return None
  return BeautifulSoup(response.content, "html.parser")

In [None]:
def special_convert_str_to_int(string): 
  """Convert an abbreviated count such as "15", "1.2k" or "3.4M" to an int.

  Surrounding whitespace and thousands separators (",") are ignored, and the
  magnitude suffix is matched case-insensitively.

  Args:
    string: the textual count.

  Returns:
    The count as an int; suffixed values are rounded to the nearest integer.

  Raises:
    ValueError: if the text (including an empty string) is not a number.
  """
  multipliers = {"k": 1000, "m": 1000000}
  text = string.strip().replace(",", "")
  # Slicing (rather than indexing) keeps an empty string from raising
  # IndexError; it falls through to int("") and raises ValueError instead.
  suffix = text[-1:].lower()
  if suffix in multipliers:
    # round() rather than int(): a binary float product can land just below
    # the intended whole number, and truncation would then lose one.
    return round(float(text[:-1]) * multipliers[suffix])
  return int(text)

In [None]:
# get link, href, title, replies, views data (located in a cell/column) on one post (located in a row)
def get_table_cells_data(table_row): 
  """Extract link, title, reply count and view count from one topic row.

  Cells are recognised by a single identifying CSS class ("main-link",
  "posts", "views") rather than by their full class list, so the varying
  heatmap classes on the count cells do not cause data to be skipped.

  Args:
    table_row: a BeautifulSoup <tr> tag for one post.

  Returns:
    A dict that may contain "href", "link", "title", "replies" and "views".
    A key is present only when the corresponding cell (and the tag holding
    its value) was found in the row.
  """
  cell_data = {}
  base_url = "https://forums.tapas.io"
  for cell in table_row.find_all("td"): 
    # BeautifulSoup exposes `class` as a list of names; a <td> without a
    # class attribute yields None, which is treated as "no classes".
    classes = cell.get("class") or []
    if "main-link" in classes: 
      anchor = cell.a
      if anchor is None or not anchor.get("href"):
        continue
      href = anchor["href"]
      cell_data["href"] = href
      cell_data["link"] = base_url + href 
      cell_data["title"] = anchor.text
    elif "posts" in classes: 
      # The reply count is the text of the <span> inside the cell's link.
      count_tag = cell.a.span if cell.a is not None else None
      if count_tag is not None:
        cell_data["replies"] = special_convert_str_to_int(count_tag.text)
    elif "views" in classes:
      if cell.span is not None:
        cell_data["views"] = special_convert_str_to_int(cell.span.text)
    # Every other cell (posters, activity/age, ...) carries nothing we keep.
  return cell_data

In [None]:
# get table rows data (link, href, post_id, replies, views) for one page (30 posts)
def get_table_rows_data(url):
  """Scrape the post metadata listed on one page of a topic list.

  Args:
    url: address of the listing page to load with the browser.

  Returns:
    A list of dicts, one per usable row of the first table on the page. Each
    dict holds "post_id" (the row's data-topic-id attribute) followed by
    whatever get_table_cells_data extracted for the row. Rows without a
    topic id or without an href are left out. An empty list is returned when
    the page contains no table or the table has no body.
  """
  source_code = get_source_code(url)
  soup = BeautifulSoup(source_code, "html.parser")
  tables = soup.find_all("table")
  print("Number of Tables: ", (len(tables)))

  # A page with no topic table would otherwise raise IndexError; the caller
  # already treats an empty list as "nothing on this page".
  if not tables or tables[0].tbody is None:
    return []

  post_data_lst = [] 
  for row in tables[0].tbody.find_all("tr"): 
    post_id = row.get("data-topic-id")
    if post_id is None:  # not a topic row
      continue
    extra_post_data = get_table_cells_data(row)
    # "href" is absent when the row has no main-link cell, so use .get().
    if not extra_post_data.get("href"):  # if cannot get href do not add to list
      continue 
    post_data = {"post_id": post_id}
    post_data.update(extra_post_data)
    post_data_lst.append(post_data) 

  return post_data_lst

In [None]:
# gets posts links, hrefs, titles, post_ids, replies, views for x number of pages. returns list of dictionaries. str & int parameters
def get_all_meta_data(url, num_of_pages): 
  """Collect post metadata from the first pages of a category's latest list.

  Args:
    url: category base address (str); the page query is appended to it.
    num_of_pages: how many pages to request (int); capped at 36.

  Returns:
    A list with the dicts returned by get_table_rows_data for every page,
    in page order.
  """
  # Cap the crawl: 30 posts per page * 36 pages = 1080 data points is enough
  # for analysis; the whole forum is not wanted.
  page_limit = 36 if num_of_pages > 36 else num_of_pages
  latest_page = "/l/latest?page="
  all_meta = []

  for page_index in range(page_limit):
    page_num = "% s" % page_index
    print("page_num: ", page_num)
    new_url = url + latest_page + page_num
    print("new_url: ", new_url)
    page_posts = get_table_rows_data(new_url)
    if page_posts != []:
      all_meta.extend(page_posts)

  print(len(all_meta)) 
  return all_meta

In [None]:
def get_date_published(soup): 
  """Return the date part of the first <time> tag's datetime attribute.

  Args:
    soup: parsed page; only its find("time") result is used.

  Returns:
    Everything before the first "T" of the datetime attribute (the whole
    value when it contains no "T"), or "" when the page has no <time> tag
    or the tag has no datetime attribute.
  """
  date_data = soup.find("time")
  if date_data is None:
    return ""
  # partition() keeps the full string when there is no "T"; slicing at the
  # -1 that str.find returns would silently drop the last character.
  return date_data.get("datetime", "").partition("T")[0]

In [None]:
def get_post_body(soup):
  """Return the text of the first post on a page.

  Args:
    soup: parsed page; the first <div class="post"> is taken as the post.

  Returns:
    The text of every <p> inside that div, concatenated without a
    separator, or "" when the page has no such div.
  """
  original_post = soup.find("div", {"class":"post"})
  if original_post is None:
    return ""
  return "".join(line.text for line in original_post.find_all("p"))


In [None]:
# adds date and post body to list of posts dictionaries 
def add_post_date_and_body(posts):  
  """Fetch every post's page and attach its date and body text in place.

  Args:
    posts: list of post dicts, each with a "link" key.

  Returns:
    The same list. Each post whose page could be fetched gains "date" and
    "post_content"; posts whose page could not be fetched are left as is.
  """
  fetched = 0
  for entry in posts:
    link = entry["link"]
    page = make_soup(link)
    if page is not None:
      # Both values are computed before either is stored on the post.
      entry.update({
        "date": get_date_published(page),
        "post_content": get_post_body(page),
      })
      fetched += 1
      print(fetched)
      print(link)
      # Pause between successful requests.
      time.sleep(3)

  return posts

In [None]:
def scrape_category(url, pages):
  """Scrape one category: listing metadata first, then each post's content.

  Args:
    url: category base address.
    pages: number of listing pages to scrape; capped at 36.

  Returns:
    The list of post dicts produced by get_all_meta_data after
    add_post_date_and_body has processed it.
  """
  # Only 30 * 36 = 1080 data points are scraped; the whole forum is too much.
  page_count = 36 if pages > 36 else pages

  listing_posts = get_all_meta_data(url, page_count)
  print("===== Scraping Post Content ==================================================================================================")
  return add_post_date_and_body(listing_posts)

In [None]:
def clean_data(posts_lst):
  """Turn the scraped post dicts into a tidy, date-ordered DataFrame.

  Args:
    posts_lst: list of post dicts, each carrying a "date" entry.

  Returns:
    A DataFrame in which "date" is renamed to "date_published" and holds
    datetime.date values, rows are ordered from most recent to oldest,
    fully duplicated rows are removed (first occurrence kept),
    "date_published" is the first column, and the index is renumbered
    from 0.
  """
  return (
    pd.DataFrame(posts_lst)
    .rename(columns={"date": "date_published"})
    # parse the date strings and keep only the calendar date
    .assign(date_published=lambda frame: pd.to_datetime(frame["date_published"]).dt.date)
    # most recent first
    .sort_values(by="date_published", ascending=False)
    .drop_duplicates(keep="first")
    # moving the column into the index and back out again places it first
    # and renumbers the rows in their sorted order
    .set_index("date_published")
    .reset_index()
  )

In [None]:
base_url = "https://forums.tapas.io"
latest_page = "/l/latest?page="

# Scrape targets keyed by a short category name. "last_page" is the index of
# the category's last listing page (page numbering starts at 0);
# "number_of_posts" is presumably the category's post count when this was
# written - verify before relying on it.
url_categories = {
  "event": {
    "url": "https://forums.tapas.io/c/events-challenges",
    "last_page": 25,
    "number_of_posts": 754,
  },
  "announcements": {
    "url": "https://forums.tapas.io/c/announcements",
    "last_page": 4,
    "number_of_posts": 119,
  },
  "off_topic": {
    "url": "https://forums.tapas.io/c/Off-Topic",
    "last_page": 202,
    "number_of_posts": 6076,
  },
  "art_comics": {
    "url": "https://forums.tapas.io/c/art-comics",
    "last_page": 265,
    "number_of_posts": 7974,
  },
  "writing_novels": {
    "url": "https://forums.tapas.io/c/writing-novels",
    "last_page": 44,
    "number_of_posts": 1305,
  },
  "reviews": {
    "url": "https://forums.tapas.io/c/reviews-feedback",
    "last_page": 25,
    "number_of_posts": 755,
  },
  "collaborations": {
    "url": "https://forums.tapas.io/c/collaborations",
    "last_page": 94,
    "number_of_posts": 2844,
  },
  "questions": {
    "url": "https://forums.tapas.io/c/questions",
    "last_page": 275,
    "number_of_posts": 8266,
  },
  "answered": {
    "url": "https://forums.tapas.io/c/answered",
    "last_page": 14,
    "number_of_posts": 421,
  },
  "tech_support": {
    "url": "https://forums.tapas.io/c/tech-support-site-feedback",
    "last_page": 49,
    "number_of_posts": 1474,
  },
  "promotions": {
    "url": "https://forums.tapas.io/c/promotions",
    "last_page": 191,
    "number_of_posts": 5734,
  },
}

# The same records as an ordered list, each carrying its name under
# "category", so that a slice of categories can be selected by position.
url_categories_lst = [
  dict(category=name, **info)
  for name, info in url_categories.items()
]

In [None]:
def main(): 
  """Scrape every category in url_categories and save each one to disk.

  For each category the scraped posts are written to a JSON file and read
  back, cleaned with clean_data, and the resulting DataFrame is saved as a
  pickle and as a CSV (without the index). The files are created in the
  current working directory.
  """
  for category, info in url_categories.items(): 
    print("=== Now Scraping " + category + " category =========================================================================================")

    # Page numbering starts from 0, so the number of pages is last_page + 1.
    total_pages = info["last_page"]+1
    scraped_posts = scrape_category(info["url"], total_pages)

    json_path = "tapas_" + category + "_first_1080_posts_2021.json"
    with open(json_path,"w") as json_out:
      json.dump(scraped_posts, json_out)
    with open(json_path, "r") as json_in:
      reloaded_posts = json.load(json_in)

    tidy_df = clean_data(reloaded_posts)
    tidy_df.to_pickle("tapas_" + category + "_first_1080_pickle.pkl")
    tidy_df.to_csv("tapas_" + category + "_first_1080_posts_2021.csv",index=False)


In [None]:
# scrape as many categories as you want from anywhere in the list
def scrape_multiple_categories(url_categories_lst, start, stop):
  """Scrape and save the categories at positions start..stop-1 of a list.

  Args:
    url_categories_lst: list of dicts with "category", "url" and
      "last_page" keys.
    start: index of the first category to scrape.
    stop: index one past the last category to scrape.

  For each selected category the scraped posts are written to a JSON file
  and read back, cleaned with clean_data, and the resulting DataFrame is
  saved as a pickle and as a CSV (without the index).
  """
  for position in range(start, stop): 
    entry = url_categories_lst[position]
    name = entry["category"]

    # Page numbering starts from 0, so the number of pages is last_page + 1.
    total_pages = entry["last_page"]+1
    scraped_posts = scrape_category(entry["url"], total_pages)

    json_path = "tapas_" + name + "_first_1080_posts_2021.json"
    with open(json_path,"w") as json_out:
      json.dump(scraped_posts, json_out)
    with open(json_path, "r") as json_in:
      reloaded_posts = json.load(json_in)

    tidy_df = clean_data(reloaded_posts)
    tidy_df.to_pickle("tapas_" + name + "_first_1080_pickle.pkl")
    tidy_df.to_csv("tapas_" + name + "_first_1080_posts_2021.csv",index=False)

In [None]:
# Preventing Google Colab from disconnecting (works for about 7 hours):
# inspect the page, open the browser console, and enter the two snippets below separately.
'''
function ClickConnect(){
    console.log("Clicked on connect button"); 
    document.querySelector("colab-connect-button").click()
}
'''
'''
setInterval(ClickConnect,60000)
'''